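This notebook tests the feasibility of using ChatGPT (queried through the OpenAI completion API; the recorded run used `text-davinci-003`) to generate sentences in which unambiguous nominal synsets are used as events.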

In [87]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [88]:
import os, sys
from code_files.utils.nounited_creator import amuse_request, invero_request, chatgpt_request, create_nounited_dataset, save_nounited_dataset
from code_files.utils.utils_functions import load_dotenv
import json

# Load the settings stored in ./.env (presumably as a dict — confirm against load_dotenv);
# the OpenAI key is later read from env_dict['OPENAI_KEY'].
env_dict = load_dotenv('./.env')

In [65]:
# Base folder for the datasets.
datasets_path = './datasets/'

# Endpoints of the two services passed to create_nounited_dataset (local servers by default).
amuse_url = 'http://127.0.0.1:3002/api/model' # put here your endpoint for AMuSE-WSD!
invero_url = 'http://127.0.0.1:3003/api/model' # put here your endpoint for invero-xl!

In [66]:
# Path to the unambiguous candidates used to create the nominal part.
unambiguous_candidates_path = './va_resources/candidates_unambiguous.json'

# Load the unambiguous nominal events. The encoding is stated explicitly because
# open() otherwise falls back to the platform locale, which is not always UTF-8.
with open(unambiguous_candidates_path, 'r', encoding='utf-8') as json_file:
    candidates_unambiguous = json.load(json_file)

In [67]:
# Iterating over the mapping yields its keys, which are the entries queried one by one later on.
unambiguous_synsets_list = list(candidates_unambiguous)
len(unambiguous_synsets_list)

2899

In [68]:
# Smoke test: send two prompts in a single request to check that the key and the service work.
chatgpt_request(["what is the meaning of life?","what is your name?"], env_dict['OPENAI_KEY'])

{'id': 'cmpl-6SEUPuVcs8lPKyWAoteXspI89lll4',
 'object': 'text_completion',
 'created': 1672186901,
 'model': 'text-davinci-003',
 'choices': [{'text': '\n\nThe meaning of life is different for every individual. For some, it may mean being happy, while for others, it may mean pursuing challenging goals or giving back to the community. Ultimately, the answer to this question is subjective and can only be answered by the individual.',
   'index': 0,
   'logprobs': None,
   'finish_reason': 'stop'},
  {'text': '\n\nMy name is Bob.',
   'index': 1,
   'logprobs': None,
   'finish_reason': 'stop'}],
 'usage': {'prompt_tokens': 12, 'completion_tokens': 63, 'total_tokens': 75}}

In [91]:
import re
import time
from tqdm import tqdm

# --- Generation settings -----------------------------------------------------
num_phrases_per_synset = 30    # sentences requested for every synset
chatgpt_request_intervals = 3  # seconds slept after every request
chatgpt_chunk_size = 2         # synsets (one prompt each) sent with a single request
chatgpt_timeout = 40           # forwarded as `timeout` to chatgpt_request (presumably seconds)
max_retrials = 5               # attempts per chunk before it is counted as an error

# Marker that opens every item of a numbered answer: newline(s), a number, then
# "." or ")" and a space. The separator is spelled out explicitly; a bare "."
# would match any character, so a line starting with e.g. "1990 " would be
# split as if it were a list item.
numbered_item_re = re.compile(r"\n+\d+[.)] ")


def split_numbered_answer(answer):
    """Split a numbered-list answer into its items.

    The text preceding the first marker is discarded. An answer that contains
    no marker at all yields an empty list.
    """
    return numbered_item_re.split(answer)[1:]


def pbar_desc():
    """Return the progress-bar label showing the current error count."""
    return f"Errors: {err_encountered}"


chatgpt_dataset = []
err_encountered = 0

pbar = tqdm(range(0, len(unambiguous_synsets_list), chatgpt_chunk_size), disable=False)
for c in pbar:
    pbar.set_description(pbar_desc())
    chunk_synsets = unambiguous_synsets_list[c:c + chatgpt_chunk_size]
    query_text_list = [
        f'generate {num_phrases_per_synset} phrases in which the WordNet synset "{synset_name}" is used as an event'
        for synset_name in chunk_synsets
    ]

    retrials = 0
    retry = True
    while retry:
        # NOTE(review): assumes chatgpt_request returns one answer string per
        # prompt, or None on failure — confirm against its implementation.
        res = chatgpt_request(query_text_list, env_dict['OPENAI_KEY'], timeout=chatgpt_timeout)
        if res is not None:
            for synset_name, answer in zip(chunk_synsets, res):
                sentences_res_list = split_numbered_answer(answer)
                # An answer is usable when it contains at least one item and the
                # first item does not quote the synset name literally. An answer
                # without a numbered list is treated as unusable instead of
                # crashing the loop with an IndexError.
                if sentences_res_list and synset_name not in sentences_res_list[0]:
                    chatgpt_dataset += sentences_res_list
                    # One usable answer is enough to accept the chunk; unusable
                    # answers of the same chunk are dropped without a retry.
                    retry = False
        if retry:
            retrials += 1
            if retrials >= max_retrials:
                err_encountered += 1
                # Refresh the label right away so the last error is shown too.
                pbar.set_description(pbar_desc())
                break
        time.sleep(chatgpt_request_intervals)

Errors: 0:   0%|          | 4/1450 [01:29<9:00:07, 22.41s/it]


KeyboardInterrupt: 

In [92]:
# Show the sentences collected by the generation loop.
chatgpt_dataset

['The ever-present tolerance of the president in his interaction with foreign leaders was remarkable. ',
 "The town's increasing tolerance of different religions has been an asset to diversity. ",
 'Despite tremendous political strain, countries brokered a policy of tolerance. ',
 'In order to reduce petty conflict, tolerance amongst neighbours was promoted. ',
 'The students taught the schoolmates the value of tolerance in the face of adversity. ',
 'The nation instilled a sense of tolerance in its citizens, granting them trust and freedom. ',
 'The couple remarked on their mutual tolerance, which enabled them to live peacefully together. ',
 'The UN has continually stressed the need for global tolerance. ',
 'The mayor made a statement calling for tolerance in the occupied city. ',
 'Community organizations led educational seminars on the importance of tolerance. ',
 'The judge ruled in favour of tolerance when making her judgement. ',
 'The government deployed a policy of increasing

In [93]:
# Persist the generated sentences. The target folder is created first so that
# the dump cannot fail with FileNotFoundError when it does not exist yet.
noun_sentences_path = './datasets/chatgpt/noun_sentences.json'
os.makedirs(os.path.dirname(noun_sentences_path), exist_ok=True)
with open(noun_sentences_path, 'w', encoding='utf-8') as fout:
    json.dump(chatgpt_dataset, fout, indent=4)

In [94]:
sentences_testing_dataset = ["Marco is running a marathon.", "The eating of Marco is very loud."] # tiny hand-written input, for testing only
sentences_dataset = chatgpt_dataset # the list of sentences used as the starting dataset
chunk_size = 16 # number of sentences sent to amuse and invero per query
window_span_error = 3 # displacement between the invero and amuse tokenization indices; the larger it is, the fewer "incorrelation" errors occur
lang = "EN"

# Build the dataset from the generated sentences using the two services configured above.
# The call returns the per-sentence results and a nominal event count.
noUniteD_srl_result, nominal_event_count = create_nounited_dataset(
    sentences_list = sentences_dataset,
    unambiguous_candidates_path = unambiguous_candidates_path,
    amuse_url = amuse_url, invero_url = invero_url,
    chunk_size = chunk_size, window_span_error = window_span_error, lang = lang
)

Nominal found: 117, Verbal found: 309, Incorrelations: 1, Chunk errors: 4, Sentences: 169: 100%|██████████| 15/15 [00:09<00:00,  1.64it/s]


In [96]:
# Print every field of one annotated sentence; token-level fields are paired
# with their position so they can be compared across fields.
select_sample = 1
sample = noUniteD_srl_result[select_sample]

print(list(enumerate(sample['words'])), '\n')
print(list(enumerate(sample['predicates'])), '\n')
print(sample['roles'], '\n')
print(list(enumerate(sample['lemmas'])), '\n')
print(list(enumerate(sample['pos_tags'])), '\n')
print(sample['num_v'], sample['num_n'])

[(0, 'The'), (1, 'town'), (2, "'s"), (3, 'increasing'), (4, 'tolerance'), (5, 'of'), (6, 'different'), (7, 'religions'), (8, 'has'), (9, 'been'), (10, 'an'), (11, 'asset'), (12, 'to'), (13, 'diversity'), (14, '.')] 

[(0, '_'), (1, '_'), (2, '_'), (3, 'INCREASE_ENLARGE_MULTIPLY'), (4, '_'), (5, '_'), (6, '_'), (7, '_'), (8, '_'), (9, '_'), (10, '_'), (11, '_'), (12, '_'), (13, '_'), (14, '_')] 

{'3': ['_', '_', '_', '_', 'Patient', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_']} 

[(0, 'the'), (1, 'town'), (2, "'s"), (3, 'increase'), (4, 'tolerance'), (5, 'of'), (6, 'different'), (7, 'religion'), (8, 'have'), (9, 'be'), (10, 'an'), (11, 'asset'), (12, 'to'), (13, 'diversity'), (14, '.')] 

[(0, 'DET'), (1, 'NOUN'), (2, 'PART'), (3, 'VERB'), (4, 'NOUN'), (5, 'ADP'), (6, 'ADJ'), (7, 'NOUN'), (8, 'AUX'), (9, 'AUX'), (10, 'DET'), (11, 'NOUN'), (12, 'ADP'), (13, 'NOUN'), (14, 'PUNCT')] 

3 0


In [None]:
# Write the annotated dataset under ./datasets/chatgpt_nounited_srl/.
# NOTE(review): train_ratio = 0.8 presumably sends 80% of the samples to the training split, while
# num_dataset_divisions and shuffle control how the data is partitioned — confirm against save_nounited_dataset.
save_nounited_dataset(
    noUniteD_srl_result = noUniteD_srl_result, 
    dir_path = './datasets/chatgpt_nounited_srl/', lang = 'EN', 
    train_ratio = 0.8, num_dataset_divisions = 2, shuffle = True
)

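Conclusions: responses from the service are slow (about 22 s per request of 2 synsets in the run above, i.e. an estimated 9 hours for all 2899 synsets). Moreover, the generated sentences are very similar to one another.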