In [1]:
#%pip install gensim
#%pip install nltk


In [2]:
from gensim.models import KeyedVectors
import numpy as np
import random
from nltk.stem import SnowballStemmer
import json
import random

## Import and preprocess the data

In [3]:
with open('dataset.json', encoding='utf-8') as file:
    dataset = json.load(file)

In [4]:
# Transform the JSON into a list of [[w1,w2,w3,w4,w5],solution] items
datalist = [[list(d.values())[0:5], d['solution']] for d in dataset]

## Choose distractors

In [5]:
# Import a italian word2vec model (https://mlunicampania.gitlab.io/italian-word2vec/)
model = KeyedVectors.load("SG-300-W10N20E50/W2V.kv", mmap='r+')

# Save the words and their vectors
vocabs = model.index_to_key 
vectors = model.vectors

In [6]:
# Create a list for the words of our datalist that are not in the word2vec vocabulary
not_in_vocabs = []

# Check if all the words in datalist are in the vocabulary
for sublist in datalist:
    for word in sublist[0] + [sublist[1]]:
        if word not in vocabs and word not in not_in_vocabs:
            not_in_vocabs.append(word)

# Print the words that are not in the vocabulary
print("Las siguientes palabras no están en 'vocabs':", not_in_vocabs)

#Print how many words are not in the vocabulary
print(len(not_in_vocabs), "words are not in the vocabulary")

#Print the percentage of words that are not in the vocabulary
print(len(not_in_vocabs)/(len(datalist)*5),"% of the words are not in the vocabulary")
# As we can see, the percentage that are not in the vocabulary is very low, so we can ignore them

Las siguientes palabras no están en 'vocabs': ['x men', 'fantastici 4', 'gran premio', '68', 'terra santa', 'san giovanni', 'new york', 'cin cin', 'de amicis', "non c'è", "d'annunzio", 'secondo tempo', 'terza età', 'totò e peppino']
14 words are not in the vocabulary
0.009333333333333334 % of the words are not in the vocabulary


In [7]:
# Stemmer for italian words (to check if two words share the same root)
stemmer = SnowballStemmer('italian')

# Create an empty list to store all the JSON objects
final_json = []

for i in range(len(datalist)):

    # Our 5 words
    words = datalist[i][0].copy()

    # Delete the words that are not in the italian vocabulary because we will not calculate the distractors with them.
    # (We can do this because there is not any game that has more tha 1 not_in_vocabs word, so we will always have at least 4 words to calculate the distractors)
    words = [word for word in words if word not in not_in_vocabs]
    if len(words) < 4:
        print("ALERT: Less than 4 words found in vocabulary") # This never happens, 

    # Our 5 word and the solution. (This list will be used to avoid creating new distractors that are similar to the solution, the 5 words or the other distractors)
    banned_words = words.copy()
    banned_words.append(datalist[i][1])

    # Create a list to store the distractors
    distractor_list = []


    for _ in range(3):
        # Choose randomly between 2 and {len(words) - 1} words
        random_index_list = random.sample(list(range(len(words))), random.randint(2, len(words) - 1))
        random_words = [words[i] for i in random_index_list]

        # Select embeeding vectors of chosen words
        word_vectors = [vectors[vocabs.index(w)] for w in random_words]

        # Sum the vectors
        vector_sum = np.sum(word_vectors, axis=0)

        # Find the most similar words to the sum vector in the vocabulary
        similar_word = model.similar_by_vector(vector_sum, topn=20)

        #Check that the possible new distractor have not the same root as the original words, the solution or the other distractors
        selected_words = [w[0] for w in similar_word if all(stemmer.stem(w[0]) != stemmer.stem(bw) for bw in banned_words)]

        # Check if selected_words is empty
        if len(selected_words) == 0:
            print("ALERT: No words found") # This never happens
        else:
            # Add the word to the distractor list and the banned words list
            distractor_list.append(selected_words[0])
            banned_words.append(selected_words[0])

    # Add the solution and the distractors to make the choices list
    choices = distractor_list.copy()
    choices.append(datalist[i][1])
    
    # Shuffle the choices list randomly
    random.shuffle(choices)

    # Find the index of the solution in the list
    label = choices.index(datalist[i][1])
    
    # Add the JSON object to the final_json list
    final_json.append({'w1': datalist[i][0][0], 
                     'w2': datalist[i][0][1], 
                     'w3': datalist[i][0][2], 
                     'w4': datalist[i][0][3], 
                     'w5': datalist[i][0][4], 
                     'choices': choices, 
                     'label': label})




In [8]:
# Shuffle the final list randomly
random.shuffle(final_json)

# Split the list into train and test
split_index = int(0.8 * len(final_json))
train_json = final_json[:split_index]
test_json = final_json[split_index:]

In [9]:
# Save both JSONLs 

with open('Ghigliottin-AI-task1-train-data.jsonl', 'w', encoding='utf-8') as file:
    for dict in train_json:
        json.dump(dict, file, ensure_ascii=False)
        file.write('\n')

with open('Ghigliottin-AI-task1-test-data.jsonl', 'w', encoding='utf-8') as file:
    for dict in test_json:
        json.dump(dict, file, ensure_ascii=False)
        file.write('\n')

# Prompts

In [100]:
prompt_list = [
"Given the following words: {w1}, {w2}, {w3}, {w4}, {w5}; and the list of choices: {choices}. Which choice is related to all the 5 words? Please answer only with the index of the choice in the list"
]

In [101]:
input_prompts = []
for data in train_json:
    formatted_text = prompt_list[0].format(w1=data['w1'], w2=data['w2'], w3=data['w3'], w4=data['w4'], w5=data['w5'], choices=data['choices'])
    input_prompts.append(formatted_text)

In [102]:
#%pip install replicate
import os
import replicate
from getpass import getpass
import re

In [103]:
os.environ["REPLICATE_API_TOKEN"] = "r8_BU4b7PNHgP3LHseHL2j43ia0HY6NZ604eCu20"

In [104]:
llama2_70b = "replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1"
llama2_13b = "a16z-infra/llama-2-13b-chat:2a7f981751ec7fdf87b5b91ad4db53683a98082e9ff7bfd12c8cd5ea85980a52"

In [None]:
prompt_outputs = []
for prompt in input_prompts[0:5]:
    output = replicate.run(
        "meta/llama-2-70b-chat",
        input={
            "prompt": prompt,
            "system_prompt": "The first token of your answer must be one of these numbers (0,1,2,3).",
        },
    )
    prompt_outputs.append(''.join(output))


In [None]:
output_lables = []
error_outputs = []
for o in prompt_outputs:


    match = re.search(r'[0-3]', o)

    if match:
        first_number = match.group()
        output_lables.append(first_number)  # Output: '1'
    else:
        print("No number found in the string.")


In [106]:
input_prompts[0:10]

["Given the following words: paese, vizio, marchio, incendio, nobile; and the list of choices: ['castello', 'casato', 'brand', 'origine']. Which choice is related to all the 5 words? Please answer only with the index of the choice in the list",
 "Given the following words: fiume, pesca, medicina, saggezza, filo; and the list of choices: ['perle', 'canapa', 'scienza', 'foce']. Which choice is related to all the 5 words? Please answer only with the index of the choice in the list",
 "Given the following words: mettere, tempesta, modesto, nuovo, garanzia; and the list of choices: ['avviso', 'compenso', 'finanziario', 'burrasca']. Which choice is related to all the 5 words? Please answer only with the index of the choice in the list",
 "Given the following words: pubblico, stretto, primo, entrare, radar; and the list of choices: ['entrarvi', 'sonar', 'missile', 'contatto']. Which choice is related to all the 5 words? Please answer only with the index of the choice in the list",
 "Given the

In [99]:
prompt_outputs

['The answer is:\n\n2',
 'The answer is:\n\n2\n\nThe choice related to all the 5 words is "scienza".',
 'The answer is:\n\n2',
 'The answer is:\n\n2. missile',
 'The answer is:\n\n2',
 'The answer is:\n\n2',
 "The answer is:\n\n2. 'coltello'",
 'The answer is:\n\n2',
 'The answer is:\n\n2\n\nThe choice "sfida" is related to all the 5 words:\n\n* "Partita" means "game" in Italian, and a game can be a challenge or a sfida.\n* "Onore" means "honor" in Italian, and a person\'s honor can be challenged or put to the test in a sfida.\n* "Compagnia" means "company" in Italian, and a company can face challenges or sfide in its business operations.\n* "Orizzonte" means "horizon" in Italian',
 'The answer is:\n\n2\n\nThe choice "cadere" is related to all the 5 words:\n\n* "Terra" can be translated as "land" or "earth", and "cadere" means "to fall".\n* "Dolore" means "pain", and "cadere" can be used to describe a painful sensation.\n* "Perdere" means "to lose", and "cadere" can be used to describe

In [85]:
output_lables

['2', '2', '2', '2', '2', '2', '2', '2', '2', '2']

In [112]:
#%pip install gpt4all
#%pip install langchain
# Import dependencies
from langchain import PromptTemplate, LLMChain
#from langchain.llms import GPT4All

In [116]:
from gpt4all import GPT4All
model = GPT4All("orca-mini-3b-gguf2-q4_0.gguf")

In [131]:
output = model.generate(input_prompts[0], max_tokens=30)


In [132]:
input_prompts[0]

"Given the following words: paese, vizio, marchio, incendio, nobile; and the list of choices: ['castello', 'casato', 'brand', 'origine']. Which choice is related to all the 5 words? Please answer only with the index of the choice in the list"

In [133]:
output

'.'

In [None]:
#
#
## Specify model weights path
#PATH='./nous-hermes-13b.ggmlv3.q4_0.bin'
#
## Create LLM Class
#llm = GPT4All(model=PATH, verbose=True)
#
## Create a prompt template
#prompt = PromptTemplate(
#    input_variables=['instruction', 'input', 'response'],
#    template="""
#    ### Instruction:
#    {instruction}
#    ### Input:
#    {input}
#    ### Response:
#    {response}
#    """ )
#
#chain = LLMChain(prompt=prompt, llm=llm)
#
## Run the prompt
## I used a childen story to test https://cuentosparadormir.com/infantiles/cuento/barba-flamenco-y-el-recortador-de-cuentos
## its about 783 words long!
#chain.run(instruction="""Resume esta historia, hazlo en español""",
#input="""[...story content...]""",
#response='A: ')