In [1]:
import pandas as pd
import time
import clients
import metrics

In [2]:
import random

def parse_df(file : str) -> pd.DataFrame:
    """Parse a tab-separated gold file into a DataFrame.

    Each non-blank line is expected to have the layout
    ``text<TAB>complex_word<TAB>candidate<TAB>candidate...``.

    Parameters
    ----------
    file : str
        Path to the UTF-8 encoded TSV file.

    Returns
    -------
    pd.DataFrame
        One row per parsed line with the columns ``Id``, ``Complex_word``,
        ``Text`` and ``Candidates``. ``Id`` is never filled (all missing).
        ``Candidates`` holds a dict mapping each candidate to the number of
        times it appears on that line.

    Raises
    ------
    IndexError
        If a non-blank line contains fewer than two tab-separated fields.
    """
    records = []

    with open(file, 'r', encoding='utf-8') as f:
        for raw_line in f:
            # Blank lines carry no data; skipping them avoids an IndexError below.
            if not raw_line.strip():
                continue

            # Drop only the line terminator so that a row without candidates does
            # not leave a trailing newline glued to the complex word.
            fields = raw_line.rstrip('\r\n').split('\t')

            text = fields[0]
            complex_word = fields[1]

            # The rest of the line is the candidates. They are stored in a dictionary
            # with the candidate as key and the number of times it appears as value.
            candidates = {}
            for candidate in fields[2:]:
                candidate = candidate.strip()
                if not candidate:
                    # Empty fields (e.g. produced by trailing tabs) are not candidates.
                    continue
                candidates[candidate] = candidates.get(candidate, 0) + 1

            records.append({'Complex_word': complex_word, 'Text': text, 'Candidates': candidates})

    # Build the frame once instead of concatenating row by row (which is quadratic).
    return pd.DataFrame(records, columns=['Id', 'Complex_word', 'Text', 'Candidates'])

# Parse the gold TSV file (the file name suggests the TSAR-2022 Spanish test split)
# into `df` and preview its first rows as the cell output.
df = parse_df('../data/tsar2022_es_test_gold.tsv')
df.head()
            


Unnamed: 0,Id,Complex_word,Text,Candidates
0,,labrarse,"A comienzos de la década de 1980, se trasladó ...","{'construirse': 4, 'trabajar': 1, 'ganarse': 2..."
1,,tertulias,A lo largo de sus más de veinte años de experi...,"{'conversaciones': 2, 'debates': 4, 'reunion':..."
2,,propiciado,A pesar de las pocas bajas (menos de 500 en to...,"{'favorecido': 3, 'desencadenado': 3, 'favorid..."
3,,decadencia,A pesar de las pocas bajas (menos de 500 en to...,"{'caída': 11, 'decaimiento': 1, 'abatimiento':..."
4,,caseríos,Al igual que otros municipios cercanos a Toled...,"{'casas': 5, 'casales': 1, 'trabajos': 1, 'pue..."


## Exploratory Data Analysis

EDA (Exploratory Data Analysis) is the process of examining a dataset and understanding its characteristics in order to obtain as much useful information from it as possible. This process is crucial for understanding the data and for making sure that it is clean and ready to be used in a machine learning model. In this step, it is very useful to apply statistical techniques and visualizations to better understand what we are working with.

However, in this notebook, the data in question is not numerical, but textual. This means that we can't use the usual methods to analyse data, such as histograms, scatter plots, etc. Instead, Natural Language Processing (NLP) techniques will be used to extract the most important information.

### Basic Dataset Information

#### Dataset Head

#### Dataset Shape

#### Dataset Structure

#### Looking for Null Values

### Text Statistics

#### Word Frequency Analysis

#### N-Gram Exploration

#### Wordcloud (Without Stopwords)

### Specific Domain Analysis and Metrics

A number of specific metrics to understand a sentence's complexity and readability have been developed over the years. In this notebook, we will use the following metrics, taken from this reference [paper](https://www.researchgate.net/publication/348660072_Automated_Readability_Assessment_for_Spanish_e-Government_Information) on readability assessment for Spanish e-Government information and from the following website, [Legible.es](https://legible.es/):
- [μ legibility](https://legible.es/blog/legibilidad-mu/)


#### μ legibility

The μ readability index is based on the number of words in a text and on the mean and variance of the number of letters per word. It is calculated as follows:

$$ \mu = \frac{n}{n-1} \times \frac{\bar{x}}{\sigma^2} \times 100 $$

Where:
- $ n $: number of words in the text
- $ \bar{x} $: the mean number of letters per word 
- $ \sigma^2 $: the variance of letters per word


In [3]:
def get_number_of_words(text : str) -> int:
    """Return the number of whitespace-separated tokens in ``text``.

    Tokens are whatever ``str.split()`` yields, so punctuation attached to a
    word is part of that word. Empty or whitespace-only text gives 0.
    """
    return len(text.split())

def get_average_number_of_characters(text : str) -> float:
    """Return the mean number of characters per whitespace-separated token.

    Tokens are produced by ``str.split()``; every character of a token counts,
    including attached punctuation and digits.

    Returns NaN when ``text`` contains no tokens (empty or whitespace-only),
    since the mean is undefined there, instead of raising ZeroDivisionError.
    """
    words = text.split()
    if not words:
        return float('nan')

    return sum([len(word) for word in words]) / len(words)

def get_variance_of_letters_per_word(text : str) -> float:
    """Return the population variance of the token lengths in ``text``.

    Tokens are produced by ``str.split()``. The squared deviations from the
    mean length are divided by the number of tokens (not by ``n - 1``).

    Returns NaN when ``text`` contains no tokens, since the variance is
    undefined there, instead of raising ZeroDivisionError.
    """
    # Split once and reuse the lengths for both the mean and the variance.
    lengths = [len(word) for word in text.split()]
    number_of_words = len(lengths)
    if number_of_words == 0:
        return float('nan')

    average = sum(lengths) / number_of_words
    variance = sum([(length - average) ** 2 for length in lengths]) / number_of_words

    return variance

def get_mu_legibility(text : str) -> float:
    """Compute the μ legibility index of ``text``.

    The index is ``(n / (n - 1)) * (mean / variance) * 100`` where ``n`` is the
    number of whitespace-separated tokens, ``mean`` is the mean token length and
    ``variance`` is the population variance of the token lengths.

    Returns NaN when the index is undefined, i.e. when the text has fewer than
    two tokens (``n - 1`` would be zero) or when every token has the same length
    (zero variance). These cases previously raised ZeroDivisionError.
    """
    # Tokenise once; the statistics below all derive from these lengths.
    lengths = [len(word) for word in text.split()]
    n = len(lengths)
    if n < 2:
        return float('nan')

    x = sum(lengths) / n
    sigma = sum([(length - x) ** 2 for length in lengths]) / n
    if sigma == 0:
        return float('nan')

    return (n/(n-1)) * (x / sigma) * 100

# Add a new column holding the μ legibility score of each row's 'Text'.
# NOTE: the column key is misspelled ('Mu_legibilituy'). It is left unchanged here
# because the CSV files read further down in this notebook show the same header.
df['Mu_legibilituy'] = df['Text'].apply(get_mu_legibility)
df.head()



Unnamed: 0,Id,Complex_word,Text,Candidates,Mu_legibilituy
0,,labrarse,"A comienzos de la década de 1980, se trasladó ...","{'construirse': 4, 'trabajar': 1, 'ganarse': 2...",47.048815
1,,tertulias,A lo largo de sus más de veinte años de experi...,"{'conversaciones': 2, 'debates': 4, 'reunion':...",38.524947
2,,propiciado,A pesar de las pocas bajas (menos de 500 en to...,"{'favorecido': 3, 'desencadenado': 3, 'favorid...",57.156977
3,,decadencia,A pesar de las pocas bajas (menos de 500 en to...,"{'caída': 11, 'decaimiento': 1, 'abatimiento':...",57.156977
4,,caseríos,Al igual que otros municipios cercanos a Toled...,"{'casas': 5, 'casales': 1, 'trabajos': 1, 'pue...",62.283901


## Obtaining inferred candidates

In [4]:
# # Comment out the models you don't want to use
# open_source_models = [
#     # 'llama-70b-chat',
#     # 'mixtral-8x7b-instruct',
#     #'gemma-7b', not available yet
#     # 'falcon-40b-instruct'
#     ]
# closed_models = [
#     'ChatGPT',
#     # 'Gemini-1.0-pro'
# ]

# model_list = open_source_models + closed_models

# client = clients.ChatGPTClient()
# start_time = time.time()
# i = 0
# error_list = []
# df['Chatgpt_inferred_candidates'] = None

# for index, row in df.iterrows():
#     i += 1
#     if i % 10 == 0:
#         print(f'Processed {i} rows in {time.time() - start_time} seconds')
    
#     text = row['Text']
#     complex_word = row['Complex_word']
    
#     try:
#         candidates = client.get_response(text, complex_word)
#         candidates = clients.parse_output(candidates)
        
#         candidate_dict = {}
#         for candidate in candidates:
#             if candidate in candidate_dict:
#                 candidate_dict[candidate] += 1
#             else:
#                 candidate_dict[candidate] = 1
        
        
#         df.at[index, 'Chatgpt_inferred_candidates'] = candidate_dict
#     except Exception as e:
#         error_list.append((index, e))
#         df.at[index, 'Inferred_candidates'] = None


In [5]:
# Rebind `df` to the CSV that already contains the 'Chatgpt_inferred_candidates'
# column, replacing the frame built in the earlier cells, and preview it.
# NOTE: no converters are passed to read_csv, so the dict-like columns are
# presumably loaded as plain strings rather than dicts — confirm before using them.
df = pd.read_csv('../data/chatgpt_inferred_candidates.csv')
df.head()

Unnamed: 0,Complex_word,Text,Candidates,Mu_legibilituy,Chatgpt_inferred_candidates
0,labrarse,"A comienzos de la década de 1980, se trasladó ...","{'construirse': 4, 'trabajar': 1, 'ganarse': 2...",47.048815,"{'ganarse': 1, 'forjarse': 1, 'construirse': 1..."
1,tertulias,A lo largo de sus más de veinte años de experi...,"{'conversaciones': 2, 'debates': 4, 'reunion':...",38.524947,"{'sobremesas': 1, 'charlas': 1, 'conversacione..."
2,propiciado,A pesar de las pocas bajas (menos de 500 en to...,"{'favorecido': 3, 'desencadenado': 3, 'favorid...",57.156977,"{'ocasionado': 1, 'causado': 1, 'provocado': 1..."
3,decadencia,A pesar de las pocas bajas (menos de 500 en to...,"{'caída': 11, 'decaimiento': 1, 'abatimiento':...",57.156977,"{'declive': 1, 'deterioro': 1, 'caída': 1, 'de..."
4,caseríos,Al igual que otros municipios cercanos a Toled...,"{'casas': 5, 'casales': 1, 'trabajos': 1, 'pue...",62.283901,"{'pequeños pueblos': 1, 'aldeas': 1, 'aldeanos..."


In [6]:



# client = clients.GeminiClient()
# start_time = time.time()
# i = 0
# error_list = []
# df['Gemini_inferred_candidates'] = None

# for index, row in df.iterrows():
#     i += 1
#     if i % 10 == 0:
#         print(f'Processed {i} rows in {time.time() - start_time} seconds')
    
#     text = row['Text']
#     complex_word = row['Complex_word']
    
#     try:
#         candidates = client.get_response(text, complex_word)
#         candidates = clients.parse_output(candidates)
        
#         candidate_dict = {}
#         for candidate in candidates:
#             if candidate in candidate_dict:
#                 candidate_dict[candidate] += 1
#             else:
#                 candidate_dict[candidate] = 1
        
        
#         df.at[index, 'Gemini_inferred_candidates'] = candidate_dict
#     except Exception as e:
#         break

# df.to_csv('../data/gemini_inferred_candidates.csv', index=False)


In [7]:

# df = pd.read_csv('../data/gemini_inferred_candidates.csv')

# client = clients.LLamaAPIClient(model='llama-70b-chat')
# start_time = time.time()
# i = 0
# error_list = []
# df['Llama_inferred_candidates'] = None

# for index, row in df.iterrows():
#     i += 1
#     if i % 10 == 0:
#         print(f'Processed {i} rows in {time.time() - start_time} seconds')
        
#     if row['Llama_inferred_candidates'] is None:
        
#         text = row['Text']
#         complex_word = row['Complex_word']
        
#         try:
#             candidates = client.get_response(text, complex_word)
#             candidates = clients.parse_output(candidates)
            
#             candidate_dict = {}
#             for candidate in candidates:
#                 if candidate in candidate_dict:
#                     candidate_dict[candidate] += 1
#                 else:
#                     candidate_dict[candidate] = 1
            
            
#             df.at[index, 'Llama_inferred_candidates'] = candidate_dict
#         except Exception as e:
#             break

# df.to_csv('../data/llama_inferred_candidates.csv', index=False)


In [8]:
# Rebind `df` to the CSV that, per the preview below, holds the ChatGPT, Gemini and
# Llama candidate columns, and show its first rows.
# NOTE: no converters are passed to read_csv, so the dict-like columns are
# presumably loaded as plain strings rather than dicts — confirm before using them.
df = pd.read_csv('../data/llama_inferred_candidates.csv')
df.head()

Unnamed: 0,Complex_word,Text,Candidates,Mu_legibilituy,Chatgpt_inferred_candidates,Gemini_inferred_candidates,Llama_inferred_candidates
0,labrarse,"A comienzos de la década de 1980, se trasladó ...","{'construirse': 4, 'trabajar': 1, 'ganarse': 2...",47.048815,"{'ganarse': 1, 'forjarse': 1, 'construirse': 1...","{'construir': 1, 'formar': 1, 'forjar': 1, 'ga...","{'formar': 1, 'crear': 1, 'construir': 1, 'est..."
1,tertulias,A lo largo de sus más de veinte años de experi...,"{'conversaciones': 2, 'debates': 4, 'reunion':...",38.524947,"{'sobremesas': 1, 'charlas': 1, 'conversacione...","{'conversaciones': 1, 'debates': 1, 'charlas':...","{'debates': 1, 'discusiones': 1, 'conversacion..."
2,propiciado,A pesar de las pocas bajas (menos de 500 en to...,"{'favorecido': 3, 'desencadenado': 3, 'favorid...",57.156977,"{'ocasionado': 1, 'causado': 1, 'provocado': 1...","{'provocado': 1, 'causado': 1, 'producido': 1,...","{'facilitado': 1, 'permitted': 1, 'fomentado':..."
3,decadencia,A pesar de las pocas bajas (menos de 500 en to...,"{'caída': 11, 'decaimiento': 1, 'abatimiento':...",57.156977,"{'declive': 1, 'deterioro': 1, 'caída': 1, 'de...","{'caída': 1, 'declive': 1, 'debilitamiento': 1...","{'declive': 1, 'caída': 1, 'deterioro': 1, 'di..."
4,caseríos,Al igual que otros municipios cercanos a Toled...,"{'casas': 5, 'casales': 1, 'trabajos': 1, 'pue...",62.283901,"{'pequeños pueblos': 1, 'aldeas': 1, 'aldeanos...","{'granjas': 1, 'casas': 1, 'viviendas': 1, 'fi...","{'ranchos': 1, 'granjas': 1, 'aldeas': 1, 'pue..."


In [9]:

# df = pd.read_csv('../data/llama_inferred_candidates.csv')

# client = clients.LLamaAPIClient(model='mixtral-8x7b-instruct')
# start_time = time.time()
# i = 0
# error_list = []
# df['Mixtral_inferred_candidates'] = None

# for index, row in df.iterrows():
#     i += 1
#     if i % 10 == 0:
#         print(f'Processed {i} rows in {time.time() - start_time} seconds')
        
#     if row['Mixtral_inferred_candidates'] is None:
        
#         text = row['Text']
#         complex_word = row['Complex_word']
        
#         try:
#             candidates = client.get_response(text, complex_word)
#             candidates = clients.parse_output(candidates)
            
#             candidate_dict = {}
#             for candidate in candidates:
#                 if candidate in candidate_dict:
#                     candidate_dict[candidate] += 1
#                 else:
#                     candidate_dict[candidate] = 1
            
            
#             df.at[index, 'Mixtral_inferred_candidates'] = candidate_dict
#         except Exception as e:
#             break

# df.to_csv('../data/mixtral_inferred_candidates.csv', index=False)


In [10]:

# Start from the CSV that already holds the candidates of the previously queried models.
df = pd.read_csv('../data/mixtral_inferred_candidates.csv')

client = clients.Palm2Client()
start_time = time.time()
i = 0
# (row index, exception) of the request that stopped the run, if any.
error_list = []
df['Palm2_inferred_candidates'] = None

for i, (index, row) in enumerate(df.iterrows(), start=1):
    # Report progress every 10 rows.
    if i % 10 == 0:
        print(f'Processed {i} rows in {time.time() - start_time} seconds')

    if row['Palm2_inferred_candidates'] is None:

        text = row['Text']
        complex_word = row['Complex_word']

        try:
            candidates = client.get_response(text, complex_word)
            candidates = clients.parse_output(candidates)

            # Count how many times each candidate appears in the parsed output.
            candidate_dict = {}
            for candidate in candidates:
                candidate_dict[candidate] = candidate_dict.get(candidate, 0) + 1

            df.at[index, 'Palm2_inferred_candidates'] = candidate_dict
        except Exception as e:
            # Still stop at the first failure, but record and report it so that a
            # partially filled CSV is never produced silently.
            error_list.append((index, e))
            print(f'Stopping at row {index} after error: {type(e).__name__}: {e}')
            break

# Persist whatever was collected (rows after a failure keep a missing value).
df.to_csv('../data/palm2_inferred_candidates.csv', index=False)
df.head()


Processed 10 rows in 6.54483699798584 seconds
Processed 20 rows in 14.18565320968628 seconds
Processed 30 rows in 21.161096572875977 seconds
Processed 40 rows in 27.64164638519287 seconds
Processed 50 rows in 33.72587609291077 seconds
Processed 60 rows in 39.91903376579285 seconds
Processed 70 rows in 46.132617473602295 seconds
Processed 80 rows in 52.316890716552734 seconds
Processed 90 rows in 58.85001564025879 seconds
Processed 100 rows in 64.84023189544678 seconds
Processed 110 rows in 71.24763011932373 seconds
Processed 120 rows in 82.07418727874756 seconds
Processed 130 rows in 88.4005491733551 seconds
Processed 140 rows in 94.51487517356873 seconds
Processed 150 rows in 100.62764978408813 seconds
Processed 160 rows in 107.15324401855469 seconds
Processed 170 rows in 113.28772497177124 seconds
Processed 180 rows in 119.22724747657776 seconds
Processed 190 rows in 125.06210660934448 seconds
Processed 200 rows in 131.39890480041504 seconds
Processed 210 rows in 137.12690091133118 s

Unnamed: 0,Complex_word,Text,Candidates,Mu_legibilituy,Chatgpt_inferred_candidates,Gemini_inferred_candidates,Llama_inferred_candidates,Mixtral_inferred_candidates,Palm2_inferred_candidates
0,labrarse,"A comienzos de la década de 1980, se trasladó ...","{'construirse': 4, 'trabajar': 1, 'ganarse': 2...",47.048815,"{'ganarse': 1, 'forjarse': 1, 'construirse': 1...","{'construir': 1, 'formar': 1, 'forjar': 1, 'ga...","{'formar': 1, 'crear': 1, 'construir': 1, 'est...","{'formar': 1, 'crear': 1, 'construir': 1, 'est...","{'ganarse': 1, 'construir': 1, 'formar': 1, 'c..."
1,tertulias,A lo largo de sus más de veinte años de experi...,"{'conversaciones': 2, 'debates': 4, 'reunion':...",38.524947,"{'sobremesas': 1, 'charlas': 1, 'conversacione...","{'conversaciones': 1, 'debates': 1, 'charlas':...","{'debates': 1, 'discusiones': 1, 'conversacion...","{'debates': 1, 'discusiones': 1, 'conversacion...","{'conversaciones': 1, 'charlas': 1, 'discusion..."
2,propiciado,A pesar de las pocas bajas (menos de 500 en to...,"{'favorecido': 3, 'desencadenado': 3, 'favorid...",57.156977,"{'ocasionado': 1, 'causado': 1, 'provocado': 1...","{'provocado': 1, 'causado': 1, 'producido': 1,...","{'facilitado': 1, 'permitted': 1, 'fomentado':...","{'facilitado': 1, 'permitted': 1, 'fomentado':...","{'causado': 1, 'producido': 1, 'originado': 1,..."
3,decadencia,A pesar de las pocas bajas (menos de 500 en to...,"{'caída': 11, 'decaimiento': 1, 'abatimiento':...",57.156977,"{'declive': 1, 'deterioro': 1, 'caída': 1, 'de...","{'caída': 1, 'declive': 1, 'debilitamiento': 1...","{'declive': 1, 'caída': 1, 'deterioro': 1, 'di...","{'declive': 1, 'caída': 1, 'deterioro': 1, 'di...","{'deterioro': 1, 'destrucción': 1, 'caída': 1,..."
4,caseríos,Al igual que otros municipios cercanos a Toled...,"{'casas': 5, 'casales': 1, 'trabajos': 1, 'pue...",62.283901,"{'pequeños pueblos': 1, 'aldeas': 1, 'aldeanos...","{'granjas': 1, 'casas': 1, 'viviendas': 1, 'fi...","{'ranchos': 1, 'granjas': 1, 'aldeas': 1, 'pue...","{'ranchos': 1, 'granjas': 1, 'aldeas': 1, 'pue...","{'casas': 1, 'viviendas': 1, 'chalets': 1, 'ca..."
