In [1]:
import pandas as pd
import clients
import metrics

In [2]:
import random

def parse_df(file : str) -> pd.DataFrame:
    """Parse a tab-separated gold file into a DataFrame.

    Each non-blank line is expected to hold, separated by tabs: the text,
    the complex word, and then zero or more candidate substitutions.
    Repeated candidates on a line are counted.

    Args:
        file: Path to the UTF-8 encoded TSV file.

    Returns:
        A DataFrame with the columns ``Id``, ``Complex_word``, ``Text`` and
        ``Candidates``. ``Id`` is a random integer that is unique within the
        returned frame; ``Candidates`` is a dict mapping each candidate to
        the number of times it appears on the line.

    Raises:
        IndexError: If a non-blank line has fewer than two tab-separated fields.
    """
    columns = ['Id', 'Complex_word', 'Text', 'Candidates']
    records = []

    with open(file, 'r', encoding='utf-8') as f:
        for raw_line in f:
            # Drop the line terminator up front so it cannot leak into the
            # last field (e.g. the complex word of a row without candidates).
            stripped_line = raw_line.rstrip('\r\n')
            if not stripped_line.strip():
                # Blank lines (typically a trailing newline at EOF) carry no data.
                continue

            fields = stripped_line.split('\t')
            text = fields[0]
            complex_word = fields[1]

            # The rest of the line holds the candidates: count how many times
            # each one appears, ignoring empty fields left by stray tabs.
            candidates = {}
            for candidate in fields[2:]:
                candidate = candidate.strip()
                if not candidate:
                    continue
                candidates[candidate] = candidates.get(candidate, 0) + 1

            records.append((complex_word, text, candidates))

    # Draw all ids at once *without replacement*: independent randint() calls
    # are likely to collide, which would make 'Id' unusable as a unique index.
    # The pool only grows past 100000 if the file has more rows than that.
    id_pool_size = max(100000, len(records))
    row_ids = random.sample(range(1, id_pool_size + 1), len(records))

    # Build the frame once instead of concatenating row by row (quadratic).
    rows = [
        [row_id, complex_word, text, candidates]
        for row_id, (complex_word, text, candidates) in zip(row_ids, records)
    ]
    return pd.DataFrame(rows, columns=columns)

# Load the gold file and index the rows by their generated 'Id'.
gold_path = '../data/tsar2022_es_test_gold.tsv'
df = parse_df(gold_path).set_index('Id')
df.head()
            


Unnamed: 0_level_0,Complex_word,Text,Candidates
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
89497,labrarse,"A comienzos de la década de 1980, se trasladó ...","{'construirse': 4, 'trabajar': 1, 'ganarse': 2..."
10973,tertulias,A lo largo de sus más de veinte años de experi...,"{'conversaciones': 2, 'debates': 4, 'reunion':..."
97065,propiciado,A pesar de las pocas bajas (menos de 500 en to...,"{'favorecido': 3, 'desencadenado': 3, 'favorid..."
58392,decadencia,A pesar de las pocas bajas (menos de 500 en to...,"{'caída': 11, 'decaimiento': 1, 'abatimiento':..."
50812,caseríos,Al igual que otros municipios cercanos a Toled...,"{'casas': 5, 'casales': 1, 'trabajos': 1, 'pue..."


## Exploratory Data Analysis

EDA (or Exploratory Data Analysis) is the process of examining a dataset to understand its structure and to extract as much useful information from it as possible. This process is crucial for making sure that the data is clean and ready to be used in a machine learning model. In this step, it is very useful to apply statistical techniques and visualizations to better understand what we are working with.

However, in this notebook, the data in question is not numerical, but textual. This means that we can't use the usual methods to analyse data, such as histograms, scatter plots, etc. Instead, Natural Language Processing (NLP) techniques will be used to extract the most important information.

### Basic Dataset Information

#### Dataset Head

#### Dataset Shape

#### Dataset Structure

#### Looking for Null Values

### Text Statistics

#### Word Frequency Analysis

#### N-Gram Exploration

#### Wordcloud (Without Stopwords)

### Specific Domain Analysis and Metrics

A number of specific metrics for measuring a sentence's complexity and readability have been developed over the years. In this notebook, we will use the following metrics, taken from this reference [paper](https://www.researchgate.net/publication/348660072_Automated_Readability_Assessment_for_Spanish_e-Government_Information) on readability assessment for Spanish e-Government information and from the website [Legible.es](https://legible.es/):
- [μ legibility](https://legible.es/blog/legibilidad-mu/)


#### μ legibility

The μ index is a readability measure based on the number of words in the text and on the mean and variance of the number of letters per word. It is calculated as follows:

$$ \mu = \frac{n}{n-1} \times \frac{\bar{x}}{\sigma^2} \times 100 $$

Where:
- $ n $: number of words in the text
- $ \bar{x} $: the mean number of letters per word 
- $ \sigma^2 $: the variance of letters per word


In [3]:
def get_number_of_words(text : str) -> int:
    """Return the number of whitespace-separated tokens in ``text``."""
    return len(text.split())

def get_average_number_of_characters(text : str) -> float:
    """Return the mean length, in characters, of the tokens in ``text``.

    Tokens come from ``str.split()``, so punctuation attached to a word
    counts towards its length. Returns NaN when ``text`` has no tokens,
    because the mean is undefined there.
    """
    words = text.split()
    if not words:
        return float('nan')
    return sum([len(word) for word in words]) / len(words)

def get_variance_of_letters_per_word(text : str) -> float:
    """Return the population variance (divided by n) of the token lengths.

    Returns NaN when ``text`` has no tokens.
    """
    words = text.split()
    if not words:
        return float('nan')
    average = get_average_number_of_characters(text)
    variance = sum([(len(word) - average) ** 2 for word in words]) / len(words)

    return variance

def get_mu_legibility(text : str) -> float:
    """Return the mu readability index of ``text``.

    Computed as ``(n / (n - 1)) * (mean / variance) * 100``, where ``n`` is
    the number of tokens and mean/variance refer to the token lengths.

    Returns NaN when the index is undefined: fewer than two tokens
    (``n - 1`` would be zero) or zero variance (all tokens have the same
    length). Returning NaN instead of raising ZeroDivisionError keeps a
    single degenerate row from aborting a whole ``Series.apply``.
    """
    n = get_number_of_words(text)
    if n < 2:
        return float('nan')

    x = get_average_number_of_characters(text)
    sigma = get_variance_of_letters_per_word(text)
    if sigma == 0:
        return float('nan')

    return (n/(n-1)) * (x / sigma) * 100

# Score every text and store the result as a new column.
# NOTE(review): the column name is misspelled ('Mu_legibilituy'); it is kept
# as-is here so anything that already refers to it keeps working.
mu_scores = df['Text'].map(get_mu_legibility)
df['Mu_legibilituy'] = mu_scores
df.head()



Unnamed: 0_level_0,Complex_word,Text,Candidates,Mu_legibilituy
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
89497,labrarse,"A comienzos de la década de 1980, se trasladó ...","{'construirse': 4, 'trabajar': 1, 'ganarse': 2...",47.048815
10973,tertulias,A lo largo de sus más de veinte años de experi...,"{'conversaciones': 2, 'debates': 4, 'reunion':...",38.524947
97065,propiciado,A pesar de las pocas bajas (menos de 500 en to...,"{'favorecido': 3, 'desencadenado': 3, 'favorid...",57.156977
58392,decadencia,A pesar de las pocas bajas (menos de 500 en to...,"{'caída': 11, 'decaimiento': 1, 'abatimiento':...",57.156977
50812,caseríos,Al igual que otros municipios cercanos a Toled...,"{'casas': 5, 'casales': 1, 'trabajos': 1, 'pue...",62.283901


## Obtaining inferred candidates

In [4]:
# Models to query; comment out the ones you don't want to use.
open_source_models = [
    'llama-70b-chat',
    'mixtral-8x7b-instruct',
    # 'gemma-7b',  # not available yet
    # 'falcon-40b-instruct',
]
closed_models = ['ChatGPT', 'Gemini-1.0-pro']

# Open-source models first, then the closed ones.
model_list = [*open_source_models, *closed_models]

manager = clients.ClientManager(model_list)

# Pick one random row (no seed is set, so the row differs between runs).
aux = df.sample(1)
text = aux['Text'].iloc[0]
complex_word = aux['Complex_word'].iloc[0]

# Build the prompts, send them through the client manager, and reduce the
# responses step by step to the final `outputs` dict.
user, system = clients.create_user_prompt(text, complex_word)
raw_responses = manager.get_response(user, system)
candidate_lists = clients.parse_outputs(raw_responses)
outputs = clients.candidate_lists_to_dict(candidate_lists)

# Show the candidates ordered by value, highest first.
ranked_items = sorted(outputs.items(), key=lambda pair: pair[1], reverse=True)
sorted_outputs = dict(ranked_items)
sorted_outputs



{'fuerte': 4,
 'corpulento': 4,
 'vigoroso': 3,
 'sólido': 2,
 'robusto': 2,
 'musculoso': 1,
 'resistente': 1,
 'hercúleo': 1,
 'macizo': 1,
 'sano': 1}

## Metrics

In [6]:
# Compare the inferred candidates with the candidates of the sampled row.
gold_candidates = aux['Candidates'].values[0]

jacc = metrics.get_jaccard_score(outputs, gold_candidates)
cos_sim = metrics.cosine_similarity(outputs, gold_candidates)
sim = metrics.get_key_similarity(outputs, gold_candidates)
val = metrics.get_value_similarity(outputs, gold_candidates)

print(
    f'Jaccard: {jacc}',
    f'Cosine similarity: {cos_sim}',
    f'Similarity: {sim}',
    f'Value similarity: {val}',
    sep='\n',
)


Jaccard: 0.375
Cosine similarity: 0.8856148855400952
Similarity: 0.6
Value similarity: 0.75
