# Modelo: SAND (SentenceBERT, AnglE Embeddings, N-grams, Difference in Length)

### Requisitos e importaciones

In [None]:
pip install nltk sentence_transformers -U angle-emb



In [None]:
import pandas as pd
import torch
import sklearn
import string
import nltk
import transformers
import huggingface_hub
import sentence_transformers
import math
import numpy as np

from sentence_transformers import SentenceTransformer, util, evaluation, models, InputExample, losses
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from angle_emb import AnglE, AngleDataTokenizer, Prompts
from scipy.stats import spearmanr
from torch import nn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import linear_model
from nltk.corpus import stopwords

# Fetch the NLTK stop-word corpus and load the Spanish list into a module-level variable.
# NOTE(review): clean() filters every sentence against this Spanish list, and vocab()
# is also applied to the English datasets — confirm that this is intended.
nltk.download('stopwords')
stop_words = stopwords.words('spanish')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Mount Google Drive (Colab only) so the dataset CSVs under /content/drive can be read.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Importamos y preparamos el dataset

In [None]:
def ds_prep(dir):
  """Read a dataset CSV and attach a ``Split`` column.

  Args:
    dir: Location of the CSV (anything ``pd.read_csv`` accepts). The file must
      provide a ``Text`` column of strings.

  Returns:
    The loaded DataFrame with an extra ``Split`` column whose entries are the
    ``Text`` value split on newline characters (a list of strings per row).
  """
  frame = pd.read_csv(dir)
  frame['Split'] = [text.split("\n") for text in frame['Text']]
  return frame

# Full training sets, loaded as separate DataFrame objects from the same CSVs
# that eng_train / esp_train are read from below.
eng_ds = ds_prep('/content/drive/MyDrive/Semeval 2024/eng_ds/eng_train.csv')
esp_ds = ds_prep('/content/drive/MyDrive/Semeval 2024/esp_ds/esp_train.csv')

# The frames below are the train / dev / test splits needed for the task itself.

eng_train = ds_prep('/content/drive/MyDrive/Semeval 2024/eng_ds/eng_train.csv')
eng_val = ds_prep('/content/drive/MyDrive/Semeval 2024/eng_ds/eng_dev_with_labels.csv')
eng_test = ds_prep('/content/drive/MyDrive/Semeval 2024/eng_ds/eng_test.csv')

esp_train = ds_prep('/content/drive/MyDrive/Semeval 2024/esp_ds/esp_train.csv')
esp_val = ds_prep('/content/drive/MyDrive/Semeval 2024/esp_ds/esp_dev_with_labels.csv')
esp_test = ds_prep('/content/drive/MyDrive/Semeval 2024/esp_ds/esp_test.csv')

In [None]:
esp_test.head()

Unnamed: 0,PairID,Text,Split
0,ESP-test-0000,Los menonitas amish con ascendencia suiza de G...,[Los menonitas amish con ascendencia suiza de ...
1,ESP-test-0001,El perro negro está jugando con el perro marró...,[El perro negro está jugando con el perro marr...
2,ESP-test-0002,"Cuando se agita un disolvente, dos líquidos in...","[Cuando se agita un disolvente, dos líquidos i..."
3,ESP-test-0003,La exsoldado de Estados Unidos Chelsea Manning...,[La exsoldado de Estados Unidos Chelsea Mannin...
4,ESP-test-0004,La catedral de Módena es uno de los lugares de...,[La catedral de Módena es uno de los lugares d...


### Sentence BERT

In [None]:
# Sentence-BERT bi-encoders, one checkpoint per language.
# (We could use CrossEncoders instead.)

eng_name = 'sentence-transformers/all-mpnet-base-v2'
eng_model = SentenceTransformer(eng_name)

esp_name = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'
esp_model = SentenceTransformer(esp_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
def sbert_cos(model, pares):
  """Cosine similarity between the first two sentences of ``pares``.

  Args:
    model: Object exposing ``encode(sentences, convert_to_tensor=True)``
      (a SentenceTransformer in this notebook).
    pares: Sequence of sentences; only entries 0 and 1 contribute to the result.

  Returns:
    The similarity as a Python float.
  """
  embeddings = model.encode(pares, convert_to_tensor = True)
  # Pairwise similarity matrix of the batch with itself; [0][1] is sentence 0 vs sentence 1.
  similarity = util.cos_sim(embeddings, embeddings)
  return float(similarity[0][1])

In [None]:
# SBERT similarity for every sentence pair of the full Spanish / English training frames.
sbert_esp = [sbert_cos(esp_model, pair) for pair in esp_ds['Split']]
sbert_eng = [sbert_cos(eng_model, pair) for pair in eng_ds['Split']]

In [None]:

def sbert_lista(model, ds):
  """Compute the SBERT cosine similarity for every row of ``ds``.

  Args:
    model: Encoder passed through to ``sbert_cos`` (a SentenceTransformer here).
    ds: DataFrame with a ``Split`` column holding the sentence pair of each row.

  Returns:
    List of floats, one similarity per row, in row order.
  """
  # Iterate over the column values rather than looking rows up by label 0..n-1,
  # so frames whose index is not a default RangeIndex (filtered, shuffled,
  # re-split) are handled as well.
  return [sbert_cos(model, pair) for pair in ds['Split']]

In [None]:
# SBERT similarity feature for each Spanish split (train / dev / test).
sbert_esp_train = sbert_lista(esp_model, esp_train)
sbert_esp_val = sbert_lista(esp_model, esp_val)
sbert_esp_test = sbert_lista(esp_model, esp_test)

# Sanity check: number of pairs per split.
print(len(sbert_esp_train), len(sbert_esp_val), len(sbert_esp_test))

1562 140 600


In [None]:
# SBERT similarity feature for each English split (train / dev / test).
sbert_eng_train = sbert_lista(eng_model, eng_train)
sbert_eng_val = sbert_lista(eng_model, eng_val)
sbert_eng_test = sbert_lista(eng_model, eng_test)

# Sanity check: number of pairs per split.
print(len(sbert_eng_train), len(sbert_eng_val), len(sbert_eng_test))

5500 250 2600


### Diferencia de Longitud

In [None]:
def get_diff(pair):
    """Normalised length difference between the first two items of ``pair``.

    Computes ``|len(a) - len(b)| / (len(a) + len(b))`` for ``a = pair[0]`` and
    ``b = pair[1]``; the items may be strings (character counts) or token lists.

    Returns:
        A float in ``[0, 1]``. Two empty items are treated as identical in
        length and yield ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    len_a, len_b = len(pair[0]), len(pair[1])
    total = len_a + len_b
    if total == 0:
        # Both sides empty: no difference, and nothing to divide by.
        return 0.0
    return abs(len_a - len_b) / total

def calc_diff(info):
    """
    Return ``info`` with two extra length-difference columns.

    ``Diff`` applies get_diff to the whitespace-token lists of the first two
    lines of ``Text``. ``Diff_stop`` applies get_diff to the ``Split`` entry
    exactly as stored (for string entries this compares character counts);
    despite its name, no stop words are removed here.

    The input frame is not modified; ``DataFrame.assign`` builds a new one.
    """
    token_diffs = []
    split_diffs = []
    for text, split_pair in zip(info['Text'], info['Split']):
        # Length difference measured in whitespace-separated tokens.
        lines = text.split('\n')
        lines[0] = lines[0].split()
        lines[1] = lines[1].split()
        token_diffs.append(get_diff(lines))
        # Length difference of the pre-split pair as stored in the frame.
        split_diffs.append(get_diff(split_pair))
    return info.assign(Diff=token_diffs, Diff_stop=split_diffs)

In [None]:
def macaco(ds):
  """Return the two length-difference columns for DataFrame ``ds``.

  Args:
    ds: DataFrame accepted by ``calc_diff`` (needs ``Text`` and ``Split``).

  Returns:
    Tuple ``(Diff, Diff_stop)`` of pandas Series taken from ``calc_diff(ds)``.
  """
  with_diffs = calc_diff(ds)
  return with_diffs['Diff'], with_diffs['Diff_stop']

In [None]:
# Length-difference feature per split. Only the first element returned by
# macaco() (the 'Diff' column) is kept; the 'Diff_stop' column is discarded.
diff_eng_train = macaco(eng_train)[0]
diff_eng_val = macaco(eng_val)[0]
diff_eng_test = macaco(eng_test)[0]

diff_esp_train = macaco(esp_train)[0]
diff_esp_val = macaco(esp_val)[0]
diff_esp_test = macaco(esp_test)[0]

### N-grams

In [None]:
# We could build vectorizers from the separate corpora, but a mixed corpus seems better.

def ngram(gram, corpus):
  """Fit the vectorizer ``gram`` on ``corpus`` and return it.

  Args:
    gram: Vectorizer exposing ``fit`` (a CountVectorizer configured for
      unigrams, bigrams or trigrams in this notebook).
    corpus: Iterable of cleaned documents; stop words are NOT removed here.

  Returns:
    The same ``gram`` object, now fitted.
  """
  # The original also transformed the corpus and fetched the feature names only
  # to discard them (and bound a local called `ngram`, shadowing this function);
  # fitting is the only effect the caller can observe.
  gram.fit(corpus)
  return gram

# N-gram extraction helpers.
def uni_list1(a=int, b=str, c=str):
    """List the n-grams present in row ``a`` of the count matrix ``c``.

    NOTE: the ``=int`` / ``=str`` defaults are type objects used as informal
    annotations; every caller must pass all three arguments.

    Args:
        a: Row index of the sentence to inspect.
        b: Indexable collection of feature names, aligned with the columns of ``c``.
        c: Indexable matrix whose rows expose ``toarray()`` (a sparse count
           matrix in this notebook).

    Returns:
        The names from ``b`` whose count in row ``a`` is non-zero, in column order.
    """
    counts = c[a].toarray()[0]
    return [b[col] for col, count in enumerate(counts) if count != 0]

def ngram_metrica(a, b):
    """Fraction of the distinct items of ``a`` that also occur in ``b``.

    Returns ``len(set(a) & set(b)) / len(set(a))``, or the integer ``0`` when
    ``a`` has no items.
    """
    reference = set(a)
    candidate = set(b)
    if not reference:
        return 0
    return len(reference & candidate) / len(reference)

def ngram_met(ds, a=str, b=str, c=str):
    """Compute the n-gram overlap metric for every row of ``ds``.

    Args:
        ds: Sized collection (DataFrame column); only its length is used.
        a: Feature names aligned with the columns of ``b`` and ``c``.
        b: Count matrix of the first sentences.
        c: Count matrix of the second sentences.

    Returns:
        List with ``ngram_metrica`` of the n-grams found in row i of ``b``
        versus row i of ``c``, for i in ``range(len(ds))``.
    """
    return [
        ngram_metrica(uni_list1(row, a, b), uni_list1(row, a, c))
        for row in range(len(ds))
    ]

def clean(doc, stopword_list=None):
    """Normalise a sentence for n-gram extraction.

    Removes ASCII punctuation (``string.punctuation``) and digits, lowercases
    the text, and drops stop words.

    Args:
        doc: Sentence to clean.
        stopword_list: Optional iterable of lowercase stop words. Defaults to
            the module-level ``stop_words`` list.

    Returns:
        The cleaned, lowercased sentence with tokens joined by single spaces.
    """
    if stopword_list is None:
        stopword_list = stop_words
    blocked = set(stopword_list)  # O(1) membership instead of scanning a list per token
    letters = "".join(ch for ch in doc if ch not in string.punctuation and not ch.isdigit())
    # Lowercase BEFORE filtering: the stop-word list is lowercase, so capitalised
    # stop words (e.g. sentence-initial "El", "La") were previously kept.
    return " ".join(tok for tok in letters.lower().split() if tok not in blocked)

In [None]:
def vocab(ds):
  """Clean both sentences of every pair.

  Args:
    ds: Iterable of sentence pairs (the ``Split`` column of a dataset frame).

  Returns:
    Tuple ``(firsts, seconds)`` of lists holding ``clean`` applied to element 0
    and element 1 of each pair, in input order.
  """
  cleaned = [(clean(pair[0]), clean(pair[1])) for pair in ds]
  firsts = [first for first, _ in cleaned]
  seconds = [second for _, second in cleaned]
  return firsts, seconds

# Cleaned first/second sentences of the full Spanish and English training frames.
esp1, esp2 = vocab(esp_ds['Split'])
eng1, eng2 = vocab(eng_ds['Split'])

In [None]:
def ngram_prep(a, b, gram):
  """Fit ``gram`` on the first sentences and vectorise both sides.

  Args:
    a: Cleaned first sentences (from ``vocab``); the vocabulary is learned from these.
    b: Cleaned second sentences (from ``vocab``); encoded with the vocabulary of ``a``.
    gram: Vectorizer (uni/bi/tri-gram CountVectorizer in this notebook).

  Returns:
    Tuple ``(feature_names, counts_a, counts_b)``.
  """
  counts_a = gram.fit_transform(a)
  counts_b = gram.transform(b)
  return gram.get_feature_names_out(), counts_a, counts_b

In [None]:
def proceso(n, ayuda, ayuda2, ds):
  """Compute the n-gram overlap feature for every sentence pair.

  Args:
    n: Order of the n-grams (the vectorizer uses ``ngram_range=(n, n)``).
    ayuda: Cleaned first sentences (from ``vocab``).
    ayuda2: Cleaned second sentences (from ``vocab``).
    ds: DataFrame column of pairs, e.g. ``eng_ds['Split']``; passed to ``ngram_met``.

  Returns:
    List of overlap values, one per pair; these are the values fed to the model.
  """
  vectorizer = CountVectorizer(ngram_range=(n,n))
  features, counts_first, counts_second = ngram_prep(ayuda, ayuda2, vectorizer)
  return ngram_met(ds, features, counts_first, counts_second)

In [None]:
#unigram_esp = proceso(1, esp1, esp2, esp_ds['Split'])
#bigram_esp = proceso(2, esp1, esp2, esp_ds['Split'])
#trigram_esp = proceso(3, esp1, esp2, esp_ds['Split'])

#unigram_eng = proceso(1, eng1, eng2, eng_ds['Split'])
#bigram_eng = proceso(2, eng1, eng2, eng_ds['Split'])
#trigram_eng = proceso(3, eng1, eng2, eng_ds['Split'])

In [None]:
# Una disculpa por las siguiente celdas Karla, se ve ojete lo sé 😢

In [None]:
# Cleaned sentence lists for each train / dev / test split, needed by the n-gram features.
nesp_tr1, nesp_tr2 = vocab(esp_train['Split'])
nesp_v1, nesp_v2 = vocab(esp_val['Split'])
nesp_te1, nesp_te2 = vocab(esp_test['Split'])

neng_tr1, neng_tr2 = vocab(eng_train['Split'])
neng_v1, neng_v2 = vocab(eng_val['Split'])
neng_te1, neng_te2 = vocab(eng_test['Split'])

In [None]:
# Uni/bi/tri-gram overlap features for the Spanish train and dev splits.
# NOTE(review): the *_test lines are commented out, so uni/bi/tri_esp_test are
# never defined; they are required by the (also commented-out) final_esp_test.
uni_esp_train = proceso(1, nesp_tr1, nesp_tr2, esp_train['Split'])
uni_esp_val = proceso(1, nesp_v1, nesp_v2, esp_val['Split'])
#uni_esp_test = proceso(1, nesp_te1, nesp_te2, esp_test['Split'])

bi_esp_train = proceso(2, nesp_tr1, nesp_tr2, esp_train['Split'])
bi_esp_val = proceso(2, nesp_v1, nesp_v2, esp_val['Split'])
#bi_esp_test = proceso(2, nesp_te1, nesp_te2, esp_test['Split'])

tri_esp_train = proceso(3, nesp_tr1, nesp_tr2, esp_train['Split'])
tri_esp_val = proceso(3, nesp_v1, nesp_v2, esp_val['Split'])
#tri_esp_test = proceso(3, nesp_te1, nesp_te2, esp_test['Split'])

In [None]:
# Uni/bi/tri-gram overlap features for the English train and dev splits.
# NOTE(review): the *_test lines are commented out, so uni/bi/tri_eng_test are
# never defined; they are required by the (also commented-out) final_eng_test.
uni_eng_train = proceso(1, neng_tr1, neng_tr2, eng_train['Split'])
uni_eng_val = proceso(1, neng_v1, neng_v2, eng_val['Split'])
#uni_eng_test = proceso(1, neng_te1, neng_te2, eng_test['Split'])

bi_eng_train = proceso(2, neng_tr1, neng_tr2, eng_train['Split'])
bi_eng_val = proceso(2, neng_v1, neng_v2, eng_val['Split'])
#bi_eng_test = proceso(2, neng_te1, neng_te2, eng_test['Split'])

tri_eng_train = proceso(3, neng_tr1, neng_tr2, eng_train['Split'])
tri_eng_val = proceso(3, neng_v1, neng_v2, eng_val['Split'])
#tri_eng_test = proceso(3, neng_te1, neng_te2, eng_test['Split'])

In [None]:
# The n-gram feature lists actually fed to the regressors are uni_/bi_/tri_esp_{train,val} for SPANISH
# and uni_/bi_/tri_eng_{train,val} for ENGLISH (unigram_*/bigram_*/trigram_* exist only in the commented-out cell above).

### AnglE Embeddings default:

In [None]:
# Load the pretrained AnglE BERT checkpoint and move it to the GPU (.cuda() requires a CUDA runtime).
angle_bert = AnglE.from_pretrained('SeanLee97/angle-bert-base-uncased-nli-en-v1', max_length=128, pooling_strategy='cls').cuda()
#angle_llama = AnglE.from_pretrained('NousResearch/Llama-2-7b-hf', pretrained_lora_path='SeanLee97/angle-llama-7b-nli-v2').cuda()

In [None]:
#print("All predefined prompts: ", Prompts.list_prompts())
# Use the predefined prompt A; encode() therefore passes inputs as {'text': ...} dicts.
angle_bert.set_prompt(prompt=Prompts.A)
#angle_llama.set_prompt(prompt=None)

In [None]:
# encode()     -> similarity of one sentence pair computed from its AnglE embeddings.
# eval()       -> global evaluation (Spearman) using ONLY the AnglE embeddings.
# angle_list() -> intermediate step that builds the list of AnglE similarities;
#                 that list enters the feature vector of the regression model.

# NOTE: swap angle_bert for angle_llama depending on the model in use
# (llama does not run on the free Google Colab tier).
def encode(i, ds):
  """Cosine similarity of the i-th sentence pair under the AnglE encoder.

  Args:
    i: Label of the row in ``ds['Split']`` to compare.
    ds: DataFrame (English or Spanish) with a ``Split`` column of sentence pairs.

  Returns:
    A torch tensor with the cosine similarity of the two embeddings
    (presumably 0-dimensional, given one embedding row per sentence — TODO confirm).
  """
  first = {'text': ds['Split'][i][0]}
  second = {'text': ds['Split'][i][1]}
  vec = angle_bert.encode([first, second], to_numpy=True)

  # Unpack the embedding rows and compare them along their only dimension.
  cosine = nn.CosineSimilarity(dim=0)
  return cosine(*torch.from_numpy(vec))

def angle_list(ds):
  """AnglE cosine similarity, as a float, for every pair of DataFrame ``ds``.

  Rows are addressed by the labels ``0 .. len(ds['Split']) - 1`` through ``encode``.
  """
  n_pairs = len(ds['Split'])
  return [float(encode(idx, ds)) for idx in range(n_pairs)]

def eval(ds):
  """Spearman correlation between AnglE similarities and the gold scores.

  NOTE: this name shadows the built-in ``eval`` for the rest of the notebook;
  it is kept because later cells call it.

  Args:
    ds: DataFrame with ``Split`` (sentence pairs) and ``Score`` (gold similarity).

  Returns:
    The Spearman coefficient rounded to 3 decimals.
  """
  # encode() yields a single similarity value per pair; convert it with float()
  # (as angle_list() does) instead of indexing it with [0], which fails on a
  # 0-dimensional tensor.
  pred = [float(encode(i, ds)) for i in range(len(ds['Split']))]
  gold = [float(score) for score in ds['Score']]

  return round(spearmanr(gold, pred)[0],3)

In [None]:
#angle_esp = angle_list(esp_ds)
#angle_eng = angle_list(eng_ds)

In [None]:
# AnglE similarity feature for the train and dev splits, computed with angle_bert
# as loaded above (this cell sits before the fine-tuning section).
# NOTE(review): the *_test lines are commented out, so angle_*_test are never defined.
angle_esp_train = angle_list(esp_train)
angle_esp_val = angle_list(esp_val)
#angle_esp_test = angle_list(esp_test)

angle_eng_train = angle_list(eng_train)
angle_eng_val = angle_list(eng_val)
#angle_eng_test = angle_list(eng_test)

### AnglE Embeddings finetuning:

In [None]:
from datasets import Dataset, DatasetDict
from torch.utils.data import DataLoader

#El resto de las herramientas ya están importadas

In [None]:
def finetune_prep(ds):
  """Build the ``text1`` / ``text2`` / ``label`` frame used for AnglE fine-tuning.

  Args:
    ds: Dataset DataFrame with ``PairID``, ``Text`` (two sentences separated by
      a newline), ``Score`` and ``Split`` columns.

  Returns:
    A new DataFrame without the four source columns and with ``text1``,
    ``text2`` (the two sentences) and ``label`` (a copy of ``Score``) appended.
    The input frame is left untouched.
  """
  # Work on a copy: the original added text1/text2/label to the caller's frame
  # as a side effect, so later cells saw a silently modified esp_ds.
  prepared = ds.copy()
  prepared[['text1', 'text2']] = prepared['Text'].str.split("\n", expand=True)
  prepared['label'] = prepared['Score']
  return prepared.drop(columns=['PairID', 'Text', 'Score', 'Split'])

# Fine-tuning frame built from the full Spanish training data (the English one is disabled).
#finetune_eng = finetune_prep(eng_ds)
finetune_esp = finetune_prep(esp_ds)

In [None]:
# Convert the pandas frame into a Hugging Face Dataset.
#ft_eng = Dataset.from_pandas(finetune_eng)
ft_esp = Dataset.from_pandas(finetune_esp)

#ft_eng = ft_eng.select_columns(['text1', 'text2', 'label']) # TODO: what does this actually do?

In [None]:
def ft_prep(ds, seed=None):
  """Split a Dataset into train / validation / test parts (80 / 10 / 10).

  Args:
    ds: Hugging Face ``Dataset`` obtained from ``finetune_prep`` output.
    seed: Optional seed forwarded to both ``train_test_split`` calls so the
      partition is reproducible. ``None`` (the default) keeps the previous
      behaviour of a different random split on every run.

  Returns:
    ``DatasetDict`` with keys ``'train'``, ``'test'`` and ``'validation'``.
  """
  # First hold out 20 %, then halve the held-out part into validation and test.
  ttv = ds.train_test_split(test_size=0.2, seed=seed)
  test_valid = ttv['test'].train_test_split(test_size=0.5, seed=seed)
  return DatasetDict(
      {
        'train': ttv['train'],
        'test': test_valid['test'],
        'validation': test_valid['train']
      }
  )

# Train / validation / test partition of the Spanish fine-tuning data.
#ft_eng1 = ft_prep(ft_eng)
ft_esp1 = ft_prep(ft_esp)

In [None]:
# TODO: we still need to keep the gold labels of the train/test partition.
# Tokenise each partition with the AnglE tokenizer; only the training part is shuffled.
train = ft_esp1['train'].shuffle().map(AngleDataTokenizer(angle_bert.tokenizer, angle_bert.max_length), num_proc=8)
valid = ft_esp1['validation'].map(AngleDataTokenizer(angle_bert.tokenizer, angle_bert.max_length), num_proc=8)
test = ft_esp1['test'].map(AngleDataTokenizer(angle_bert.tokenizer, angle_bert.max_length), num_proc=8)

In [None]:
# Fine-tune angle_bert in place on the Spanish partition; checkpoints are written
# under checkpoints/semeval_v1.
# NOTE(review): every AnglE feature computed after this cell uses the fine-tuned
# weights, while angle_*_train / angle_*_val were computed before it.
angle_bert.fit(
    train_ds=train,
    valid_ds=valid,
    output_dir='checkpoints/semeval_v1',
    batch_size=32,
    epochs=5,
    learning_rate=2e-5,
    save_steps=100,
    eval_steps=1000,
    warmup_steps=0,
    gradient_accumulation_steps=1,
    loss_kwargs={
        'w1': 1.0,
        'w2': 1.0,
        'w3': 1.0,
        'cosine_tau': 20,
        'ibn_tau': 20,
        'angle_tau': 1.0
    },
    fp16=True,
    logging_steps=100
)

In [None]:
# Evaluate the fine-tuned model on the held-out test partition.
# NOTE(review): assumes evaluate() returns a (corrcoef, accuracy) pair — confirm
# against the installed angle-emb version.
corrcoef, accuracy = angle_bert.evaluate(test, device=angle_bert.device)
print('corrcoef:', corrcoef)

In [None]:
# The model was fine-tuned on Spanish data and is evaluated on the Spanish frame,
# so the label must say "español" (it previously said "inglés").
# NOTE(review): esp_ds contains the rows used for fine-tuning, so this score is
# not a held-out estimate.
print("Spearman FT español: ", eval(esp_ds))

### Juntamos características

In [None]:
# Las listas que tenemos son:
# Español: angle_esp, unigram_esp, bigram_esp, trigram_esp, diff, diff_stop
# Inglés: angle_eng, unigram_eng, bigram_eng, trigram_eng, diff_eng, diff_stop_eng

In [None]:
#lista_fin_esp = [[angle_esp[i], unigram_esp[i], bigram_esp[i], trigram_esp[i], diff[i], diff_stop[i], sbert_esp[i]] for i in range(len(angle_esp))]
#lista_fin_eng = [[angle_eng[i], unigram_eng[i], bigram_eng[i], trigram_eng[i], diff_eng[i], diff_stop_eng[i], sbert_eng[i]] for i in range(len(angle_eng))]

In [None]:
# Feature matrix per split: one row per pair with
# [AnglE, unigram, bigram, trigram, length diff, SBERT].
# NOTE(review): final_esp_test / final_eng_test are commented out (their
# angle/uni/bi/tri *_test inputs are commented out as well), yet later cells
# call .predict(final_esp_test) and .predict(final_eng_test).
final_esp_train = [[angle_esp_train[i], uni_esp_train[i], bi_esp_train[i], tri_esp_train[i], diff_esp_train[i], sbert_esp_train[i]] for i in range(len(angle_esp_train))]
final_esp_val = [[angle_esp_val[i], uni_esp_val[i], bi_esp_val[i], tri_esp_val[i], diff_esp_val[i], sbert_esp_val[i]] for i in range(len(angle_esp_val))]
#final_esp_test = [[angle_esp_test[i], uni_esp_test[i], bi_esp_test[i], tri_esp_test[i], diff_esp_test[i], sbert_esp_test[i]] for i in range(len(angle_esp_test))]

final_eng_train = [[angle_eng_train[i], uni_eng_train[i], bi_eng_train[i], tri_eng_train[i], diff_eng_train[i], sbert_eng_train[i]] for i in range(len(angle_eng_train))]
final_eng_val = [[angle_eng_val[i], uni_eng_val[i], bi_eng_val[i], tri_eng_val[i], diff_eng_val[i], sbert_eng_val[i]] for i in range(len(angle_eng_val))]
#final_eng_test = [[angle_eng_test[i], uni_eng_test[i], bi_eng_test[i], tri_eng_test[i], diff_eng_test[i], sbert_eng_test[i]] for i in range(len(angle_eng_test))]

In [None]:
# ABLATION TEST
def gen_list(*args):
  """Combine parallel feature lists into per-sample rows.

  Args:
    *args: Any number of equally meaningful iterables, one per feature.

  Returns:
    List of lists; row i holds the i-th element of every argument. As with
    ``zip``, the result is truncated to the shortest input.
  """
  return [list(row) for row in zip(*args)]

### Modelos de regresión

In [None]:
from sklearn import svm
from sklearn.ensemble import RandomForestRegressor

def ez_reg(a, x_train, x_test, y_train, y_test):
  """Fit a regressor and score it with Spearman correlation.

  Args:
    a: Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    x_train, y_train: Training features and targets.
    x_test, y_test: Evaluation features and gold targets.

  Returns:
    Tuple ``(spearman, predictions, fitted_estimator)``.
  """
  a.fit(x_train, y_train)
  predictions = a.predict(x_test)
  return spearmanr(predictions, y_test)[0], predictions, a

In [None]:
# Train each regressor on the train split and score it on the dev split.
# Every result is the (spearman, predictions, fitted_model) tuple returned by ez_reg.
svm_esp = ez_reg(svm.SVR(), final_esp_train, final_esp_val, esp_train['Score'], esp_val['Score'])
svm_eng = ez_reg(svm.SVR(), final_eng_train, final_eng_val, eng_train['Score'], eng_val['Score'])
forest_esp = ez_reg(RandomForestRegressor(max_depth=5, random_state=0), final_esp_train, final_esp_val, esp_train['Score'], esp_val['Score'])
forest_eng = ez_reg(RandomForestRegressor(max_depth=5, random_state=0), final_eng_train, final_eng_val, eng_train['Score'], eng_val['Score'])
svm_esp_epsilon = ez_reg(svm.SVR(C=1.0, epsilon=0.3), final_esp_train, final_esp_val, esp_train['Score'], esp_val['Score'])
svm_eng_epsilon = ez_reg(svm.SVR(C=1.0, epsilon=0.3), final_eng_train, final_eng_val, eng_train['Score'], eng_val['Score'])
# NOTE: the Ridge alpha differs between languages (0.9 for Spanish, 0.8 for English).
tedridge_esp = ez_reg(linear_model.Ridge(alpha=.9), final_esp_train, final_esp_val, esp_train['Score'], esp_val['Score'])
tedridge_eng = ez_reg(linear_model.Ridge(alpha=.8), final_eng_train, final_eng_val, eng_train['Score'], eng_val['Score'])

print("Español: ")
print("SVM: ", svm_esp[0], "Forest: ", forest_esp[0], "SVM Epsilon: ", svm_esp_epsilon[0], "Ridge: ", tedridge_esp[0])
print("English: ")
print("SVM: ", svm_eng[0], "Forest: ", forest_eng[0], "SVM Epsilon: ", svm_eng_epsilon[0], "Ridge: ", tedridge_eng[0])

In [None]:
#ablation_embeddings_train = gen_list(angle_esp_train, sbert_esp_train, uni_esp_train, bi_esp_train, tri_esp_train)
#ablation_embeddings_val = gen_list(angle_esp_val, sbert_esp_val, uni_esp_val, bi_esp_val, tri_esp_val)
#ablation_embeddings_test = gen_list(angle_esp_test, sbert_esp_test, uni_esp_test, bi_esp_test, tri_esp_test)

#ablation_ND_train = gen_list(uni_esp_train, bi_esp_train, tri_esp_train, diff_esp_train)
#ablation_ND_val = gen_list(uni_esp_val, bi_esp_val, tri_esp_val, diff_esp_val)
#ablation_ND_test = gen_list(uni_esp_test, bi_esp_test, tri_esp_test, diff_esp_test)


# SPANISH ablation inputs (the lists below are built from the *_esp_* features).
# N = n-gram features only; ND = n-gram features plus length difference.
ab_N_train = gen_list(uni_esp_train, bi_esp_train, tri_esp_train)
ab_N_val = gen_list(uni_esp_val, bi_esp_val, tri_esp_val)

ab_ND_train = gen_list(uni_esp_train, bi_esp_train, tri_esp_train, diff_esp_train)
ab_ND_val = gen_list(uni_esp_val, bi_esp_val, tri_esp_val, diff_esp_val)

In [None]:
def reshape(x):
  """Turn a flat sequence of values into an ``(n, 1)`` NumPy column array.

  Intended for single-feature lists; a nested (multi-feature) input is
  flattened into one column as well.
  """
  values = np.array(x)
  return values.reshape(-1, 1)

In [None]:
# Single-feature lists become (n, 1) column matrices, the 2-D shape the
# regressors expect.
# NOTE(review): sbert_esp reuses the name of the full-dataset similarity list
# computed earlier and overwrites it.
sbert_esp = reshape(sbert_esp_train)
angle_esp = reshape(angle_esp_train)
diff_esp = reshape(diff_esp_train)
# Multi-feature lists are already one row per sample; reshape(-1, 1) would
# flatten (n, k) into (n*k, 1) and break the alignment with the n labels.
ngram_esp = np.array(ab_N_train)
nd_esp = np.array(ab_ND_train)

sbert_espval = reshape(sbert_esp_val)
angle_espval = reshape(angle_esp_val)
diff_espval = reshape(diff_esp_val)
ngram_espval = np.array(ab_N_val)
nd_espval = np.array(ab_ND_val)

In [None]:
# ABLATION TEST
ab_S = ez_reg(linear_model.Ridge(alpha=.9), sbert_esp, sbert_espval, esp_train['Score'], esp_val['Score'])
ab_A = ez_reg(linear_model.Ridge(alpha=.9), angle_esp, angle_espval, esp_train['Score'], esp_val['Score'])
ab_N = ez_reg(linear_model.Ridge(alpha=.9), ab_N_train, ab_N_val, esp_train['Score'], esp_val['Score'])
ab_D = ez_reg(linear_model.Ridge(alpha=.9), diff_esp, diff_espval, esp_train['Score'], esp_val['Score'])
ab_ND = ez_reg(linear_model.Ridge(alpha=.9), ab_ND_train, ab_ND_val, esp_train['Score'], esp_val['Score'])

In [None]:
print(ab_S[0], ab_A[0], ab_N[0], ab_D[0], ab_ND[0])

0.6397963493411423 0.6212947431534506 0.6431819101891384 0.5594840672838401 0.6676691399523925


In [None]:
# Español
#SVM S = 0.6397
#SVM A = 0.6140
#SVM N = 0.6425
#SVM D = 0.5595
#SVM N.D. = 0.6560
#SVM SAND = 0.688

#RF S = 0.6058
#RF A = 0.5976
#RF N = 0.6232
#RF D = 0.5584
#RF N.D. = 0.6596
#RF SAND = 0.6783

#SVME S = 0.6310
#SVME A = 0.6038
#SVME N = 0.6419
#SVME D = 0.5586
#SVME N.D. = 0.6658
#SVME SAND = 0.6937

#Ridge S = 0.6397
#Ridge A = 0.6212
#Ridge N = 0.6431
#Ridge D = 0.5594
#Ridge N.D. = 0.6676
#Ridge SAND = 0.7029

In [None]:
# INGLÉS
#SVM S = 0.7891
#SVM A = 0.7789
#SVM N = 0.6634
#SVM D = 0.2343
#SVM N.D. = 0.6655
#SVM SAND = 0.688

#RF S = 0.7847
#RF A = 0.7737
#RF N = 0.6496
#RF D = 0.2090
#RF N.D. = 0.6584
#RF SAND = 0.8197

#SVME S = 0.7865
#SVME A = 0.7772
#SVME N = 0.6620
#SVME D = 0.2839
#SVME N.D. = 0.6776
#SVME SAND = 0.7992

#Ridge S = 0.7891
#Ridge A = 0.7789
#Ridge N = 0.6584
#Ridge D = 0.2888
#Ridge N.D. = 0.6583
#Ridge SAND = 0.7921

In [None]:
# Inglés

# Modelo usado: 0.8133
#Solo Embeddings INGLÉS: 0.81465
#{1,2,3}-Grams, Distancia: 0.66559
#{2,3}-Grams, Distancia: 0.5700
#{1,3}-Grams, Distancia: 0.66278
#{1,2}-Grams, Distancia: 0.66737
# Unigram es el más importante de los n-gramas
#{1,2,3}-Grams: 0.66340
# La distancia no aporta demasiado ptm somos unos macacos
# Embs y unigram: 0.81521 AAAAAAAAAAAAAAAAAAAAAAHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH ME LLEVA LA VERGAAAAAAAAAAAAAAAAAAAAAAA
# Embs y bigram: 0.81538 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH ME CAGO
# Embs y tri: 0.81510 Meh
# Embs, {1,2}-Gram: 0.81709 estoy a dos de matarme.
# Embs y distancia: 0.81195 XD
# Embs, {1,2,3}-Gram: 0.8164 Oficialmente me voy a disparar en la pinga

In [None]:
# Predict the Spanish test split with the fitted SVR (element 2 of the ez_reg tuple).
# NOTE(review): final_esp_test is only defined in a commented-out line, so this
# cell raises NameError on a fresh kernel until the *_test feature lines are re-enabled.
aux_esp = svm_esp[2].predict(final_esp_test)
data_esp = {'PairID': esp_test['PairID'], 'Pred_Score': aux_esp}
df_esp = pd.DataFrame(data=data_esp)
df_esp

Unnamed: 0,PairID,Pred_Score
0,ESP-test-0000,0.736883
1,ESP-test-0001,0.631972
2,ESP-test-0002,0.338420
3,ESP-test-0003,0.241777
4,ESP-test-0004,0.270513
...,...,...
595,ESP-test-0595,0.269824
596,ESP-test-0596,0.385449
597,ESP-test-0597,0.292323
598,ESP-test-0598,0.414165


In [None]:
# Predict the English test split with the fitted random forest (element 2 of the ez_reg tuple).
# NOTE(review): final_eng_test is only defined in a commented-out line, so this
# cell raises NameError on a fresh kernel until the *_test feature lines are re-enabled.
aux_eng = forest_eng[2].predict(final_eng_test)
data_eng = {'PairID': eng_test['PairID'], 'Pred_Score': aux_eng}
df_eng = pd.DataFrame(data=data_eng)
df_eng

Unnamed: 0,PairID,Pred_Score
0,ENG-test-0000,0.686044
1,ENG-test-0001,0.652077
2,ENG-test-0002,0.483478
3,ENG-test-0003,0.225006
4,ENG-test-0004,0.431744
...,...,...
2595,ENG-test-2595,0.795829
2596,ENG-test-2596,0.266357
2597,ENG-test-2597,0.444526
2598,ENG-test-2598,0.302475


In [None]:
#df_esp.to_csv('/content/drive/MyDrive/pred_esp_a.csv', index=False)
#df_eng.to_csv('/content/drive/MyDrive/pred_eng_a.csv', index=False)

In [None]:
#-----------------------------------------------------------------------------------------------------------------

In [None]:
# Alternative experiment: random 75/25 split of the full training data.
# NOTE(review): lista_fin_esp / lista_fin_eng are only defined in a commented-out
# cell, so this cell raises NameError on a fresh kernel.
from sklearn.model_selection import train_test_split
xtrain_esp, xtest_esp, ytrain_esp, ytest_esp = train_test_split(lista_fin_esp, esp_ds['Score'], test_size=0.25, random_state=69)
xtrain_eng, xtest_eng, ytrain_eng, ytest_eng = train_test_split(lista_fin_eng, eng_ds['Score'], test_size=0.25, random_state=420)

In [None]:
# I would like to use a Lasso regression (to name the variable TedLasso), but it does not work :c
# NOTE: this cell rebinds svm_*, forest_*, svm_*_epsilon and tedridge_* to models
# trained on the random split above, replacing the train/dev results.

svm_esp = ez_reg(svm.SVR(), xtrain_esp, xtest_esp, ytrain_esp, ytest_esp)
svm_eng = ez_reg(svm.SVR(), xtrain_eng, xtest_eng, ytrain_eng, ytest_eng)
forest_esp = ez_reg(RandomForestRegressor(max_depth=5, random_state=0), xtrain_esp, xtest_esp, ytrain_esp, ytest_esp)
forest_eng = ez_reg(RandomForestRegressor(max_depth=5, random_state=0), xtrain_eng, xtest_eng, ytrain_eng, ytest_eng)
svm_esp_epsilon = ez_reg(svm.SVR(C=1.0, epsilon=0.3), xtrain_esp, xtest_esp, ytrain_esp, ytest_esp)
svm_eng_epsilon = ez_reg(svm.SVR(C=1.0, epsilon=0.3), xtrain_eng, xtest_eng, ytrain_eng, ytest_eng)
tedridge_esp = ez_reg(linear_model.Ridge(alpha=.9), xtrain_esp, xtest_esp, ytrain_esp, ytest_esp)
tedridge_eng = ez_reg(linear_model.Ridge(alpha=.8), xtrain_eng, xtest_eng, ytrain_eng, ytest_eng)

print("Español: ")
print("SVM: ", svm_esp[0], "Forest: ", forest_esp[0], "SVM Epsilon: ", svm_esp_epsilon[0], "Ridge: ", tedridge_esp[0])
print("English: ")
print("SVM: ", svm_eng[0], "Forest: ", forest_eng[0], "SVM Epsilon: ", svm_eng_epsilon[0], "Ridge: ", tedridge_eng[0])

Español: 
SVM:  0.7219595192598911 Forest:  0.7153253826793448 SVM Epsilon:  0.7226385068570647 Ridge:  0.7250130556145267
English: 
SVM:  0.8423314324931402 Forest:  0.8389179584724146 SVM Epsilon:  0.8387979640233312 Ridge:  0.8395007885642257


### Formato de salida

In [None]:
def formato_salida(ytest, ds, ted):
  """Assemble predictions and gold scores into an output table.

  Args:
    ytest: Series of held-out targets; only its index (row labels of ``ds``) is used.
    ds: Dataset DataFrame providing ``PairID`` and ``Score``.
    ted: Predictions aligned with ``ytest`` (e.g. ``tedridge_eng[1]``).

  Returns:
    DataFrame with columns ``PairID``, ``Pred_Score`` and ``Score`` and a fresh
    0..n-1 index.
  """
  labels = ytest.index
  # Variant without gold scores: {'PairID': ..., 'Pred_Score': ted}
  return pd.DataFrame(data={
      'PairID': ds['PairID'].loc[labels].tolist(),
      'Pred_Score': ted,
      'Score': ds['Score'].loc[labels].tolist(),
  })

In [None]:
# Round-trip check: the Spearman score recomputed from the formatted table
# should equal tedridge_esp[0].
df_aux = formato_salida(ytest_esp, esp_ds, tedridge_esp[1])
#df_aux

spearman_prueba1 = spearmanr(df_aux['Pred_Score'], df_aux['Score'])[0]
spearman_prueba1

0.7250130556145267

In [None]:
# NOTE(review): df_prueba is not defined anywhere in the visible notebook; this
# cell only works with leftover kernel state and fails on a fresh run.
spearman_prueba = spearmanr(df_prueba['Pred_Score'], df_prueba['Score'])[0]
spearman_prueba

0.7854714796668703