# Text Mining Project

### NOVA IMS MT Metrics Shared Task

**Group members:**
- Lorenzo Pigozzi	--- m20200745
- Davide Farinati
- Antonio
- Luis Reis

## 1. Importing libraries and corpora <a class="anchor" id="1"></a>

In [19]:
# general libraries
import pandas as pd
import numpy as np
import re 
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook as tqdm

# word's preprocessing
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.stem import SnowballStemmer
from nltk.translate.bleu_score import sentence_bleu
from bs4 import BeautifulSoup
import string
from sklearn.metrics.pairwise import cosine_similarity

# sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split

# chinese library
import jieba

# LaBSE model
import tensorflow as tf
import torch
# from transformers import BertModel, BertTokenizerFast
from transformers import AutoTokenizer, AutoModel
import torch.nn.functional as F

# warnings
import warnings
warnings.filterwarnings("ignore")

In [114]:
# importing the corpora
# cs_en = pd.read_csv('corpus\cs-en\scores.csv')
# de_en = pd.read_csv('corpus\de-en\scores.csv')
# ru_en = pd.read_csv('corpus\scores_ru-en.csv')
# zh_en = pd.read_csv('corpus\zh-en\scores.csv')
en_fi = pd.read_csv('corpus\en-fi\scores.csv')
# en_zh = pd.read_csv('corpus\en-zh\scores.csv')

In [115]:
en_fi.head()

Unnamed: 0,source,reference,translation,z-score,avg-score,annotators
0,"You can turn yourself into a pineapple, a dog ...","Voit muuttaa itsesi ananasta, koirasta tai Roy...","Voit muuttaa itsesi ananakseksi, koiraksi tai ...",-0.286195,34.2,5
1,Also shot were three men: two 29-year-olds and...,Myös ammuttiin kolme miestä: kaksi 29-vuotiait...,Myös kolmea miestä ammuttiin: kahta 29-vuotias...,0.547076,58.4,5
2,The information is stored at the cash register...,Tiedot tallennetaan kassakoneisiin joka tapauk...,Tiedot kuitenkin tallentuvat kassoilla joka ta...,1.122476,74.6,5
3,Xinhua says that there were traces of hydrochl...,"Xinhua kertoo, että Xinyin näytteestä oli sunn...","Xinhua kertoo, että Xinyin sunnuntaina antamas...",0.383095,53.6,5
4,"MacDonald, who was brought on board CBC's comm...",Voitaisiin kuulla CBD: n kommenttitiimin toimi...,"MacDonaldin, joka tuli CBC:n selostajatiimiin ...",-0.493065,32.25,4


In [116]:
corpus = en_fi.copy()

#  3. Pre-processing <a class="anchor" id="3."></a>

In [117]:
# selecting the necessary variables for the baseline
source_reference = corpus[['source','reference']]
source_translation = corpus[['source','translation']]

In [118]:
source_reference.head()

Unnamed: 0,source,reference
0,"You can turn yourself into a pineapple, a dog ...","Voit muuttaa itsesi ananasta, koirasta tai Roy..."
1,Also shot were three men: two 29-year-olds and...,Myös ammuttiin kolme miestä: kaksi 29-vuotiait...
2,The information is stored at the cash register...,Tiedot tallennetaan kassakoneisiin joka tapauk...
3,Xinhua says that there were traces of hydrochl...,"Xinhua kertoo, että Xinyin näytteestä oli sunn..."
4,"MacDonald, who was brought on board CBC's comm...",Voitaisiin kuulla CBD: n kommenttitiimin toimi...


## 3.1. Cleaning <a class="anchor" id="3.1."></a>

In [125]:
def clean(text_list,
          lower = False,
          keep_numbers = False,
          keep_expression = False,
          remove_char = False,
          remove_stop = False,
          remove_tag = False,
          lemmatize = False,
          stemmer = False,
          english = True
          ):
    """
    Function that a receives a list of strings and preprocesses it.
    
    :param text_list: List of strings.
    :param lemmatize: Tag to apply lemmatization if True.
    :param stemmer: Tag to apply the stemmer if True.
    """
    if english:
        lang = 'english'
    else:
        lang = 'finnish'
    
    stop = set(stopwords.words(lang))
    stem = SnowballStemmer(lang)
    
    updates = []
    for j in range(len(text_list)):
        
        text = text_list[j]
        
        #LOWERCASE TEXT
        if lower:
            text = text.lower()
            
        #KEEP NUMBERS AS TOKENS
        if not keep_numbers:
            text = re.sub("[\d+]", 'X', text)
        
        #KEEP '?' and '!' AS TOKENS
        if not keep_expression:
            text = re.sub("[\?|\!]", 'EXPRESSION', text)
            
        #REMOVE TAGS
        if remove_tag:
            text = BeautifulSoup(text).get_text()
            
        #REMOVE THAT IS NOT TEXT
        if remove_char:
            text = re.sub("[^a-zA-Z]", ' ', text)
        
        #REMOVE STOP WORDS
        if remove_stop:
            text = ' '.join([word for word in text.split(' ') if word not in stop])
        
        #LEMMATIZATION
        if lemmatize:
            if english:
                lemma = WordNetLemmatizer()
                text = " ".join(lemma.lemmatize(word) for word in text.split())
        
        #STEMMER
        if stemmer:
            text = " ".join(stem.stem(word) for word in text.split())
        
        updates.append(text)
        
    return updates

def clean_ch(text_list, keep_numbers=False, remove_punctuation=False, remove_stop = False, stopwords_set='merged'):
    """
    Function that removes chinese stopwords
    
    :param stopwords_set: remove words of both sets (merged), just the 1st (fst) or just the second (snd) 
    """
    updates = []
    
    zh_stopwords1 = [line.strip() for line in open('chinese_stopwords/chinese_stopwords1.txt', 'r', encoding='utf-8').readlines()]
    zh_stopwords2 = [line.strip() for line in open('chinese_stopwords/chinese_stopwords2.txt', 'r', encoding='utf-8').readlines()]
    
    if stopwords_set == 'merged':
        stop = list(set(zh_stopwords1 + zh_stopwords2))
    elif stopwords_set == 'fst':
        stop = zh_stopwords1
    elif stopwords_set == 'snd':
        stop = zh_stopwords2

    for j in range(len(text_list)):
        
        text = text_list[j]
        
        #KEEP NUMBERS AS TOKENS
        if keep_numbers:
            text = re.sub("[\d+]", 'X', text)
        
        # REMOVE PUNCTUATION
        if remove_punctuation:
            # https://stackoverflow.com/questions/36640587/how-to-remove-chinese-punctuation-in-python
            punc = "！？｡。＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏."
            text = re.sub(r"[%s]+" %punc, "", text)
        
        # REMOVE STOP WORDS
        if remove_stop:
            pretext = text
            text = ' '.join([word for word in jieba.cut(text) if word not in stop])
            
        updates.append(text)
        
    return updates

def update_df(dataframe, list_updated):
    dataframe.update(pd.DataFrame({"Text": list_updated}))

In [126]:
source_reference.head()

Unnamed: 0,source,reference
0,"You can turn yourself into a pineapple, a dog ...","Voit muuttaa itsesi ananasta, koirasta tai Roy..."
1,Also shot were three men: two 29-year-olds and...,Myös ammuttiin kolme miestä: kaksi 29-vuotiait...
2,The information is stored at the cash register...,Tiedot tallennetaan kassakoneisiin joka tapauk...
3,Xinhua says that there were traces of hydrochl...,"Xinhua kertoo, että Xinyin näytteestä oli sunn..."
4,"MacDonald, who was brought on board CBC's comm...",Voitaisiin kuulla CBD: n kommenttitiimin toimi...


In [127]:
source_translation.head()

Unnamed: 0,source,translation
0,"You can turn yourself into a pineapple, a dog ...","Voit muuttaa itsesi ananakseksi, koiraksi tai ..."
1,Also shot were three men: two 29-year-olds and...,Myös kolmea miestä ammuttiin: kahta 29-vuotias...
2,The information is stored at the cash register...,Tiedot kuitenkin tallentuvat kassoilla joka ta...
3,Xinhua says that there were traces of hydrochl...,"Xinhua kertoo, että Xinyin sunnuntaina antamas..."
4,"MacDonald, who was brought on board CBC's comm...","MacDonaldin, joka tuli CBC:n selostajatiimiin ..."


In [128]:
# # ENGLISH - CHINESE
# source_reference['reference'] = clean_ch(source_reference['reference'], 
#                                 keep_numbers = False,
#                                 remove_punctuation = False,
#                                 remove_stop = True,
#                                 stopwords_set = 'snd')

# source_reference['source'] = clean(source_reference['source'], 
#                                       lower = True, 
#                                       remove_char = True,
#                                       remove_stop = True,
#                                       lemmatize = True,
#                                       stemmer = False,
#                                       english = True)

# # CHINESE - ENGLISH
# source_reference['source'] = clean_ch(source_reference['source'], 
#                                 keep_numbers = False,
#                                 remove_punctuation = False,
#                                 remove_stop = True,
#                                 stopwords_set = 'snd')

# source_reference['reference'] = clean(source_reference['reference'], 
#                                       lower = True, 
#                                       remove_char = True,
#                                       remove_stop = True,
#                                       lemmatize = True,
#                                       stemmer = False,
#                                       english = True)


# ENGLISH - FINNISH
source_reference['reference'] = clean(source_reference['reference'], 
                                      lower = True, 
                                      remove_char = True,
                                      remove_stop = True,
                                      lemmatize = True,
                                      stemmer = False,
                                      english = False)

source_reference['source'] = clean(source_reference['source'], 
                                      lower = True, 
                                      remove_char = True,
                                      remove_stop = True,
                                      lemmatize = True,
                                      stemmer = False,
                                      english = True)

In [129]:
# # ENGLISH - CHINESE
# source_translation['translation'] = clean_ch(source_translation['translation'], 
#                                 keep_numbers = False,
#                                 remove_punctuation = False,
#                                 remove_stop = True,
#                                 stopwords_set = 'snd')

# source_translation['source'] = clean(source_translation['source'], 
#                                           lower = True, 
#                                            remove_char = True,
#                                            remove_stop = True,
#                                             lemmatize = True,
#                                             stemmer = False,
#                                             english = True)


# # CHINESE - ENGLISH
# source_translation['source'] = clean_ch(source_translation['source'], 
#                                 keep_numbers = False,
#                                 remove_punctuation = False,
#                                 remove_stop = True,
#                                 stopwords_set = 'snd')

# source_translation['translation'] = clean(source_translation['translation'], 
#                                           lower = True, 
#                                            remove_char = True,
#                                            remove_stop = True,
#                                             lemmatize = True,
#                                             stemmer = False,
#                                             english = True)


# ENGLISH - FINNISH
source_translation['translation'] = clean(source_translation['translation'], 
                                          lower = True, 
                                           remove_char = True,
                                           remove_stop = True,
                                            lemmatize = True,
                                            stemmer = False,
                                            english = False)

source_translation['source'] = clean(source_translation['source'], 
                                          lower = True, 
                                           remove_char = True,
                                           remove_stop = True,
                                            lemmatize = True,
                                            stemmer = False,
                                            english = True)

In [130]:
source_reference.head()

Unnamed: 0,source,reference
0,turn pineapple dog character befitting roy lic...,voit muuttaa itsesi ananasta koirasta roy lic...
1,also shot three men two XX year old one XX yea...,my s ammuttiin kolme miest kaksi XX vuotiait...
2,information stored cash register anyway whethe...,tiedot tallennetaan kassakoneisiin tapauksessa...
3,xinhua say trace hydrochlorothiazide diuretic ...,xinhua kertoo ett xinyin n ytteest sunnunta...
4,macdonald brought board cbc commentary team pr...,voitaisiin kuulla cbd n kommenttitiimin toimi...


In [131]:
source_translation.head() 

Unnamed: 0,source,translation
0,turn pineapple dog character befitting roy lic...,voit muuttaa itsesi ananakseksi koiraksi hahm...
1,also shot three men two XX year old one XX yea...,my s kolmea miest ammuttiin kahta XX vuotias...
2,information stored cash register anyway whethe...,tiedot kuitenkin tallentuvat kassoilla tapauks...
3,xinhua say trace hydrochlorothiazide diuretic ...,xinhua kertoo ett xinyin sunnuntaina antamas...
4,macdonald brought board cbc commentary team pr...,macdonaldin tuli cbc n selostajatiimiin tuoma...


# LaBSE

https://ai.googleblog.com/2020/08/language-agnostic-bert-sentence.html \
https://arxiv.org/abs/2007.01852 \
https://tfhub.dev/google/LaBSE/1 \
https://pytorch.org/docs/stable/generated/torch.matmul.html

\
Pre-trained Model: https://huggingface.co/sentence-transformers/LaBSE

In [132]:
# For similarity between sentences, an L2-norm is recommended before calculating the similarity
def similarity(embeddings_1, embeddings_2):
    normalized_embeddings_1 = F.normalize(embeddings_1, p=2)
    normalized_embeddings_2 = F.normalize(embeddings_2, p=2)
    return torch.matmul(
        normalized_embeddings_1, normalized_embeddings_2.transpose(0, 1)
    )

In [133]:
# tokenizer = BertTokenizerFast.from_pretrained("setu4993/LaBSE")
# model = BertModel.from_pretrained("setu4993/LaBSE")
# model = model.eval()

tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/LaBSE")
model = AutoModel.from_pretrained("sentence-transformers/LaBSE")

Some weights of the model checkpoint at sentence-transformers/LaBSE were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [134]:
sources = list(source_reference['source'].head(500))
references = list(source_reference['reference'].head(500))
translations = list(source_translation['translation'].head(500))

# german_source = list(deen_reference['source'])
# english_reference = list(deen_reference['reference'])

In [138]:
source_inputs = tokenizer(sources, return_tensors="pt", padding=True)
reference_inputs = tokenizer(references, return_tensors="pt", padding=True)
translation_inputs = tokenizer(translations, return_tensors="pt", padding=True)

with torch.no_grad():
    source_outputs = model(**source_inputs)
    reference_outputs = model(**reference_inputs)
    translation_outputs = model(**translation_inputs)

In [139]:
# To get the sentence embeddings, use the pooler output
source_embeddings = source_outputs.pooler_output
reference_embeddings = reference_outputs.pooler_output
translation_embeddings = translation_outputs.pooler_output

In [141]:
matrix_reference = similarity(reference_embeddings, source_embeddings)
diagonal_reference = pd.Series(tf.linalg.tensor_diag_part(matrix_reference))

matrix_translation = similarity(translation_embeddings, source_embeddings)
diagonal_translation = pd.Series(tf.linalg.tensor_diag_part(matrix_translation))

## Diagonal of the embedding matrices

In [142]:
result = pd.concat([diagonal_reference, diagonal_translation, corpus['z-score'].head(500), 
                    corpus['avg-score'].head(500)], axis = 1)
result.columns = ['source_reference_similarity', 'source_translation_similarity', 'z-score', 'avg-score']
result['difference_similarity'] = result['source_reference_similarity'] - result['source_translation_similarity']

In [76]:
print('Source : CHINESE   |  Reference and Translation : ENGLISH')
print('Pearson correlation difference_similarity and z-score: ')
print(result[['z-score', 'difference_similarity']].corr(method='pearson').iloc[1:2,:1].values[0][0])
print('---------------------------------------------------------------------')
print('---------------------------------------------------------------------')
print('Pearson correlation source_translation_similarity and z-score: ')
print(result[['z-score', 'source_translation_similarity']].corr(method='pearson').iloc[1:2,:1].values[0][0])
print('---------------------------------------------------------------------')
print('Kendall correlation source_translation_similarity and z-score: ')
print(result[['z-score', 'source_translation_similarity']].corr(method='kendall').iloc[1:2,:1].values[0][0])

Source : CHINESE   |  Reference and Translation : ENGLISH
Pearson correlation difference_similarity and z-score: 
0.09269746386076858
---------------------------------------------------------------------
---------------------------------------------------------------------
Pearson correlation source_translation_similarity and z-score: 
0.30105157682509415
---------------------------------------------------------------------
Kendall correlation source_translation_similarity and z-score: 
0.16196237654836318


In [108]:
print('Source : ENGLISH   |  Reference and Translation : CHINESE')
print('Pearson correlation difference_similarity and z-score: ')
print(result[['z-score', 'difference_similarity']].corr(method='pearson').iloc[1:2,:1].values[0][0])
print('---------------------------------------------------------------------')
print('---------------------------------------------------------------------')
print('Pearson correlation source_translation_similarity and z-score: ')
print(result[['z-score', 'source_translation_similarity']].corr(method='pearson').iloc[1:2,:1].values[0][0])
print('---------------------------------------------------------------------')
print('Kendall correlation source_translation_similarity and z-score: ')
print(result[['z-score', 'source_translation_similarity']].corr(method='kendall').iloc[1:2,:1].values[0][0])

Source : ENGLISH   |  Reference and Translation : CHINESE
Pearson correlation difference_similarity and z-score: 
0.004173359039506843
---------------------------------------------------------------------
---------------------------------------------------------------------
Pearson correlation source_translation_similarity and z-score: 
0.23716163737944085
---------------------------------------------------------------------
Kendall correlation source_translation_similarity and z-score: 
0.11737332024675519


In [143]:
print('Source : ENGLISH   |  Reference and Translation : FINNISH')
print('Pearson correlation difference_similarity and z-score: ')
print(result[['z-score', 'difference_similarity']].corr(method='pearson').iloc[1:2,:1].values[0][0])
print('---------------------------------------------------------------------')
print('---------------------------------------------------------------------')
print('Pearson correlation source_translation_similarity and z-score: ')
print(result[['z-score', 'source_translation_similarity']].corr(method='pearson').iloc[1:2,:1].values[0][0])
print('---------------------------------------------------------------------')
print('Kendall correlation source_translation_similarity and z-score: ')
print(result[['z-score', 'source_translation_similarity']].corr(method='kendall').iloc[1:2,:1].values[0][0])

Source : ENGLISH   |  Reference and Translation : FINNISH
Pearson correlation difference_similarity and z-score: 
-0.14494737709508357
---------------------------------------------------------------------
---------------------------------------------------------------------
Pearson correlation source_translation_similarity and z-score: 
0.19488241594825306
---------------------------------------------------------------------
Kendall correlation source_translation_similarity and z-score: 
0.11780494534551317


## Entire embedding vector for each sentence

In [144]:
matrix_reference

tensor([[0.7786, 0.1385, 0.1194,  ..., 0.1172, 0.1874, 0.1633],
        [0.0607, 0.8447, 0.1732,  ..., 0.1481, 0.1821, 0.1622],
        [0.0430, 0.2108, 0.7832,  ..., 0.1987, 0.1531, 0.0591],
        ...,
        [0.2070, 0.1215, 0.2416,  ..., 0.7976, 0.2308, 0.1462],
        [0.2002, 0.1702, 0.0799,  ..., 0.2109, 0.8104, 0.3737],
        [0.0175, 0.1822, 0.1830,  ..., 0.1495, 0.2263, 0.7588]])

In [145]:
matrix_translation

tensor([[0.7575, 0.1540, 0.1176,  ..., 0.1483, 0.1901, 0.1483],
        [0.0538, 0.8251, 0.1684,  ..., 0.1205, 0.1906, 0.1506],
        [0.0645, 0.2404, 0.8107,  ..., 0.1991, 0.0960, 0.0617],
        ...,
        [0.2051, 0.0873, 0.2847,  ..., 0.6929, 0.2073, 0.0956],
        [0.1859, 0.1360, 0.0776,  ..., 0.2224, 0.7727, 0.3438],
        [0.0725, 0.1919, 0.1693,  ..., 0.1600, 0.3125, 0.7768]])

In [161]:
cosine_similarity([matrix_reference[0].tolist()], [matrix_translation[0].tolist()])[0][0]

0.9888775724611849

In [146]:
cos = []
for i in range(len(matrix_reference)):
    cos.append(cosine_similarity([matrix_reference[i].tolist()], [matrix_translation[i].tolist()])[0][0])

In [147]:
result = pd.concat([pd.Series(cos), corpus['z-score'].head(500), corpus['avg-score'].head(500)], axis = 1)
result.columns = ['cosine_scores', 'z-score', 'avg-score']
result.head()

Unnamed: 0,cosine_scores,z-score,avg-score
0,0.988878,-0.286195,34.2
1,0.9975,0.547076,58.4
2,0.973626,1.122476,74.6
3,0.980316,0.383095,53.6
4,0.975524,-0.493065,32.25


In [74]:
print('Source : CHINESE   |  Reference and Translation : ENGLISH')
print('---------------------------------------------------------------------')
print('Pearson correlation source_translation_similarity and z-score: ')
print(result[['z-score', 'cosine_scores']].corr(method='pearson').iloc[1:2,:1].values[0][0])
print('---------------------------------------------------------------------')
print('Kendall correlation source_translation_similarity and z-score: ')
print(result[['z-score', 'cosine_scores']].corr(method='kendall').iloc[1:2,:1].values[0][0])

Source : CHINESE   |  Reference and Translation : ENGLISH
---------------------------------------------------------------------
Pearson correlation source_translation_similarity and z-score: 
0.4507465848338922
---------------------------------------------------------------------
Kendall correlation source_translation_similarity and z-score: 
0.32659983086897115


In [112]:
print('Source : ENGLISH   |  Reference and Translation : CHINESE')
print('---------------------------------------------------------------------')
print('Pearson correlation source_translation_similarity and z-score: ')
print(result[['z-score', 'cosine_scores']].corr(method='pearson').iloc[1:2,:1].values[0][0])
print('---------------------------------------------------------------------')
print('Kendall correlation source_translation_similarity and z-score: ')
print(result[['z-score', 'cosine_scores']].corr(method='kendall').iloc[1:2,:1].values[0][0])

Source : ENGLISH   |  Reference and Translation : CHINESE
---------------------------------------------------------------------
Pearson correlation source_translation_similarity and z-score: 
0.2838280909635055
---------------------------------------------------------------------
Kendall correlation source_translation_similarity and z-score: 
0.22439300379231272


In [148]:
print('Source : ENGLISH   |  Reference and Translation : FINNISH')
print('---------------------------------------------------------------------')
print('Pearson correlation source_translation_similarity and z-score: ')
print(result[['z-score', 'cosine_scores']].corr(method='pearson').iloc[1:2,:1].values[0][0])
print('---------------------------------------------------------------------')
print('Kendall correlation source_translation_similarity and z-score: ')
print(result[['z-score', 'cosine_scores']].corr(method='kendall').iloc[1:2,:1].values[0][0])

Source : ENGLISH   |  Reference and Translation : FINNISH
---------------------------------------------------------------------
Pearson correlation source_translation_similarity and z-score: 
0.29150711128441614
---------------------------------------------------------------------
Kendall correlation source_translation_similarity and z-score: 
0.21189399434259404


# Regression

In [49]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity

In [141]:
data = result[['source_reference_similarity', 'source_translation_similarity', 'z-score']]

In [142]:
X = data.drop(['z-score'], axis=1)
y = data['z-score']

X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = True, test_size = 0.2, random_state = 7)

In [143]:
baseline_regressor = LinearRegression()
baseline_regressor.fit(X_train, y_train)

LinearRegression()

In [144]:
y_pred = baseline_regressor.predict(X_test)

In [145]:
baseline_r2_test = baseline_regressor.score(X_test, y_test)

print(f'Baseline R^2 score on test set : {baseline_r2_test}')

Baseline R^2 score on test set : 0.14687139609738986


In [None]:
y_pred
baseline_corr_train, baseline_corr_train_pvalue = pearsonr(y_train, cos_train)