[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1mUDx4uFpbS6jrD7lN-P7F7saefP_QJ31?usp=sharing)

# Word embedding with Word2Vec

Disclaimer : this notebook is a mix of our function implemented into the lab session "Week 9". A lot of cellules are taking from the lab session (Word embedding with Word2Vec). 

## Load data via GitHub and load required packages 

In [1]:
#Training data and import some packages
import pandas as pd
import numpy as np
path = "https://raw.githubusercontent.com/Lirette2/DMML2021_Apple/main/data/training_data.csv"
df = pd.read_csv(path, index_col=0)
#df = pd.read_csv('/content/training_data.csv')

In [2]:
# Import required packages
!python -m spacy download fr_core_news_sm

#Import fr_core_news_sm
import spacy
from spacy import displacy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import matplotlib.pyplot as plt
import seaborn as sns

Collecting fr_core_news_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-2.2.5/fr_core_news_sm-2.2.5.tar.gz (14.7 MB)
[K     |████████████████████████████████| 14.7 MB 4.4 MB/s 
Building wheels for collected packages: fr-core-news-sm
  Building wheel for fr-core-news-sm (setup.py) ... [?25l[?25hdone
  Created wheel for fr-core-news-sm: filename=fr_core_news_sm-2.2.5-py3-none-any.whl size=14727026 sha256=ffd976a80a9af9d07a8bcbd461891bb89d79223335da173bea042f20a75891f0
  Stored in directory: /tmp/pip-ephem-wheel-cache-99u3tu6j/wheels/c9/a6/ea/0778337c34660027ee67ef3a91fb9d3600b76777a912ea1c24
Successfully built fr-core-news-sm
Installing collected packages: fr-core-news-sm
Successfully installed fr-core-news-sm-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('fr_core_news_sm')


In [3]:
# Import additional packages
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
import string
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

from spacy.lang.fr.stop_words import STOP_WORDS
from spacy.lang.fr.examples import sentences 
from spacy.lang.fr import French
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

In [4]:
#Look at the data
df.head()

Unnamed: 0_level_0,sentence,difficulty
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Les coûts kilométriques réels peuvent diverger...,C1
1,"Le bleu, c'est ma couleur préférée mais je n'a...",A1
2,Le test de niveau en français est sur le site ...,A1
3,Est-ce que ton mari est aussi de Boston?,A1
4,"Dans les écoles de commerce, dans les couloirs...",B1


In [5]:
df_pred = pd.read_csv(path, index_col=0)
df_pred.head()

Unnamed: 0_level_0,sentence,difficulty
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Les coûts kilométriques réels peuvent diverger...,C1
1,"Le bleu, c'est ma couleur préférée mais je n'a...",A1
2,Le test de niveau en français est sur le site ...,A1
3,Est-ce que ton mari est aussi de Boston?,A1
4,"Dans les écoles de commerce, dans les couloirs...",B1


In [6]:
path = "https://raw.githubusercontent.com/Lirette2/DMML2021_Apple/main/data/sample_submission.csv"
df_example_submission = pd.read_csv(path, index_col=0)
df_example_submission.head()

Unnamed: 0_level_0,difficulty
id,Unnamed: 1_level_1
0,A1
1,A1
2,A1
3,A1
4,A1


## Starting with Word Embedding with Word2Vec


In [7]:
# Import vocabulary librairie from spaCy
sp = !python -m spacy download fr_core_news_sm

In [8]:
#Import package used for Word2Vec
from gensim.models import Word2Vec

In [9]:
# Create a list of punctuation marks
punctuations = string.punctuation
punctuations

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [10]:
# Create a list of stopwords
stop_words = spacy.lang.fr.stop_words.STOP_WORDS 

list(stop_words)[:10]

['tend',
 'siens',
 'autres',
 'nôtres',
 'là',
 'me',
 'suffit',
 'floc',
 'lesquels',
 'effet']

In [11]:
# Load French language model
# Load French language model
import fr_core_news_sm
#sp = spacy.load('en_core_web_sm')
sp = fr_core_news_sm.load()

# Create tokenizer function
def spacy_tokenizer(sentence):
    # Create token object, which is used to create documents with linguistic annotations.
    mytokens = sp(sentence)

    # Lemmatize each token and convert each token into lowercase
    mytokens = [ word.lemma_.lower().strip() for word in mytokens ]

    # Remove stop words and punctuation
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]

    # Return preprocessed list of tokens
    return mytokens

# Example
#New_sentence = df["sentence"].sample()
New_sentence = df["sentence"].head()
New_sentence.values[0]

"Les coûts kilométriques réels peuvent diverger sensiblement des valeurs moyennes en fonction du moyen de transport utilisé, du taux d'occupation ou du taux de remplissage, de l'infrastructure utilisée, de la topographie des lignes, du flux de trafic, etc."

In [12]:
spacy_tokenizer(New_sentence.values[0])

['coût',
 'kilométrique',
 'réel',
 'pouvoir',
 'diverger',
 'sensiblemer',
 'valeur',
 'moyenner',
 'fonction',
 'moyen',
 'transport',
 'utiliser',
 'taux',
 'occupation',
 'taux',
 'remplissage',
 'infrastructure',
 'utiliser',
 'topographie',
 'ligne',
 'flux',
 'trafic',
 'etc.']

In [13]:
#Vectorization Feature Engineering (TF-IDF)
tfidf_vector = TfidfVectorizer(tokenizer=spacy_tokenizer) # we use the above defined tokenizer

In [14]:
texts = df['sentence']


In [15]:
# Create tokenizer function for preprocessing
def spacy_tokenizer(text):

    # Define stopwords, punctuation, and numbers
    stop_words = spacy.lang.fr.stop_words.STOP_WORDS
    punctuations = string.punctuation
    numbers = "0123456789"

    # Create spacy object
    mytokens = sp(text)

    # Lemmatize each token and convert each token into lowercase
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]

    # Remove stop words and punctuation
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]

    # Remove sufix like ".[1" in "experience.[1"
    mytokens_2 = []
    for word in mytokens:
      for char in word:
        if (char in punctuations) or (char in numbers):
          word = word.replace(char, "")
      if word != "":
        mytokens_2.append(word)

    # Return preprocessed list of tokens
    return mytokens_2

# Tokenize texts
processed_texts = []
for text in texts:
  processed_text = spacy_tokenizer(text)
  processed_texts.append(processed_text)

In [16]:
#Use this lines only if we want to check
#for processed_text in processed_texts:
#  print(processed_text[:20])

In [17]:
# Word embedding 
### Parameters: 
#     - min_count: minimum number of occurence of single word in corpus to be taken into account
#     - size: dimension of the vectors representing the tokens
#     - IMPORTANT: processed_texts must be a list of lists of tokens object!
word2vec = Word2Vec(processed_texts, min_count=2, size=100)
vocab = word2vec.wv.vocab
print(vocab)

{'coût': <gensim.models.keyedvectors.Vocab object at 0x7f837114d650>, 'kilométrique': <gensim.models.keyedvectors.Vocab object at 0x7f837114d690>, 'réel': <gensim.models.keyedvectors.Vocab object at 0x7f837114d750>, 'pouvoir': <gensim.models.keyedvectors.Vocab object at 0x7f837114d7d0>, 'valeur': <gensim.models.keyedvectors.Vocab object at 0x7f837114d810>, 'moyenner': <gensim.models.keyedvectors.Vocab object at 0x7f837114d890>, 'fonction': <gensim.models.keyedvectors.Vocab object at 0x7f837114d8d0>, 'moyen': <gensim.models.keyedvectors.Vocab object at 0x7f837114d990>, 'transport': <gensim.models.keyedvectors.Vocab object at 0x7f837114d850>, 'utiliser': <gensim.models.keyedvectors.Vocab object at 0x7f837114d790>, 'taux': <gensim.models.keyedvectors.Vocab object at 0x7f837114d910>, 'occupation': <gensim.models.keyedvectors.Vocab object at 0x7f837114d950>, 'infrastructure': <gensim.models.keyedvectors.Vocab object at 0x7f837114d710>, 'topographie': <gensim.models.keyedvectors.Vocab object

In [18]:
# Vector
v1 = word2vec.wv['terre'] 
v1

array([-6.77017868e-03, -8.98337923e-03, -5.61073842e-03,  1.50320912e-03,
       -4.66752192e-03,  4.83976165e-03, -7.78126204e-03,  2.55223922e-03,
        1.32509200e-02,  2.25790311e-03,  5.52995596e-03, -4.84859338e-03,
        9.56962630e-03, -4.23990004e-03,  4.16849134e-03, -6.64889114e-03,
       -3.69382254e-03, -6.93685655e-03,  4.31417441e-03, -1.13721825e-02,
        2.46695825e-03, -4.11078101e-03,  8.78430158e-03,  5.13461372e-03,
       -8.62661935e-03,  4.96520102e-03, -1.25561543e-02, -4.88891127e-03,
       -2.88186129e-05,  1.84362731e-03,  1.43817514e-02, -4.75228438e-03,
        5.27692121e-03,  2.40144972e-03,  5.78543870e-03, -7.12854089e-03,
       -9.06314608e-03, -2.84390035e-03,  5.52966818e-03,  2.44042254e-03,
       -5.83378086e-03, -5.23230759e-03,  2.77589029e-03, -1.72105823e-02,
       -9.49604344e-03,  3.78106488e-03, -1.45562482e-03,  7.37843686e-04,
        1.76315615e-03,  7.50355097e-03, -1.10176415e-03, -4.68868343e-03,
       -4.00114665e-03, -

In [19]:
# Similar vectors/words
sim_words = word2vec.wv.most_similar('terre')
sim_words

[('femme', 0.9038424491882324),
 ('voir', 0.8998454809188843),
 ('devoir', 0.8974031805992126),
 ('mettre', 0.8955041766166687),
 ('fin', 0.8954460620880127),
 ('y', 0.8950017690658569),
 ('aller', 0.8928413987159729),
 ('grand', 0.8909603953361511),
 ('pouvoir', 0.8888776898384094),
 ('al', 0.8874717950820923)]

In [20]:
# Similarity between two words
word2vec.wv.similarity('terre', 'apprentissage')

0.6413261

In [21]:
#### **** ***************** **** ####
#### **** RAW TEXT FEATURES **** ####
#### **** ***************** **** ####

# Count tokens per sentence
def count_token(sent):
  return(len(spacy_tokenizer(sent))) #spacy_tokenizer() to get tokens, len() to count them

# Count raw words per sentence
def count_words(sent):
  return(len(sent.split())) #split() gives us individual words, len() counts them

#Get average character length of word
def count_avg_word_character(sent):
  words = sent.split()
  return(sum(len(word) for word in words) / len(words))

def count_avg_token_character(sent):
  words = spacy_tokenizer(sent)
  if len(words) == 0:
    return(0)
  else:
    return(sum(len(word) for word in words) / len(words))

In [22]:
#### **** **************** **** ####
#### **** LEXICAL FEATURES **** ####
#### **** **************** **** ####

# Lexical Diversity
def lex_div_word(sent):
  total_number_word = len(sent.split())
  unique = set(sent.split())
  return(len(unique)/total_number_word)
#We don't apply for token, as the goal of the tokenzization is to be left with 
#unique tokens
#For tokens, we should apply to the whole text as done by tfidf_vector

# Lexical Density
def lex_den_tokens(sent):
  st = spacy_tokenizer(sent)
  string = " ".join([str(item) for item in st])
  x = sp(string)
  counter = 0 
  for token in x:
    if token.pos_ == "NOUN" or token.pos_ == "ADJ" or token.pos_ == "VERB" or token.pos_ == "ADV":
      counter = counter + 1
  if len(st) == 0:
    return(0)
  else:
    return(counter/len(st))

def lex_den_words(sent):
  x = sp(sent)
  counter = 0 
  for token in x:
    if token.pos_ == "NOUN" or token.pos_ == "ADJ" or token.pos_ == "VERB" or token.pos_ == "ADV":
      counter = counter + 1
  return(counter/len(x))

# Words NOT in frequent list
path = "https://raw.githubusercontent.com/Lirette2/DMML2021_Apple/main/data/list_words.csv"
words = pd.read_csv(path, index_col=0)


def words_list(sent):
  unique = set(sent.split())
  counter = 0
  for word_in_sentence in unique:
    for word_in_list in words.Mots:
      if word_in_sentence == word_in_list:
        counter = counter + 1
        break#we stop comparing once the word in found, to make it faster
  return(counter/len(unique))


def token_list(sent):
  unique = spacy_tokenizer(sent)
  counter = 0
  for word_in_sentence in unique:
    for word_in_list in words.Mots:
      if word_in_sentence == word_in_list:
        counter = counter + 1
        break#we stop comparing once the word in found, to make it faster
  return(counter/len(unique))

In [23]:
raw_word_count = pd.Series(texts.apply(count_words),name="raw_word_count")
token_count = pd.Series(texts.apply(count_token),name="token_count")
avg_chr_word = pd.Series(texts.apply(count_avg_word_character),name="avg_chr_word")
avg_chr_token = pd.Series(texts.apply(count_avg_token_character),name="avg_chr_token")
diversity_word = pd.Series(texts.apply(lex_div_word),name="diversity_word")
density_word = pd.Series(texts.apply(lex_den_words),name="density_word")
density_token = pd.Series(texts.apply(lex_den_tokens),name="density_token")
#freq_word_list = pd.Series(df.sentence.apply(words_not_list),name="freq_word_list")
freq_token_list = pd.Series(texts.apply(words_list),name="freq_token_list")

new_df = pd.concat([df,raw_word_count,token_count,avg_chr_word,avg_chr_token,
                    diversity_word,density_word,density_token,freq_token_list],axis=1)
new_df

Unnamed: 0_level_0,sentence,difficulty,raw_word_count,token_count,avg_chr_word,avg_chr_token,diversity_word,density_word,density_token,freq_token_list
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,Les coûts kilométriques réels peuvent diverger...,C1,38,23,5.736842,7.391304,0.763158,0.488889,0.956522,0.206897
1,"Le bleu, c'est ma couleur préférée mais je n'a...",A1,12,5,4.250000,5.400000,1.000000,0.375000,0.400000,0.416667
2,Le test de niveau en français est sur le site ...,A1,13,6,4.153846,5.833333,0.923077,0.400000,1.000000,0.500000
3,Est-ce que ton mari est aussi de Boston?,A1,8,2,4.125000,5.000000,1.000000,0.400000,0.500000,0.750000
4,"Dans les écoles de commerce, dans les couloirs...",B1,34,16,5.176471,6.062500,0.823529,0.380952,0.812500,0.285714
...,...,...,...,...,...,...,...,...,...,...
4795,"C'est pourquoi, il décida de remplacer les hab...",B2,26,12,5.384615,7.500000,1.000000,0.500000,0.833333,0.423077
4796,Il avait une de ces pâleurs splendides qui don...,C1,21,10,4.666667,5.700000,0.952381,0.454545,1.000000,0.250000
4797,"Et le premier samedi de chaque mois, venez ren...",A2,14,6,4.785714,6.666667,0.928571,0.466667,1.000000,0.461538
4798,Les coûts liés à la journalisation n'étant pas...,C2,32,16,6.093750,8.500000,0.875000,0.540541,0.937500,0.321429


In [24]:
from sklearn.preprocessing import MinMaxScaler
#Scale the data
scaler = MinMaxScaler()
col_to_scale = ["raw_word_count","token_count","avg_chr_word","avg_chr_token"]
#no need for the others because they already are on a scale from 0 to 1

df[col_to_scale]= scaler.fit_transform(new_df[col_to_scale])
df

Unnamed: 0_level_0,sentence,difficulty,raw_word_count,token_count,avg_chr_word,avg_chr_token
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,Les coûts kilométriques réels peuvent diverger...,C1,0.140152,0.190083,0.339713,0.615942
1,"Le bleu, c'est ma couleur préférée mais je n'a...",A1,0.041667,0.041322,0.204545,0.450000
2,Le test de niveau en français est sur le site ...,A1,0.045455,0.049587,0.195804,0.486111
3,Est-ce que ton mari est aussi de Boston?,A1,0.026515,0.016529,0.193182,0.416667
4,"Dans les écoles de commerce, dans les couloirs...",B1,0.125000,0.132231,0.288770,0.505208
...,...,...,...,...,...,...
4795,"C'est pourquoi, il décida de remplacer les hab...",B2,0.094697,0.099174,0.307692,0.625000
4796,Il avait une de ces pâleurs splendides qui don...,C1,0.075758,0.082645,0.242424,0.475000
4797,"Et le premier samedi de chaque mois, venez ren...",A2,0.049242,0.049587,0.253247,0.555556
4798,Les coûts liés à la journalisation n'étant pas...,C2,0.117424,0.132231,0.372159,0.708333


## Testing and training the model 

In [25]:
# Select features
X = df[["sentence","raw_word_count","token_count","avg_chr_word","avg_chr_token"]]# the features we want to analyze

ylabels = df['difficulty'] # the labels, or answers, we want to test against

# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size=0.2, random_state=1234, stratify=ylabels)

X_train

Unnamed: 0_level_0,sentence,raw_word_count,token_count,avg_chr_word,avg_chr_token
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
962,Le réalisateur m'a d'abord demandé de me mettr...,0.037879,0.033058,0.272727,0.645833
1886,"Après quelques mois de cette pauvreté noble, a...",0.109848,0.115702,0.266667,0.511905
2721,L'indicateur n'était que de 40% chez les femme...,0.034091,0.024793,0.245455,0.638889
1025,L'objectif de ce type de voyage est d'être act...,0.094697,0.099174,0.241259,0.527778
4048,"Et, en France, beaucoup moins de filles que de...",0.083333,0.082645,0.276680,0.500000
...,...,...,...,...,...
3693,Je vais prendre ma douche dans ma salle-de-bain.,0.026515,0.033058,0.284091,0.604167
3408,"Après l'éruption de 1754, la plus grosse connu...",0.132576,0.123967,0.275253,0.561111
4289,Léonard est initié par Verrocchio aux nombreus...,0.117424,0.123967,0.369318,0.644444
3312,"On en trouve des exemples dans l'ouvrage ""L'in...",0.090909,0.123967,0.432727,0.694444


In [26]:
y_train

id
962     B1
1886    C1
2721    A2
1025    B1
4048    B2
        ..
3693    A1
3408    B1
4289    C2
3312    C2
269     A1
Name: difficulty, Length: 3840, dtype: object

In [27]:
from sklearn.compose import ColumnTransformer

# Define classifier
classifier = LogisticRegression(multi_class="multinomial",max_iter=1000)

#Vectorizer
tfidf_vector = TfidfVectorizer(tokenizer=spacy_tokenizer)

#Column Transformer (to apply vectorizer to the right column)
column_transformer = ColumnTransformer(
    [("tfidf", tfidf_vector, "sentence")],
    remainder="passthrough")

# Create pipeline
pipe = Pipeline([("tfidf",column_transformer),("classifier", classifier)])

# Fit model on training set
pipe.fit(X_train, y_train)

Pipeline(steps=[('tfidf',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('tfidf',
                                                  TfidfVectorizer(tokenizer=<function spacy_tokenizer at 0x7f8371167560>),
                                                  'sentence')])),
                ('classifier',
                 LogisticRegression(max_iter=1000, multi_class='multinomial'))])

In [28]:
# Evaluate the model
def evaluate(test, pred):
  precision = precision_score(test, pred,average=None)
  recall = recall_score(test, pred, average=None)
  f1= f1_score(test, pred, average=None)
  print(f'CONFUSION MATRIX:\n{confusion_matrix(test, pred)}')
  print(f"ACCURACY SCORE:\n{accuracy_score(test, pred) :.4f}")
  print(f'CLASSIFICATION REPORT:')
  print("Precision:\t {0:4f}".format(precision_score(test, pred,average="macro"))) 
  print("Recall:\t {0:4f}".format(recall_score(test, pred, average="macro")))
  print("F1_Score:\t {0:4f}".format(f1_score(test, pred, average="macro")))

In [29]:
# Predictions
y_pred = pipe.predict(X_test)

# Evaluation - test set
evaluate(y_test, y_pred)

CONFUSION MATRIX:
[[98 32 20 11  0  2]
 [51 61 36  6  4  1]
 [25 36 58 24  8  8]
 [ 6  6 24 69 29 24]
 [ 5  5  7 36 69 38]
 [ 4  0  7 26 30 94]]
ACCURACY SCORE:
0.4677
CLASSIFICATION REPORT:
Precision:	 0.465451
Recall:	 0.466911
F1_Score:	 0.464865
