[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1mUDx4uFpbS6jrD7lN-P7F7saefP_QJ31?usp=sharing)


In [None]:
# Import required packages
!python -m spacy download fr_core_news_sm
#import fr_core_news_sm
import spacy
from spacy import displacy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np 

In [None]:
# Import additional packages
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

import string
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
from spacy.lang.fr.stop_words import STOP_WORDS
from spacy.lang.fr.examples import sentences 
from spacy.lang.fr import French


In [None]:
# Load the labelled training set from the project's GitHub repository;
# the first CSV column is used as the DataFrame index.
path = "https://raw.githubusercontent.com/Lirette2/DMML2021_Apple/main/data/training_data.csv"

df = pd.read_csv(path, index_col=0)
# Preview the first rows
df.head()


In [None]:
# Summary of the training frame (columns, dtypes, non-null counts)
df.info()


In [None]:
# Class distribution: number of sentences per difficulty level
# (author's note: the data set is roughly balanced)
df.difficulty.value_counts()

In [None]:
# Bar chart of the number of (non-null) sentences per difficulty level
difficulty_count = df.groupby("difficulty").count()
fig, ax = plt.subplots()
ax.bar(difficulty_count.index.values, difficulty_count["sentence"])
ax.set_xlabel("Difficulty")
ax.set_ylabel("Number of Sentences")
plt.show()

In [None]:
# Base rate: share of the most frequent class, i.e. the accuracy obtained by always predicting it
round(df.difficulty.value_counts().max()/ len(df), 4)


### Tokenizing the data with spaCy


In [None]:
# Punctuation marks to filter out. Note: string.punctuation is a single string
# of ASCII punctuation characters, not a list.
punctuations = string.punctuation
punctuations

In [None]:
# French stop words shipped with spaCy
#stop_words = spacy.lang.en.stop_words.STOP_WORDS
stop_words = spacy.lang.fr.stop_words.STOP_WORDS

# Peek at ten of them
list(stop_words)[:10]

In [None]:
# Load French language model
# (fr_core_news_sm is the model requested by the `spacy download` command in the first cell)
import fr_core_news_sm
#sp = spacy.load('en_core_web_sm')
# `sp` is the pipeline object called on raw sentences by the functions below
sp = fr_core_news_sm.load()

# Create tokenizer function
def spacy_tokenizer(sentence):
    """Tokenize `sentence` with the spaCy pipeline `sp` and return cleaned lemmas.

    Each token is lemmatized, lowercased and stripped. Punctuation, whitespace
    tokens, empty strings and stop words are dropped.

    Parameters:
        sentence: raw text to tokenize.

    Returns:
        list of str: the remaining lemmas, in sentence order.
    """
    kept_lemmas = []
    for token in sp(sentence):
        # Rely on spaCy's own flags first: the previous `lemma not in punctuations`
        # check was a substring test against an ASCII-only string, so tokens such
        # as "..." or "«" slipped through.
        if token.is_punct or token.is_space:
            continue

        lemma = token.lemma_.lower().strip()
        if not lemma or lemma in stop_words:
            continue

        # Drop anything made only of ASCII punctuation characters that spaCy
        # did not flag as punctuation.
        if all(char in punctuations for char in lemma):
            continue

        kept_lemmas.append(lemma)

    # Return preprocessed list of tokens
    return kept_lemmas

# Sentences to tokenize
texts = df['sentence']

# Tokenize texts: one list of tokens per sentence
processed_texts = [spacy_tokenizer(text) for text in texts]

In [None]:
# Word embedding
# Train a Word2Vec model on the tokenized sentences.
# Parameters:
#     - sentences: IMPORTANT, must be a list of lists of tokens (processed_texts)
#     - size: dimension of the vectors representing the tokens
#     - min_count: minimum number of occurrences of a word in the corpus to be taken into account
from gensim.models import Word2Vec
word2vec = Word2Vec(
    sentences=processed_texts,
    size=100,
    min_count=2,
)
# Vocabulary learned by the model
vocab = word2vec.wv.vocab

In [None]:
#### **** ***************** **** ####
#### **** RAW TEXT FEATURES **** ####
#### **** ***************** **** ####

# Count tokens per sentence
def count_token(sent):
  """Return how many tokens spacy_tokenizer() keeps for `sent`."""
  tokens = spacy_tokenizer(sent)
  return len(tokens)

# Count raw words per sentence
def count_words(sent):
  """Return the number of whitespace-separated words in `sent`."""
  raw_words = sent.split()
  return len(raw_words)

# Total number of characters in a sentence, whitespace excluded
def count_sentence_character(sent):
  """Return the summed length of all whitespace-separated words in `sent`."""
  total_chars = 0
  for word in sent.split():
    total_chars += len(word)
  return total_chars

#Get average character length of word
def count_avg_word_character(sent):
  """Return the mean length of the whitespace-separated words in `sent`.

  Returns 0 for an empty or whitespace-only sentence instead of raising
  ZeroDivisionError, the same convention as count_avg_token_character().
  """
  words = sent.split()
  if not words:
    return 0
  return sum(len(word) for word in words) / len(words)

def count_avg_token_character(sent):
  """Return the mean length of the tokens spacy_tokenizer() keeps for `sent`.

  Returns 0 when no token is left after tokenization.
  """
  tokens = spacy_tokenizer(sent)
  if not tokens:
    return 0
  total_chars = 0
  for token in tokens:
    total_chars += len(token)
  return total_chars / len(tokens)

#Get min character length of word
def count_min_word_character(sent):
  """Return the length of the shortest whitespace-separated word in `sent`.

  Returns 0 for an empty or whitespace-only sentence instead of raising
  ValueError from min() on an empty sequence.
  """
  return min((len(word) for word in sent.split()), default=0)

#Get max character length of word
def count_max_word_character(sent):
  """Return the length of the longest whitespace-separated word in `sent`.

  Returns 0 for an empty or whitespace-only sentence instead of raising
  ValueError from max() on an empty sequence.
  """
  return max((len(word) for word in sent.split()), default=0)

In [None]:
#### **** **************** **** ####
#### **** LEXICAL FEATURES **** ####
#### **** **************** **** ####

# Lexical Diversity
def lex_div_word(sent):
  """Return the ratio of distinct words to total words in `sent`.

  Words are whitespace-separated and compared case-sensitively.
  Returns 0 for an empty or whitespace-only sentence instead of raising
  ZeroDivisionError.
  """
  words = sent.split()  # split once and reuse for both counts
  if not words:
    return 0
  return len(set(words)) / len(words)
# This is not applied to tokens: the tokenization is meant to leave mostly
# unique tokens per sentence.
# For tokens, diversity should be measured over the whole text, as tfidf_vector does.

# Lexical Density
def lex_den_tokens(sent):
  """Return the share of content words among the tokens of `sent`.

  The tokens kept by spacy_tokenizer() are joined back into one string and
  re-parsed with `sp`; every parsed token tagged NOUN, ADJ, VERB or ADV counts
  as a content word. The count is divided by the number of tokens returned by
  spacy_tokenizer(). Returns 0 when no token is left.
  """
  tokens = spacy_tokenizer(sent)
  if not tokens:
    return 0
  content_tags = ("NOUN", "ADJ", "VERB", "ADV")
  rejoined = " ".join(str(item) for item in tokens)
  content_count = sum(1 for parsed in sp(rejoined) if parsed.pos_ in content_tags)
  return content_count / len(tokens)

def lex_den_words(sent):
  """Return the share of content words among all spaCy tokens of the raw `sent`.

  A token counts as a content word when its part-of-speech tag is NOUN, ADJ,
  VERB or ADV. Returns 0 when the parsed document has no tokens (e.g. an empty
  sentence) instead of raising ZeroDivisionError, like lex_den_tokens().
  """
  doc = sp(sent)
  if len(doc) == 0:
    return 0
  content_tags = {"NOUN", "ADJ", "VERB", "ADV"}
  content_count = sum(1 for token in doc if token.pos_ in content_tags)
  return content_count / len(doc)

# Words NOT in frequent list
# Reference word list loaded from the project repository; the functions below
# compare sentence words against its "Mots" column.
path = "https://raw.githubusercontent.com/Lirette2/DMML2021_Apple/main/data/list_words.csv"
words = pd.read_csv(path, index_col=0)


def words_list(sent):
  """Return the share of distinct words of `sent` that are NOT in the reference list.

  Words are whitespace-separated and compared, exactly as written, against
  the `Mots` column of the module-level `words` frame.
  Returns 0 for an empty or whitespace-only sentence instead of raising
  ZeroDivisionError.
  """
  unique = set(sent.split())
  if not unique:
    return 0
  # A set gives constant-time lookups instead of scanning the whole list per word.
  reference_words = set(words.Mots)
  counter = len(unique & reference_words)
  return 1 - (counter / len(unique))


def token_list(sent):
  """Return the share of tokens of `sent` that are NOT in the reference list.

  Tokens come from spacy_tokenizer() and are compared against the `Mots`
  column of the module-level `words` frame. Repeated tokens are counted each
  time they occur. Returns 0 when no token is left after tokenization.
  """
  tokens = spacy_tokenizer(sent)
  if len(tokens) == 0:
    return 0
  # A set gives constant-time lookups instead of scanning the whole list per token.
  reference_words = set(words.Mots)
  counter = sum(1 for token in tokens if token in reference_words)
  return 1 - (counter / len(tokens))

#Word Embedding:
def get_vector(sent):
  """Return one scalar embedding feature for `sent`.

  The value is the mean of all components of the Word2Vec vectors of every
  token of `sent` that is in the model's vocabulary. The previous version
  returned from inside the loop, so only the first in-vocabulary token was
  ever used.

  Returns None when no token of the sentence is in the vocabulary; callers
  replace the resulting missing values with fillna(0).
  """
  keyed_vectors = word2vec.wv  # index through .wv; indexing the model itself is deprecated
  vectors = [keyed_vectors[token] for token in spacy_tokenizer(sent) if token in keyed_vectors]
  if not vectors:
    return None
  return np.mean(vectors)

In [None]:
def get_features(data_to_process):
  """Return `data_to_process` with one extra column per engineered feature.

  Every feature function is applied to the `sentence` column, and the
  resulting Series are appended column-wise after the original columns.
  """
  # (column name, feature function) pairs, in the order the columns are appended
  feature_specs = [
    ("raw_word_count", count_words),
    ("token_count", count_token),
    ("avg_chr_word", count_avg_word_character),
    ("min_chr_word", count_min_word_character),
    ("max_chr_word", count_max_word_character),
    ("tot_chr_stn", count_sentence_character),
    ("avg_chr_token", count_avg_token_character),
    ("diversity_word", lex_div_word),
    ("density_word", lex_den_words),
    ("density_token", lex_den_tokens),
    ("freq_word_list", words_list),
    ("freq_token_list", token_list),
    ("token_wv", get_vector),
  ]
  feature_columns = [
    pd.Series(data_to_process.sentence.apply(feature_fn), name=column_name)
    for column_name, feature_fn in feature_specs
  ]
  processed_df = pd.concat([data_to_process] + feature_columns, axis=1)
  return processed_df

def scale_data(df_to_scale, scaler=None, fit=True):
  """Min-max scale the count/length feature columns and return the scaled frame.

  Only the columns that are not already ratios between 0 and 1 are scaled.
  The input frame is left untouched; a scaled copy is returned.

  Parameters:
    df_to_scale: DataFrame containing the columns listed in `col_to_scale`.
    scaler: optional MinMaxScaler to use. When None (default) a new scaler is
      created and fitted on `df_to_scale`, which is the original behaviour.
    fit: when True (default) the scaler is fitted on `df_to_scale` before
      transforming. Pass an already-fitted `scaler` with fit=False to apply
      the training-set scaling to new data (e.g. the submission set).

  Returns:
    DataFrame: copy of `df_to_scale` with the selected columns scaled.
  """
  col_to_scale = ["raw_word_count","token_count","avg_chr_word","min_chr_word","max_chr_word","tot_chr_stn","avg_chr_token",]
  #no need for the others because they already are on a scale from 0 to 1
  if scaler is None:
    scaler = MinMaxScaler()
    fit = True  # a brand-new scaler cannot transform before being fitted
  scaled_df = df_to_scale.copy()  # do not mutate the caller's frame
  if fit:
    scaled_df[col_to_scale] = scaler.fit_transform(scaled_df[col_to_scale])
  else:
    scaled_df[col_to_scale] = scaler.transform(scaled_df[col_to_scale])
  return scaled_df


In [None]:
# Compute every engineered feature for the training sentences
# (each feature function is applied to the whole `sentence` column)
new_df = get_features(df)


In [None]:
# Min-max scale the count/length features
new_df = scale_data(new_df)
# get_vector() can return no value for a sentence; replace those missing values with 0
new_df["token_wv"] = new_df["token_wv"].fillna(0)
# Sanity check: number of missing values left in token_wv
new_df.token_wv.isna().sum()

In [None]:
# Select features
# the features we want to analyze: the raw sentence plus the engineered columns
feature_columns = [
    "sentence",
    "raw_word_count", "token_count",
    "avg_chr_word", "min_chr_word", "max_chr_word",
    "tot_chr_stn", "avg_chr_token",
    "diversity_word", "density_word", "density_token",
    "freq_word_list", "freq_token_list",
    "token_wv",
]
X = new_df[feature_columns]

# the labels, or answers, we want to test against
ylabels = new_df['difficulty']

# Train test split: 20% held out, stratified on the difficulty label
X_train, X_test, y_train, y_test = train_test_split(
    X,
    ylabels,
    test_size=0.2,
    stratify=ylabels,
    random_state=1234,
)

X_train

In [None]:
# Display the training labels
y_train


In [None]:
# Evaluate the model
def evaluate(test, pred):
  """Print the confusion matrix, accuracy and macro-averaged precision/recall/F1.

  Parameters:
    test: true labels.
    pred: predicted labels, in the same order as `test`.

  Returns:
    None; the metrics are only printed.
  """
  print(f'CONFUSION MATRIX:\n{confusion_matrix(test, pred)}')
  print(f"ACCURACY SCORE:\n{accuracy_score(test, pred) :.4f}")
  print(f'CLASSIFICATION REPORT:')
  # ".4f" = four decimals, consistent with the accuracy line
  # (the previous "4f" meant minimum width 4 with six decimals).
  print("Precision:\t {0:.4f}".format(precision_score(test, pred, average="macro")))
  print("Recall:\t {0:.4f}".format(recall_score(test, pred, average="macro")))
  print("F1_Score:\t {0:.4f}".format(f1_score(test, pred, average="macro")))

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report


# Define classifier
classifier = LogisticRegression(multi_class="multinomial",max_iter=1000)

#Vectorizer: TF-IDF over the tokens produced by spacy_tokenizer
tfidf_vector = TfidfVectorizer(tokenizer=spacy_tokenizer)

#Column Transformer (applies the vectorizer to the "sentence" column only;
#the remaining feature columns are passed through unchanged)
column_transformer = ColumnTransformer(
    [("tfidf", tfidf_vector, "sentence")],
    remainder="passthrough")

# Create pipeline
pipe = Pipeline([("tfidf",column_transformer),("classifier", classifier)])

# Fit model on training set
# (X_train / X_test come from the train/test split above; the previously used
# X_train_full2 / X_test_full2 are not defined anywhere in this notebook)
pipe.fit(X_train, y_train)
# Predictions
y_pred = pipe.predict(X_test)

# Evaluation - test set
evaluate(y_test, y_pred)

In [None]:
#Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel
#Create a random forest classifier (seeded so that re-running the cell gives the same forest)
random_for=RandomForestClassifier(n_estimators=100, random_state=1234)

# TF-IDF + engineered features -> feature selection driven by a linear SVM -> random forest
pipe_rf = Pipeline([("tfidf",column_transformer),('feature_selection',SelectFromModel(LinearSVC(penalty="l2"))),("model", random_for)])
#Train the model using the training set
# (X_train / X_test come from the train/test split above; the previously used
# X_train_full2 / X_test_full2 are not defined anywhere in this notebook)
pipe_rf.fit(X_train,y_train)

# Predict and evaluate on the held-out test set
y_pred=pipe_rf.predict(X_test)
evaluate(y_test, y_pred)


In [None]:
from sklearn.svm import LinearSVC


#Train the model: linear SVM on TF-IDF + engineered features
lsvc = LinearSVC(verbose=0)

pipe_lsvc = Pipeline([("tfidf",column_transformer),("model", lsvc)])
# (X_train / X_test come from the train/test split above; the previously used
# X_train_full2 / X_test_full2 are not defined anywhere in this notebook)
pipe_lsvc.fit(X_train,y_train)
y_pred=pipe_lsvc.predict(X_test)
evaluate(y_test, y_pred)


In [None]:
### Submission data
# Load the unlabelled sentences to predict; the first CSV column is used as the index

path = "https://raw.githubusercontent.com/Lirette2/DMML2021_Apple/main/data/unlabelled_test_data.csv"
sub_df = pd.read_csv(path, index_col=0)

# Compute the same engineered features as for the training data
new_sub_df = get_features(sub_df)

In [None]:
# Scale the submission features and fill missing embedding values with 0.
# NOTE(review): scale_data() fits a new MinMaxScaler on this frame, so the scaling
# ranges come from the submission data, not from the training data - confirm this is intended.
new_sub_df = scale_data(new_sub_df)
new_sub_df["token_wv"] = new_sub_df["token_wv"].fillna(0)
#new_sub_df.token_wv.isna().sum()

In [None]:
# Same feature columns, in the same order, as the training matrix X
X_sub = new_sub_df[["sentence","raw_word_count","token_count","avg_chr_word","min_chr_word","max_chr_word","tot_chr_stn","avg_chr_token",
                    "diversity_word","density_word","density_token","freq_word_list",
            "freq_token_list","token_wv"]]

In [None]:
# Predict the difficulty of the submission sentences with the logistic-regression pipeline `pipe`
y_sub = pipe.predict(X_sub)


In [None]:
# Attach the predictions to the submission frame.
# The predictions are stored in `y_sub`; the previously referenced `y_pred_sub`
# is not defined anywhere in this notebook.
new_sub_df["difficulty"] = y_sub
# Keep only the columns needed for the submission file
# NOTE(review): the first CSV column was loaded as the index, so "id" is presumably
# the index name rather than a column - confirm the exported file has the expected layout.
submission = new_sub_df.filter(["id","difficulty"],axis=1)
submission

In [None]:
# Write the submission to CSV and download it through the browser (Google Colab only)
from google.colab import files
submission.to_csv('submission_18_apple_unil.csv') 
files.download('submission_18_apple_unil.csv')

### Doc2Vec 

Here I ran into some trouble, mostly because I tried to combine the existing regressors with the new Doc2Vec model. For now, I keep the Doc2Vec model separate, without the other regressors and without producing a submission.

In [None]:
#Doc2Vec
# Not sure how correct this procedure is
#I'm following: https://towardsdatascience.com/multi-class-text-classification-with-doc2vec-logistic-regression-9da9947b43f4

# TaggedDocument was used below without ever being imported
from gensim.models.doc2vec import TaggedDocument

#Multiprocessing, this allows code to run faster because it uses all the CPU available
import multiprocessing
cores = multiprocessing.cpu_count()

# Getting the texts with the correct difficulty tags:
# one TaggedDocument per row, with the tokenized sentence as words and the difficulty label as its tag
texts_tagged = df.apply(
    lambda r: TaggedDocument(words=spacy_tokenizer(r.sentence), tags=[r.difficulty]), axis=1)

In [None]:
#Doc2Vec: Part 2
# Doc2Vec was used below without ever being imported
from gensim.models.doc2vec import Doc2Vec

#We now build the vocabulary
# dm=0 selects the distributed bag-of-words (DBOW) training mode
model_dbow = Doc2Vec(dm=0, vector_size=35, negative=6, hs=0, min_count=1, sample = 0, workers=cores)
model_dbow.build_vocab([x for x in texts_tagged.values])

In [None]:
# Train model_dbow on all tagged sentences, using the corpus size and epoch count stored on the model
model_dbow.train(texts_tagged, total_examples=model_dbow.corpus_count, epochs=model_dbow.epochs)


In [None]:
#Doc2Vec: Part 3

def vec_for_learning(model, tagged_docs):
    """Build classifier inputs from tagged documents.

    For every document in `tagged_docs.values`, the first tag is taken as the
    target and a vector is inferred from the document's words with
    `model.infer_vector(..., steps=20)`.

    Returns:
        (targets, regressors): two tuples aligned by position.
    """
    pairs = []
    for doc in tagged_docs.values:
        inferred = model.infer_vector(doc.words, steps=20)
        pairs.append((doc.tags[0], inferred))
    # Transpose the (target, vector) pairs into two parallel tuples
    targets, regressors = zip(*pairs)
    return targets, regressors

In [None]:
# Split the tagged documents 80/20 (same random_state as the feature-based split, but not stratified)
# NOTE(review): model_dbow was trained on all of texts_tagged before this split, so the
# held-out documents were seen during Doc2Vec training - confirm this is acceptable.
train_tagged, test_tagged = train_test_split(texts_tagged, test_size=0.2, random_state=1234)

# Infer one vector per document.
# These assignments overwrite X_train / X_test / y_train / y_test from the feature-based split above.
y_train, X_train = vec_for_learning(model_dbow, train_tagged)
y_test, X_test = vec_for_learning(model_dbow, test_tagged)

In [None]:
from sklearn.svm import LinearSVC

#Train the model: linear SVM on the inferred Doc2Vec vectors only
lsvc = LinearSVC(verbose=0)

# Fit model on training set
lsvc.fit(X_train, y_train)
# Predictions
y_pred = lsvc.predict(X_test)

# Evaluation - test set
evaluate(y_test, y_pred)
# Scores noted by the author from earlier runs (presumably accuracy - TODO confirm):
#0.6615
#0.6594 # vector_size = 35,neg=6,min_count=2

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report


# Define classifier: multinomial logistic regression on the inferred Doc2Vec vectors only
classifier = LogisticRegression(multi_class="multinomial",max_iter=1000, solver="lbfgs")

# Fit model on training set
classifier.fit(X_train, y_train)
# Predictions
y_pred = classifier.predict(X_test)

# Evaluation - test set
evaluate(y_test, y_pred)