# Project: Nestle

Here are the important libraries that we will use

In [None]:
# Install and update spaCy
!pip install -U spacy

# Download the french language model
!python -m spacy download fr

# Libraries:

import pandas as pd
import spacy
import string
from spacy.lang.fr.stop_words import STOP_WORDS

[38;5;3m⚠ As of spaCy v3.0, shortcuts like 'fr' are deprecated. Please use the
full pipeline package name 'fr_core_news_sm' instead.[0m
Collecting fr-core-news-sm==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.2.0/fr_core_news_sm-3.2.0-py3-none-any.whl (17.4 MB)
[K     |████████████████████████████████| 17.4 MB 463 kB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('fr_core_news_sm')


## Data Preparation

First, we read the Training Data using the download link.

In [None]:
# Read the data:

# Signed URL used to download the training data (it carries an `Expires` parameter, so it must be refreshed once it has expired):

url = "https://storage.googleapis.com/kagglesdsdata/competitions/32066/2798629/training_data.csv?GoogleAccessId=web-data@kaggle-161607.iam.gserviceaccount.com&Expires=1638871683&Signature=BxIwsdE%2BQaVaP44mJZiajFAd05%2F2RADSA4xIKEqvZsO2ulQbUJL5TIBs5jx88hatetORhWF9annOzgcuLJYV5Do2qAcBUfz1whR60FQHgpjv%2F7X3orM8UPG2w7ZhqC0vd3fRpFWdfI%2FqZnGg6CSySAFth8Yultup9ANQl6EMy59y91dl3uC6OYgmiHHHOfOCrU6LJoez2BCER8l1rY1VIKUwJogQhx%2BNDP7kCdDzsLYmkGzwqAn49sLqulJLw%2F7t7Bm5BJclauqXS8FJzXDWzB4S1ndG43qMnQJpFHP8hc2ndPiQX8q1nZNN8NudQjZJjVK4xr1lEm56vZqGRhYrZA%3D%3D&response-content-disposition=attachment%3B+filename%3Dtraining_data.csv"
# Parse the comma-separated training file straight from the download link.
Training_Data = pd.read_csv(url, sep=",")

# Preview the first two rows.
Training_Data.head(n=2)

Unnamed: 0,id,sentence,difficulty
0,0,Les coûts kilométriques réels peuvent diverger...,C1
1,1,"Le bleu, c'est ma couleur préférée mais je n'a...",A1


We print some information about the training data to check its structure and completeness.

In [None]:
Training_Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4800 entries, 0 to 4799
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          4800 non-null   int64 
 1   sentence    4800 non-null   object
 2   difficulty  4800 non-null   object
dtypes: int64(1), object(2)
memory usage: 112.6+ KB


In [None]:
# Base rate: number of sentences per difficulty level.
# The counts are close to each other, i.e. the data set is balanced.
Training_Data['difficulty'].value_counts()

A1    813
C2    807
C1    798
B1    795
A2    795
B2    792
Name: difficulty, dtype: int64

Then we read the test data, again using its download link.

In [None]:
url = "https://storage.googleapis.com/kagglesdsdata/competitions/32066/2798629/unlabelled_test_data.csv?GoogleAccessId=web-data@kaggle-161607.iam.gserviceaccount.com&Expires=1638891443&Signature=Yq3ALBqeX%2BRm7bWum%2FoP%2FCSO3s4iW8mcJfp07Gm11sUWLdennLm6wKgoeI%2FsCc3grJSl7cj%2B9O1JbP2l7gxwMUMH2o0zysO9zPXZODpQCG73sRgurwfRbJqQ3C%2Fv8zVCJ0D1TKaBC8vH2Mn%2FrhmFUm58RerTIk%2FdFmBEScD6nyLN591GA3gAY04NIYCLH34pRCRQ4uwYvDra%2FxYH7ABvLQ8tOwloYXHBkq%2BkssqXUcVhypJS9USj4p%2FkFWj5GwgVf8Vf3ytj2FXDTf0GBw0IEMT2bVHQRE8T1ZWc8%2BLqmdRMqA%2BwA%2BlTCUyua9FyQNokxrEd3j%2FzBNIXf8ZGD3JTDQ%3D%3D&response-content-disposition=attachment%3B+filename%3Dunlabelled_test_data.csv"
# Parse the comma-separated, unlabelled test file straight from the download link.
Test_Data = pd.read_csv(url, sep=",")

# Preview the first two rows.
Test_Data.head(n=2)

Unnamed: 0,id,sentence
0,0,Nous dûmes nous excuser des propos que nous eû...
1,1,Vous ne pouvez pas savoir le plaisir que j'ai ...


## Tokenizing the Data with spaCy

In [None]:
# We create a list of punctuation marks
punctuations = string.punctuation
punctuations

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [None]:
# French stop words shipped with spaCy (STOP_WORDS is imported at the top of the notebook)
stop_words = STOP_WORDS

# Uncomment to inspect the full collection:
# list(stop_words)

In [None]:
# Let's implement the tokenizer function:

# Load French language model:
sp = spacy.load('fr_core_news_sm')

# Tokenizer function:
def spacy_tokenizer(sentence):
    """Turn a raw sentence into a list of cleaned, lemmatized tokens.

    The sentence is parsed with the module-level spaCy pipeline ``sp``. Each
    token is lemmatized, lower-cased and stripped, then discarded when it is
    punctuation or a stop word (``stop_words``). Anonymisation placeholders
    built from ``xx`` are scrubbed from the remaining lemmas.

    Parameters
    ----------
    sentence : str
        Raw text to tokenize.

    Returns
    -------
    list of str
        The cleaned lemmas, in the order they appear in the sentence.
    """
    cleaned = []

    # Create the spaCy document, which carries the linguistic annotations.
    for token in sp(sentence):
        # Use spaCy's own punctuation flag so that multi-character and
        # non-ASCII marks (e.g. "...", "«", "»", "…") are removed as well.
        if token.is_punct:
            continue

        # Lemmatize the token and normalise it to lowercase.
        lemma = token.lemma_.lower().strip()

        # Remove stop words.
        if lemma in stop_words:
            continue

        # Remove lemmas made up solely of characters from `punctuations`.
        # This is a per-character test: `lemma in punctuations` would be a
        # substring test against the whole punctuation string. `all` of an
        # empty string is True, so lemmas that were only whitespace go too.
        if all(char in punctuations for char in lemma):
            continue

        # Remove anonymous dates and people.
        lemma = lemma.replace('xx/', '').replace('xxxx/', '').replace('xx', '')

        # After the scrubbing above no "xx"/"xxxx" can remain, so only a
        # lemma that became empty still has to be filtered out.
        if lemma:
            cleaned.append(lemma)

    # Return preprocessed list of tokens
    return cleaned

## Vectorization and Feature Engineering (TF-IDF)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vector = TfidfVectorizer(tokenizer=spacy_tokenizer)

## Classification of the sentences using Logistic Regression

In [None]:
# Select features
X_train = Training_Data['sentence'] # the features we want to analyze
y_train = Training_Data['difficulty'] # the labels, or answers, we want to test against

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
# Logistic regression with scikit-learn's default settings
classifier = LogisticRegression()

# Chain the TF-IDF vectorizer and the classifier so that raw sentences
# can be passed directly to fit / predict
pipe = Pipeline(steps=[
    ('vectorizer', tfidf_vector),
    ('classifier', classifier),
])

# Fit the whole pipeline on the labelled training sentences
pipe.fit(X_train, y_train)

Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(tokenizer=<function spacy_tokenizer at 0x7f217f4f2a70>)),
                ('classifier', LogisticRegression())])

Then we do the prediction with our model.

In [None]:
X_test = Test_Data["sentence"]
y_LogRegPred = pipe.predict(X_test)

In [None]:
LogisticReg_pred = pd.DataFrame(y_LogRegPred)

In [None]:
LogisticReg_pred.columns = ['difficulty']
# Take the ids from the test set itself rather than assuming it has exactly
# 1200 rows; `.values` bypasses index alignment, so ids follow row order.
LogisticReg_pred.insert(0, 'id', Test_Data['id'].values)

In [None]:
LogisticReg_pred.to_csv('LogisticReg_prediction.csv', header = True, index = False)

## Classification of the sentences using a k-Nearest Neighbors classifier

In [None]:
# Select features
X_train = Training_Data['sentence'] # the features we want to analyze
y_train = Training_Data['difficulty'] # the labels, or answers, we want to test against

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
# k-nearest-neighbours classifier that votes among the 6 closest sentences
classifier = KNeighborsClassifier(n_neighbors=6)

# Chain the TF-IDF vectorizer and the classifier so that raw sentences
# can be passed directly to fit / predict
pipe = Pipeline(steps=[
    ('vectorizer', tfidf_vector),
    ('classifier', classifier),
])

# Fit the whole pipeline on the labelled training sentences
pipe.fit(X_train, y_train)

Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(tokenizer=<function spacy_tokenizer at 0x7f217f4f2a70>)),
                ('classifier', KNeighborsClassifier(n_neighbors=6))])

In [None]:
X_test = Test_Data['sentence']
y_KnnPred = pipe.predict(X_test)

In [None]:
Knn_pred = pd.DataFrame(y_KnnPred)

In [None]:
Knn_pred.columns = ['difficulty']
# Take the ids from the test set itself rather than assuming it has exactly
# 1200 rows; `.values` bypasses index alignment, so ids follow row order.
Knn_pred.insert(0, 'id', Test_Data['id'].values)

In [None]:
Knn_pred.to_csv('KNN_prediction.csv', header = True, index = False)

## Classification of the sentences using a Decision Tree classifier

In [None]:
# Select features
X_train = Training_Data['sentence'] # the features we want to analyze
y_train = Training_Data['difficulty'] # the labels, or answers, we want to test against

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Define classifier.
# A single estimator is created and it is the one that gets fitted. The seed
# is fixed so that re-running the notebook yields the same tree; the depth is
# left unlimited.
classifier = DecisionTreeClassifier(random_state=72)

# Create pipeline
pipe = Pipeline([('vectorizer', tfidf_vector),
                 ('classifier', classifier)])

# Fit model on training set
pipe.fit(X_train, y_train)

Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(tokenizer=<function spacy_tokenizer at 0x7f217f4f2a70>)),
                ('classifier', DecisionTreeClassifier())])

In [None]:
X_test = Test_Data['sentence']

In [None]:
y_DecisionTreePred = pipe.predict(X_test)

In [None]:
DecisionTree_pred = pd.DataFrame(y_DecisionTreePred)

In [None]:
DecisionTree_pred.columns = ['difficulty']
# Take the ids from the test set itself rather than assuming it has exactly
# 1200 rows; `.values` bypasses index alignment, so ids follow row order.
DecisionTree_pred.insert(0, 'id', Test_Data['id'].values)

In [None]:
DecisionTree_pred.to_csv('DecisionTree_prediction.csv', header = True, index = False)

# To improve the model, we try Doc2Vec

In [None]:
# Tokenize data - same tokenizer function as before
from gensim.models.doc2vec import TaggedDocument

# These names refer to the very same frames as above (no copy is made).
Training_DataDoc2Vec = Training_Data
Test_DataDoc2Vec = Test_Data

# One TaggedDocument per training row: the tokenized sentence as `words`
# and the row's difficulty label as its only tag.
training_tagged = Training_DataDoc2Vec.apply(
    lambda row: TaggedDocument(
        words=spacy_tokenizer(row['sentence']),
        tags=[row['difficulty']],
    ),
    axis=1,
)

In [None]:
test_tagged = Test_DataDoc2Vec.apply(lambda r: TaggedDocument(words=spacy_tokenizer(r['sentence']), tags = None), axis=1)

In [None]:
# Allows to speed up a bit
import multiprocessing
cores = multiprocessing.cpu_count()

In [None]:
# Define Doc2Vec and build vocabulary
from gensim.models import Doc2Vec

# dm=0 selects the distributed bag-of-words training mode.
# The number of training passes is given by `epochs` (plural); the singular
# `epoch` is not a Doc2Vec parameter, so the 300 passes would not be applied.
model_dbow = Doc2Vec(dm=0, vector_size=30, negative=6, hs=0, min_count=1, sample=0, workers=cores, epochs=300)

# The vocabulary is built from the tagged training documents only.
model_dbow.build_vocab(list(training_tagged.values))

In [None]:
# Train distributed Bag of Word model
model_dbow.train(training_tagged, total_examples=model_dbow.corpus_count, epochs=model_dbow.epochs)

In [None]:
# Select X and y
def vec_for_learning(model, tagged_docs):
    """Build the labels and feature vectors used to train a classifier.

    For every document in ``tagged_docs.values`` the first tag is taken as
    the target and ``model.infer_vector`` (called with ``steps=100``) gives
    the feature vector.

    Returns a pair ``(targets, regressors)`` of equally long tuples.
    """
    labelled_vectors = []
    for document in tagged_docs.values:
        label = document.tags[0]
        vector = model.infer_vector(document.words, steps=100)
        labelled_vectors.append((label, vector))

    # Transpose the (label, vector) pairs into two parallel tuples.
    targets, regressors = zip(*labelled_vectors)
    return targets, regressors

y_train, X_train = vec_for_learning(model_dbow, training_tagged)

In [None]:
# Fit model on training set
logreg = LogisticRegression(max_iter=1000, solver='lbfgs')
logreg.fit(X_train, y_train)

LogisticRegression(max_iter=1000)

In [None]:
# To prepare the test data
def vec_for_predicting(model, tagged_docs):
    """Build the feature vectors for documents that have no label.

    For every document in ``tagged_docs.values``, ``model.infer_vector``
    (called with ``steps=100``) gives the feature vector.

    Returns a list with one inferred vector per document, in input order.
    """
    regressors = []
    for document in tagged_docs.values:
        regressors.append(model.infer_vector(document.words, steps=100))
    return regressors

In [None]:
X_test = vec_for_predicting(model_dbow, test_tagged)

In [None]:
y_doc2vecpred = logreg.predict(X_test)

In [None]:
doc2vec_pred = pd.DataFrame(y_doc2vecpred)

In [None]:
doc2vec_pred.columns = ['difficulty']
# Take the ids from the test set itself rather than assuming it has exactly
# 1200 rows; `.values` bypasses index alignment, so ids follow row order.
doc2vec_pred.insert(0, 'id', Test_Data['id'].values)

In [None]:
doc2vec_pred.to_csv('doc2vec_prediction.csv', header = True, index = False)

# To improve our model, we decided to try a one-hot encoder

## Update: I don't think it is possible to use a one-hot encoder for this kind of text analysis.

# Draft

In [None]:
#from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
# Evaluate the model
#def evaluate(true, pred):
#    precision = precision_score(true, pred, average='micro')
#    recall = recall_score(true, pred, average='micro')
#    f1 = f1_score(true, pred, average='micro')
#    print(f"CONFUSION MATRIX:\n{confusion_matrix(true, pred)}")
#    print(f"ACCURACY SCORE:\n{accuracy_score(true, pred):.4f}")
#    print(f"CLASSIFICATION REPORT:\n\tPrecision: {precision:.4f}\n\tRecall: {recall:.4f}\n\tF1_Score: {f1:.4f}")

In [None]:
# Predictions
#y_pred = pipe.predict(X_test)

# Evaluation - test set
#evaluate(y_test, y_pred)