# Cleaning

## Install spaCy

In [None]:
# we update and install spaCy
!pip install -U spacy


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Download the small French spaCy pipeline; it is loaded later with
# spacy.load('fr_core_news_sm').
!python -m spacy download fr_core_news_sm


2022-12-22 03:10:32.170339: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting fr-core-news-sm==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.4.0/fr_core_news_sm-3.4.0-py3-none-any.whl (16.3 MB)
[K     |████████████████████████████████| 16.3 MB 147 kB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('fr_core_news_sm')


In [None]:
import spacy
# Load the French spaCy pipeline (NLP, "TALN" in French) into `nlp`.
# NOTE(review): the tokenizer cell loads the same model again as `sp`, and
# `nlp` is not referenced anywhere else in the visible notebook.
nlp = spacy.load('fr_core_news_sm')

## 0. Load data

In [None]:
# Mount Google Drive at /content/drive; the CSV files read in the next cell
# live under that mount point.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np

# Labelled training set and the unlabelled set to predict, read from the mounted Drive.
# NOTE(review): the absolute Drive paths are hardcoded; adjust them if the files live elsewhere.
df_train = pd.read_csv("/content/drive/MyDrive/DDML/training_data.csv")
df_pred = pd.read_csv("/content/drive/MyDrive/DDML/unlabelled_test_data.csv")
# Preview the first rows of the unlabelled data.
df_pred.head()

Unnamed: 0,id,sentence
0,0,Nous dûmes nous excuser des propos que nous eû...
1,1,Vous ne pouvez pas savoir le plaisir que j'ai ...
2,2,"Et, paradoxalement, boire froid n'est pas la b..."
3,3,"Ce n'est pas étonnant, car c'est une saison my..."
4,4,"Le corps de Golo lui-même, d'une essence aussi..."


## 1. Normalizing Text

**Clean "-" & " ' "**

source: https://monkeylearn.com/blog/text-cleaning/#:~:text=Text%20cleaning%20can%20be%20performed,words%20to%20their%20root%20form.&text=You'd%20need%20to%20perform,Removing%20Stopwords

In [None]:
# https://monkeylearn.com/blog/text-cleaning/#:~:text=Text%20cleaning%20can%20be%20performed,words%20to%20their%20root%20form.&text=You'd%20need%20to%20perform,Removing%20Stopwords
import re

# Swap apostrophes and dashes for spaces so that elided and hyphenated forms
# ("c'est", "lui-même") are split into separate words.
# Both patterns are applied as regular expressions across the whole frame.
df_train = (
    df_train
    .replace("'", " ", regex=True)
    .replace("-", " ", regex=True)
)
df_train.head()


Unnamed: 0,id,sentence,difficulty
0,0,Les coûts kilométriques réels peuvent diverger...,C1
1,1,"Le bleu, c est ma couleur préférée mais je n a...",A1
2,2,Le test de niveau en français est sur le site ...,A1
3,3,Est ce que ton mari est aussi de Boston?,A1
4,4,"Dans les écoles de commerce, dans les couloirs...",B1


## 2. Tokenizer function

source : https://python.plainenglish.io/text-classification-using-python-spacy-7a414abcc83a

In [None]:
import string
import nltk
from nltk.stem import WordNetLemmatizer
sp = spacy.load('fr_core_news_sm')

# Define tokenizer function
def spacy_token(sentence, nlp=None, stop_words=None):
    """Tokenize a sentence into lowercase lemma strings.

    The sentence is run through a spaCy pipeline. Punctuation, whitespace
    and stop words are discarded, and every remaining token is replaced by
    its lowercased lemma.

    Parameters
    ----------
    sentence : str
        Raw text to tokenize.
    nlp : callable, optional
        Pipeline mapping a string to an iterable of tokens that expose
        ``text``, ``lemma_``, ``is_punct`` and ``is_space``. Defaults to the
        notebook-level French pipeline ``sp``.
    stop_words : collection of str, optional
        Lowercase words to drop. Defaults to spaCy's French stop-word list.

    Returns
    -------
    list of str
        Lemmas in sentence order. Plain strings are returned (not spaCy
        ``Token`` objects) so the result can be consumed by a scikit-learn
        vectorizer.
    """
    if nlp is None:
        nlp = sp
    if stop_words is None:
        # Import the submodule explicitly instead of relying on it having
        # been loaded as a side effect of spacy.load().
        from spacy.lang.fr.stop_words import STOP_WORDS
        stop_words = STOP_WORDS

    # ASCII punctuation, used to catch symbol-only tokens that the pipeline
    # does not flag as punctuation (e.g. "+").
    punctuations = set(string.punctuation)

    lemmas = []
    for token in nlp(sentence):
        text = token.text

        # Drop punctuation and whitespace tokens.
        if token.is_punct or token.is_space:
            continue
        if text and all(char in punctuations for char in text):
            continue

        # Use the pipeline's own lemma; fall back to the surface form when
        # the lemma is empty.
        lemma = (token.lemma_ or text).strip().lower()
        if not lemma:
            continue

        # Compare as strings: a Token object never equals a stop-word string.
        if lemma in stop_words or text.lower() in stop_words:
            continue

        lemmas.append(lemma)

    return lemmas

## 3. Evaluation function

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Evaluate the model
def evaluate(true, pred):
    """Print accuracy and micro-averaged precision, recall and F1.

    Parameters
    ----------
    true : array-like
        Ground-truth labels.
    pred : array-like
        Predicted labels, aligned with ``true``.

    Returns
    -------
    None
        The scores are printed, not returned.
    """
    # Score the arguments that were passed in. Reading the notebook globals
    # y_test / y_pred here would make precision, recall and F1 ignore the
    # function's inputs.
    precision = precision_score(true, pred, average='micro')
    recall = recall_score(true, pred, average='micro')
    f1 = f1_score(true, pred, average='micro')
    print(f"ACCURACY SCORE:\n{accuracy_score(true, pred):.4f}")
    print(f"CLASSIFICATION REPORT:\n\tPrecision: {precision:.4f}\n\tRecall: {recall:.4f}\n\tF1_Score: {f1:.4f}")
# NOTE: evaluate() is deliberately not called in this cell. y_test and y_pred
# are first created in the model-training cells, so calling it here raises
# NameError on a fresh kernel (Restart & Run All). Each model cell calls
# evaluate() itself right after predicting.

ACCURACY SCORE:
0.3177
CLASSIFICATION REPORT:
	Precision: 0.3177
	Recall: 0.3177
	F1_Score: 0.3177


# 4. Train models

## 4.1. Logistic Regression with Tf-IDF

In [None]:
# import libraries
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split


# Independent variable (input text) and dependent variable (target label)
X = df_train['sentence']
y = df_train['difficulty']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# TF-IDF over character n-grams (1 to 12 characters) feeding a logistic regression.
# No `tokenizer` is passed: scikit-learn only uses a tokenizer when
# analyzer == 'word', so with analyzer="char" it would be ignored.
pipe_tvec = Pipeline([
    ('tvec', TfidfVectorizer(ngram_range=(1, 12), min_df=1, norm='l2', analyzer="char", sublinear_tf=True)),  # transformer
    # max_iter is raised above the default because lbfgs otherwise stops at
    # the iteration limit before converging on this feature space.
    ('lr', LogisticRegression(solver='lbfgs', penalty='l2', C=5, max_iter=1000)),  # model
])

# Fit on the training split
pipe_tvec.fit(X_train, y_train)

# Predict on the held-out split
y_pred = pipe_tvec.predict(X_test)

# Print the scores on the held-out split
evaluate(y_test, y_pred)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


ACCURACY SCORE:
0.5146
CLASSIFICATION REPORT:
	Precision: 0.5146
	Recall: 0.5146
	F1_Score: 0.5146


In [None]:
# Keep the logistic-regression scores on the held-out split for the summary table.
LRAccuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
LRPrecision = precision_score(y_true=y_test, y_pred=y_pred, average='micro')
LRRecall = recall_score(y_true=y_test, y_pred=y_pred, average='micro')
LRF1 = f1_score(y_true=y_test, y_pred=y_pred, average='micro')

## 4.2. KNN 


In [None]:
# import libraries
from sklearn.neighbors import KNeighborsClassifier

# TF-IDF over character n-grams (1 to 12 characters) feeding a distance-weighted KNN.
# No `tokenizer` is passed: scikit-learn only uses a tokenizer when
# analyzer == 'word', so with analyzer="char" it would be ignored.
pipe_knn = Pipeline([
    ('tvec', TfidfVectorizer(ngram_range=(1, 12), min_df=1, norm='l2', analyzer="char", sublinear_tf=True)),  # transformer
    ('knn', KNeighborsClassifier(n_neighbors=140, p=2, weights='distance')),  # model
])

# Fit on the training split and predict the held-out split
pipe_knn.fit(X_train, y_train)
y_pred = pipe_knn.predict(X_test)

# Evaluation
evaluate(y_test, y_pred)


ACCURACY SCORE:
0.3906
CLASSIFICATION REPORT:
	Precision: 0.3906
	Recall: 0.3906
	F1_Score: 0.3906


In [None]:
# Keep the KNN scores on the held-out split for the summary table.
KNNAccuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
KNNPrecision = precision_score(y_true=y_test, y_pred=y_pred, average='micro')
KNNRecall = recall_score(y_true=y_test, y_pred=y_pred, average='micro')
KNNF1 = f1_score(y_true=y_test, y_pred=y_pred, average='micro')

## 4.3. Decision Tree Classifier 


In [None]:
from sklearn.tree import DecisionTreeClassifier 

# Default (word-level) TF-IDF feeding a depth-limited decision tree.
pipe_tree = Pipeline([
    ('tvec', TfidfVectorizer()),  # transformer
    # random_state is fixed so the fitted tree, and therefore the reported
    # scores, are the same on every run.
    ('tree', DecisionTreeClassifier(max_depth=15, random_state=0)),  # model
])

# Fit on the training split and predict the held-out split
pipe_tree.fit(X_train, y_train)
y_pred = pipe_tree.predict(X_test)

# Evaluation
evaluate(y_test, y_pred)

ACCURACY SCORE:
0.3187
CLASSIFICATION REPORT:
	Precision: 0.3187
	Recall: 0.3187
	F1_Score: 0.3187


In [None]:
# Keep the decision-tree scores on the held-out split for the summary table.
DTCAccuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
DTCPrecision = precision_score(y_true=y_test, y_pred=y_pred, average='micro')
DTCRecall = recall_score(y_true=y_test, y_pred=y_pred, average='micro')
DTCF1 = f1_score(y_true=y_test, y_pred=y_pred, average='micro')

## 4.4. Random Forest Classifier


In [None]:
# import libraries
from sklearn.ensemble import RandomForestClassifier

# TF-IDF over character n-grams (1 to 12 characters) feeding a random forest.
# No `tokenizer` is passed: scikit-learn only uses a tokenizer when
# analyzer == 'word', so with analyzer="char" it would be ignored.
pipe_forest = Pipeline([
    ('tvec', TfidfVectorizer(ngram_range=(1, 12), min_df=1, norm='l2', analyzer="char", sublinear_tf=True)),  # transformer
    # random_state is fixed so the forest, and therefore the reported scores,
    # are the same on every run.
    ('forest', RandomForestClassifier(max_depth=220, random_state=0)),  # model
])

# Fit on the training split and predict the held-out split
pipe_forest.fit(X_train, y_train)
y_pred = pipe_forest.predict(X_test)

# Evaluation
evaluate(y_test, y_pred)

ACCURACY SCORE:
0.3688
CLASSIFICATION REPORT:
	Precision: 0.3688
	Recall: 0.3688
	F1_Score: 0.3688


In [None]:
# Keep the random-forest scores on the held-out split for the summary table.
RDFAccuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
RDFPrecision = precision_score(y_true=y_test, y_pred=y_pred, average='micro')
RDFRecall = recall_score(y_true=y_test, y_pred=y_pred, average='micro')
RDFF1 = f1_score(y_true=y_test, y_pred=y_pred, average='micro')

# 5. Show a summary of your results

In [None]:
# Summary table: one row per model, one column per score.
Results = pd.DataFrame(
    [
        ('Logistic Regression ', LRAccuracy, LRPrecision, LRRecall, LRF1),
        ('KNN', KNNAccuracy, KNNPrecision, KNNRecall, KNNF1),
        ('Decision Tree Classifier ', DTCAccuracy, DTCPrecision, DTCRecall, DTCF1),
        ('Random Forest Classifier ', RDFAccuracy, RDFPrecision, RDFRecall, RDFF1),
    ],
    columns=['Model selected', 'Accuracy', 'Precision', 'Recall', 'F1_Score'],
)

Results

Unnamed: 0,Model selected,Accuracy,Precision,Recall,F1_Score
0,Logistic Regression,0.514583,0.514583,0.514583,0.514583
1,KNN,0.390625,0.390625,0.390625,0.390625
2,Decision Tree Classifier,0.31875,0.31875,0.31875,0.31875
3,Random Forest Classifier,0.36875,0.36875,0.36875,0.36875


**The best model is the following:**

In [None]:
# Show the row(s) whose accuracy equals the highest accuracy in the summary table.
print(Results.loc[Results['Accuracy'] == Results['Accuracy'].max(), ['Model selected', 'Accuracy']])

         Model selected  Accuracy
0  Logistic Regression   0.514583
