## 1) Prepare dataset

In [None]:
# Download the french language model (the one who shows the best accuracy on spacy.io)
!python -m spacy download fr_core_news_md

In [12]:
import pandas as pd
import spacy
from spacy import displacy


# Load English language model
sp = spacy.load('fr_core_news_md')

# Import stopwords from french language
stop_words = spacy.lang.fr.stop_words.STOP_WORDS
# Import punctations characters
punctuations = string.punctuation

In [89]:
trainDf = pd.read_csv("https://raw.githubusercontent.com/LaCrazyTomato/Group-Project-DM-ML-2021/main/data/training_data.csv")
testDf = pd.read_csv("https://raw.githubusercontent.com/LaCrazyTomato/Group-Project-DM-ML-2021/main/data/unlabelled_test_data.csv")

display(testDf.head())
display(trainDf.head())

Unnamed: 0,id,sentence
0,0,Nous dûmes nous excuser des propos que nous eû...
1,1,Vous ne pouvez pas savoir le plaisir que j'ai ...
2,2,"Et, paradoxalement, boire froid n'est pas la b..."
3,3,"Ce n'est pas étonnant, car c'est une saison my..."
4,4,"Le corps de Golo lui-même, d'une essence aussi..."


Unnamed: 0,id,sentence,difficulty
0,0,Les coûts kilométriques réels peuvent diverger...,C1
1,1,"Le bleu, c'est ma couleur préférée mais je n'a...",A1
2,2,Le test de niveau en français est sur le site ...,A1
3,3,Est-ce que ton mari est aussi de Boston?,A1
4,4,"Dans les écoles de commerce, dans les couloirs...",B1


## 2) Classification


In [87]:
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
import string
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score



In [21]:
def spacy_tokenizer(sentence):
    """
    Tokenizer function
    """
    
    # Create token object, which is used to create documents with linguistic annotations.
    mytokens = sp(sentence)

    # Lemmatize each token and convert each token into lowercase
    mytokens = [ word.lemma_.lower().strip() for word in mytokens ]
    
    # Remove stop words and punctuation
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]

    # Return preprocessed list of tokens
    return mytokens


def evaluate(true, pred):
    """
    Function to evaluate model accuracy
    """
    precision = precision_score(true, pred)
    recall = recall_score(true, pred)
    f1 = f1_score(true, pred)
    print(f"CONFUSION MATRIX:\n{confusion_matrix(true, pred)}")
    print(f"ACCURACY SCORE:\n{accuracy_score(true, pred):.4f}")
    print(f"CLASSIFICATION REPORT:\n\tPrecision: {precision:.4f}\n\tRecall: {recall:.4f}\n\tF1_Score: {f1:.4f}")




In [73]:
# Term Frequency-Inverse Document Frequency
tfidf_vector = TfidfVectorizer(tokenizer=spacy_tokenizer) # we use the above defined tokenizer

## 2.1) Logistic Regression

In [74]:
# Select features
X = trainDf['sentence'] # the features we want to analyze
ylabels = trainDf['difficulty'] # the labels, or answers, we want to test against

# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size=0.2, random_state=1234, stratify=ylabels)

X_train

962     Le réalisateur m'a d'abord demandé de me mettr...
1886    Après quelques mois de cette pauvreté noble, a...
2721    L'indicateur n'était que de 40% chez les femme...
1025    L'objectif de ce type de voyage est d'être act...
4048    Et, en France, beaucoup moins de filles que de...
                              ...                        
3693     Je vais prendre ma douche dans ma salle-de-bain.
3408    Après l'éruption de 1754, la plus grosse connu...
4289    Léonard est initié par Verrocchio aux nombreus...
3312    On en trouve des exemples dans l'ouvrage "L'in...
269     On peut aussi aller au théâtre, dans les musée...
Name: sentence, Length: 3840, dtype: object

In [84]:
# Define classifier
LR_model = LogisticRegression()

# Create pipeline
pipe = Pipeline([('vectorizer', tfidf_vector),
                 ('classifier', LR_model)])

# Fit model on training set
pipe.fit(X_train, y_train)




Pipeline(memory=None,
         steps=[('vectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_patt...
                                 tokenizer=<function spacy_tokenizer at 0x000001673CAF23A8>,
                                 use_idf=True, vocabulary=None)),
                ('classifier',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fi

In [82]:
pred = pipe.predict(X_test)


In [83]:
(pred==y_test).sum()/len(y_test)

0.403125

## 2.2) Logistic Regression with Cross Validation

In [88]:
LRCV_model = LogisticRegressionCV()

pipe = Pipeline([('vectorizer', tfidf_vector),
                 ('classifier', LRCV_model)])


pipe.fit(X_train, y_train)




Pipeline(memory=None,
         steps=[('vectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_patt...
                                 tokenizer=<function spacy_tokenizer at 0x000001673CAF23A8>,
                                 use_idf=True, vocabulary=None)),
                ('classifier',
                 LogisticRegressionCV(Cs=10, class_weight=None, cv='warn',
                                     

In [90]:
pred = pipe.predict(X_test)

In [91]:
(pred==y_test).sum()/len(y_test)

0.39479166666666665

## 2.3) Decision Tree

In [36]:
from sklearn.tree import DecisionTreeClassifier, plot_tree

tree_model = DecisionTreeClassifier()

pipe = Pipeline([('vectorizer', tfidf_vector),
                 ('classifier', tree_model)])


pipe.fit(X_train, y_train)


Pipeline(memory=None,
         steps=[('vectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_patt...
                                 tokenizer=<function spacy_tokenizer at 0x00000167345B0CA8>,
                                 use_idf=True, vocabulary=None)),
                ('classifier',
                 DecisionTreeClassifier(class_weight=None, criterion='gini',
                                   

In [39]:
pred = pipe.predict(X_test)

(pred==y_test).sum()/len(y_test)


0.3458333333333333

In [58]:
pd.DataFrame(pipe.predict(["J'ai", "acquis", "des", "connaissances"])).iloc[:,0].value_counts()

A1    3
B2    1
Name: 0, dtype: int64

## 2.4) Random Forest

In [59]:
from sklearn.ensemble import RandomForestClassifier

randomForest_model = RandomForestClassifier(n_estimators=50)

pipe = Pipeline([('vectorizer', tfidf_vector),
                 ('classifier', randomForest_model)])


pipe.fit(X_train, y_train)


Pipeline(memory=None,
         steps=[('vectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_patt...
                 RandomForestClassifier(bootstrap=True, class_weight=None,
                                        criterion='gini', max_depth=None,
                                        max_features='auto',
                                        max_leaf_nodes=None,
                               

In [60]:
pred = pipe.predict(X_test)

(pred == y_test).sum()/len(y_test)

0.3958333333333333

## 2.5) KNN

In [85]:
from sklearn.neighbors import KNeighborsClassifier

knn_model = KNeighborsClassifier()

pipe = Pipeline([('vectorizer', tfidf_vector),
                 ('classifier', knn_model)])


pipe.fit(X_train, y_train)


Pipeline(memory=None,
         steps=[('vectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function spacy_tokenizer at 0x000001673CAF23A8>,
                                 use_idf=True, vocabulary=None)),
                ('classifier',
                 KNeighborsClassifier(algorithm='auto', leaf_size=30,
                     

In [86]:
pred = pipe.predict(X_test)

(pred == y_test).sum()/len(y_test)


0.17395833333333333

### Next Step :

In [63]:
# Prochaine étape :
# Fonction pour chaque modèle et itérer avec différentes configs du TfidfVectorizer


# Create list of configs
def configs():

    models = list()
    
    # Define config lists
    ngram_range = [(1,1), (1,2), (1, 3), (2, 2), (2, 3), (3, 3)]
    min_df = [1]
    max_df = [1.0]
    analyzer=['word', 'char']
    
    # Create config instances
    for n in ngram_range:
        for i in min_df:
            for j in max_df:
                for a in analyzer:
                    cfg = [n, i, j, a]
                    models.append(cfg)
    return models

configs = configs()


[[(1, 1), 1, 1.0, 'word'],
 [(1, 1), 1, 1.0, 'char'],
 [(1, 2), 1, 1.0, 'word'],
 [(1, 2), 1, 1.0, 'char'],
 [(1, 3), 1, 1.0, 'word'],
 [(1, 3), 1, 1.0, 'char'],
 [(2, 2), 1, 1.0, 'word'],
 [(2, 2), 1, 1.0, 'char'],
 [(2, 3), 1, 1.0, 'word'],
 [(2, 3), 1, 1.0, 'char'],
 [(3, 3), 1, 1.0, 'word'],
 [(3, 3), 1, 1.0, 'char']]