In [132]:
# Suppressing all warnings for a cleaner output
import warnings
warnings.filterwarnings('ignore')
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# Importing essential libraries for data handling and visualization
import pandas as pd # For data manipulation
import plotly.express as px # For interactive plots
from sklearn.linear_model import LogisticRegression # Logistic Regression model
from sklearn.neighbors import KNeighborsClassifier # K-Nearest Neighbors model
from sklearn.feature_extraction.text import TfidfVectorizer , CountVectorizer # Text feature extraction
from sklearn.pipeline import Pipeline  # To create a processing pipeline
from sklearn.model_selection import GridSearchCV # For hyperparameter tuning
from sklearn.metrics import accuracy_score, classification_report # Model evaluation metrics
import plotly.graph_objects as go # Advanced plotting
from sklearn.ensemble import RandomForestClassifier  # Random Forest model
from sklearn.tree import DecisionTreeClassifier # Decision Tree model

# Re-importing warnings which seems redundant and could be removed
import warnings 




In [None]:
# Loading the training dataset
data = pd.read_csv('training_data.csv')

In [134]:
# Displaying the first few rows of the dataset
data

Unnamed: 0,id,sentence,difficulty
0,0,Les coûts kilométriques réels peuvent diverger...,C1
1,1,"Le bleu, c'est ma couleur préférée mais je n'a...",A1
2,2,Le test de niveau en français est sur le site ...,A1
3,3,Est-ce que ton mari est aussi de Boston?,A1
4,4,"Dans les écoles de commerce, dans les couloirs...",B1
...,...,...,...
4795,4795,"C'est pourquoi, il décida de remplacer les hab...",B2
4796,4796,Il avait une de ces pâleurs splendides qui don...,C1
4797,4797,"Et le premier samedi de chaque mois, venez ren...",A2
4798,4798,Les coûts liés à la journalisation n'étant pas...,C2


In [135]:
# Printing the shape of the dataset to understand its size 

print(data.shape)

(4800, 3)


In [None]:
# Printing unique values in the 'difficulty' column to understand the classification categories
print(data['difficulty'].unique())

['C1' 'A1' 'B1' 'A2' 'B2' 'C2']


# Data Exploration 

In [137]:
# Grouping data by 'difficulty' and aggregating sentences
data_grouped = data[['difficulty','sentence']].groupby(['difficulty']).agg(list)

# Calculating the average length of sentences for each difficulty level
data_grouped['avg_len'] = data_grouped['sentence'].apply(lambda x: sum([len(i) for i in x])/len(x))

In [138]:
# Displaying the grouped data
data_grouped

Unnamed: 0_level_0,sentence,avg_len
difficulty,Unnamed: 1_level_1,Unnamed: 2_level_1
A1,"[Le bleu, c'est ma couleur préférée mais je n'...",39.172202
A2,[voilà une autre histoire que j'ai beaucoup ai...,61.54717
B1,"[Dans les écoles de commerce, dans les couloir...",84.58239
B2,[Il est particulièrement observé chez les pers...,120.902778
C1,[Les coûts kilométriques réels peuvent diverge...,156.444862
C2,[Ma timidité me quittait dès que je m'éloignai...,199.612144


In [139]:
# Creating a bar plot to visualize average sentence length by difficulty level
fig = px.bar(data_grouped.reset_index(), x="difficulty", y="avg_len", color="difficulty", title="Average Sentence Length per Difficulty")
fig.show()


In [140]:
# split data into train and test
from sklearn.model_selection import train_test_split

train, test = train_test_split(data, test_size=0.2, random_state=42)

In [141]:
# Displaying the first few rows of the training set
train

Unnamed: 0,id,sentence,difficulty
2758,2758,"On n'en sort pas, et moi, j'en ai marre.",A2
2850,2850,Il y avait deux enfants de 14 et 11 ans.,A2
4440,4440,Son appartement est loin de l'Université.,A1
4576,4576,Les négociateurs du Congrès de Vienne établire...,C2
1774,1774,Je ne trouve pas le livre que tu cherches.,A1
...,...,...,...
4426,4426,C'est très gentil de sa part !,A2
466,466,Alexandre aimait réviser à l'université pour f...,B1
3092,3092,"Socrate incarne, en effet, le sage qui sait di...",C2
3772,3772,Nous sommes en vacances chez des amis.,A1


## Logistic regression with TF - IDF

####  without hyper parameter tuning

In [142]:
# Setting up TF-IDF and Logistic Regression pipeline
tfidf = TfidfVectorizer()
logreg = LogisticRegression()

pipe = Pipeline([('tfidf', tfidf), ('logreg', logreg)])

# Training the model
pipe.fit(train['sentence'], train['difficulty'])

# Making predictions and evaluating the model
preds = pipe.predict(test['sentence'])
print(accuracy_score(test['difficulty'], preds))
print(classification_report(test['difficulty'], preds))


0.4510416666666667
              precision    recall  f1-score   support

          A1       0.52      0.64      0.57       166
          A2       0.35      0.32      0.33       158
          B1       0.39      0.30      0.34       166
          B2       0.47      0.38      0.42       153
          C1       0.44      0.43      0.43       152
          C2       0.49      0.62      0.55       165

    accuracy                           0.45       960
   macro avg       0.44      0.45      0.44       960
weighted avg       0.44      0.45      0.44       960



#### with hyper parameter tuning

In [156]:
# Setting up TF-IDF and Logistic Regression within a pipeline
tfidf = TfidfVectorizer()
lr = LogisticRegression()
pipe = Pipeline([('tfidf', tfidf), ('lr', lr)])

# Defining a grid of parameters to tune
param_grid = {
    'tfidf__max_df': [0.5, 1.0],  
    'tfidf__min_df': [1, 3],      
    'tfidf__ngram_range': [(1, 1), (1, 2)],  
    'lr__C': [0.1, 1, 10],  
}
# Setting up GridSearchCV for hyperparameter tuning
grid = GridSearchCV(pipe, param_grid, cv=3, n_jobs=-1)  
grid.fit(train['sentence'], train['difficulty'])

# Displaying the best parameters and model performance
print("Best Parameters: ", grid.best_params_)
print("Best Cross-Validation Score: ", grid.best_score_)
print("Best Estimator: ", grid.best_estimator_)

# Evaluating the model on the test set
test_accuracy = grid.score(test['sentence'], test['difficulty'])
print("Test Accuracy: ", test_accuracy)

# Making predictions and evaluating accuracy
preds = grid.predict(test['sentence'])
print("Accuracy Score: ", accuracy_score(test['difficulty'], preds))
print("Classification Report:\n", classification_report(test['difficulty'], preds))

Test Accuracy:  0.4510416666666667
Accuracy Score:  0.4510416666666667
Classification Report:
               precision    recall  f1-score   support

          A1       0.52      0.64      0.57       166
          A2       0.35      0.32      0.33       158
          B1       0.39      0.30      0.34       166
          B2       0.47      0.38      0.42       153
          C1       0.44      0.43      0.43       152
          C2       0.49      0.62      0.55       165

    accuracy                           0.45       960
   macro avg       0.44      0.45      0.44       960
weighted avg       0.44      0.45      0.44       960



# Logistic Regression with Bag-of-Words

#### without hyper parameter tuning

In [144]:
# Creating a pipeline with Bag of Words and Logistic Regression
bow = CountVectorizer()
logreg = LogisticRegression()
pipe = Pipeline([('bow', bow), ('logreg', logreg)])

# Training the model with the training data
pipe.fit(train['sentence'], train['difficulty'])

# Predicting and evaluating on the test data
preds = pipe.predict(test['sentence'])
print(accuracy_score(test['difficulty'], preds))
print(classification_report(test['difficulty'], preds))


0.475
              precision    recall  f1-score   support

          A1       0.53      0.74      0.61       166
          A2       0.38      0.44      0.41       158
          B1       0.42      0.34      0.37       166
          B2       0.50      0.42      0.46       153
          C1       0.48      0.44      0.46       152
          C2       0.55      0.47      0.50       165

    accuracy                           0.48       960
   macro avg       0.47      0.47      0.47       960
weighted avg       0.47      0.47      0.47       960



#### with hyper parameter tuning

In [158]:
# Setting up Bag of Words and Logistic Regression in a pipeline
bow = CountVectorizer()
lr = LogisticRegression()

pipe = Pipeline([('bow', bow), ('lr', lr)])

# Defining parameters for tuning
param_grid = {
    'bow__max_df': [0.5, 1.0],
    'bow__min_df': [1, 2, 3],
    'bow__ngram_range': [(1, 1), (1, 2)],
    'lr__C': [ 0.1, 1, 10],

}

# Using GridSearchCV for finding the best model parameters
grid = GridSearchCV(pipe, param_grid, cv=3, n_jobs=-1)
grid.fit(train['sentence'], train['difficulty'])

# Displaying best parameters and model performance
print("Best Parameters: ", grid.best_params_)
print("Best Cross-Validation Score: ", grid.best_score_)
print("Best Estimator: ", grid.best_estimator_)

# Evaluating the model on the test data
test_accuracy = grid.score(test['sentence'], test['difficulty'])
print("Test Accuracy: ", test_accuracy)

# Predicting and evaluating model performance
preds = grid.predict(test['sentence'])
print("Accuracy Score: ", accuracy_score(test['difficulty'], preds))
print("Classification Report:\n", classification_report(test['difficulty'], preds))


Best Parameters:  {'bow__max_df': 0.5, 'bow__min_df': 1, 'bow__ngram_range': (1, 1), 'lr__C': 1}
Best Cross-Validation Score:  0.43515624999999997
Best Estimator:  Pipeline(steps=[('bow', CountVectorizer(max_df=0.5)),
                ('lr', LogisticRegression(C=1))])
Test Accuracy:  0.475
Accuracy Score:  0.475
Classification Report:
               precision    recall  f1-score   support

          A1       0.53      0.74      0.61       166
          A2       0.38      0.44      0.41       158
          B1       0.42      0.34      0.37       166
          B2       0.50      0.42      0.46       153
          C1       0.48      0.44      0.46       152
          C2       0.55      0.47      0.50       165

    accuracy                           0.48       960
   macro avg       0.47      0.47      0.47       960
weighted avg       0.47      0.47      0.47       960



### We see that bag of words gives better results which makes sense as we have a small dataset and tfidf is better for large datasets
### We will thus use bag of words for the next models

# KNN

#### without hypr parameter tuning

In [146]:
# Creating a pipeline with Bag of Words and K-Nearest Neighbors
bow = CountVectorizer()
knn = KNeighborsClassifier()

pipe = Pipeline([('bow', bow), ('knn', knn)])

# Training the pipeline model on the training set
pipe.fit(train['sentence'], train['difficulty'])

# Making predictions on the test set and evaluating the model
preds = pipe.predict(test['sentence'])
print(accuracy_score(test['difficulty'], preds))
print(classification_report(test['difficulty'], preds))


0.2
              precision    recall  f1-score   support

          A1       0.21      0.96      0.35       166
          A2       0.10      0.08      0.09       158
          B1       0.18      0.05      0.08       166
          B2       0.22      0.03      0.05       153
          C1       0.14      0.01      0.01       152
          C2       0.86      0.04      0.07       165

    accuracy                           0.20       960
   macro avg       0.29      0.19      0.11       960
weighted avg       0.29      0.20      0.11       960



#### with hyper parameter tuning

In [147]:
# Setting up the Bag of Words model
bow = CountVectorizer()

# Testing different values of k for K-Nearest Neighbors
k_values = range(1, 21)  
accuracies = []

for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    pipe = Pipeline([('bow', bow), ('knn', knn)])
    
    pipe.fit(train['sentence'], train['difficulty'])
    preds = pipe.predict(test['sentence'])
    
    accuracy = accuracy_score(test['difficulty'], preds)
    accuracies.append(accuracy)
    print(f"Accuracy for k={k}: {accuracy}")

# Plotting the accuracy for each k value
fig = go.Figure(data=go.Scatter(x=list(k_values), y=accuracies, mode='lines+markers'))
fig.update_layout(title='KNN Accuracy over different values of k',
                  xaxis_title='k',
                  yaxis_title='Accuracy',
                  plot_bgcolor='white')
fig.show()



Accuracy for k=1: 0.24583333333333332
Accuracy for k=2: 0.20520833333333333
Accuracy for k=3: 0.20520833333333333
Accuracy for k=4: 0.21458333333333332
Accuracy for k=5: 0.2
Accuracy for k=6: 0.196875
Accuracy for k=7: 0.18854166666666666
Accuracy for k=8: 0.19166666666666668
Accuracy for k=9: 0.184375
Accuracy for k=10: 0.18645833333333334
Accuracy for k=11: 0.18125
Accuracy for k=12: 0.184375
Accuracy for k=13: 0.18541666666666667
Accuracy for k=14: 0.1875
Accuracy for k=15: 0.18541666666666667
Accuracy for k=16: 0.18958333333333333
Accuracy for k=17: 0.18854166666666666
Accuracy for k=18: 0.18645833333333334
Accuracy for k=19: 0.18333333333333332
Accuracy for k=20: 0.18541666666666667


In [148]:
# Initializing the Bag of Words model
bow = CountVectorizer()


# Setting k to 1 for K-Nearest Neighbors
k = 1

# Setting up the pipeline
knn = KNeighborsClassifier(n_neighbors=k)
pipe = Pipeline([('bow', bow), ('knn', knn)])

# Training the model
pipe.fit(train['sentence'], train['difficulty'])

# Making predictions and evaluating the model
preds = pipe.predict(test['sentence'])

# Calculating and printing the accuracy
accuracy = accuracy_score(test['difficulty'], preds)
accuracies.append(accuracy)
print(f"Accuracy for k={k}: {accuracy}")

# Printing a detailed classification report
print(classification_report(test['difficulty'], preds))

Accuracy for k=1: 0.24583333333333332
              precision    recall  f1-score   support

          A1       0.24      0.77      0.37       166
          A2       0.14      0.14      0.14       158
          B1       0.27      0.17      0.21       166
          B2       0.36      0.14      0.20       153
          C1       0.25      0.12      0.17       152
          C2       0.64      0.11      0.19       165

    accuracy                           0.25       960
   macro avg       0.32      0.24      0.21       960
weighted avg       0.32      0.25      0.21       960



# Decision Tree

#### without hyper parameter tuning

In [149]:
# Setting up the Bag of Words model
bow = CountVectorizer()

# Initializing the Decision Tree Classifier
dt = DecisionTreeClassifier()

# Setting up the pipeline
pipe = Pipeline([('bow', bow), ('dt', dt)])

# Training the model
pipe.fit(train['sentence'], train['difficulty'])

# Making predictions and evaluating the model
preds = pipe.predict(test['sentence'])

# Printing the accuracy of the model on the test data
print(accuracy_score(test['difficulty'], preds))

# Printing a detailed classification report
print(classification_report(test['difficulty'], preds))



0.31979166666666664
              precision    recall  f1-score   support

          A1       0.46      0.55      0.51       166
          A2       0.26      0.26      0.26       158
          B1       0.26      0.20      0.22       166
          B2       0.24      0.28      0.26       153
          C1       0.31      0.37      0.34       152
          C2       0.35      0.25      0.29       165

    accuracy                           0.32       960
   macro avg       0.31      0.32      0.31       960
weighted avg       0.32      0.32      0.31       960



#### with hyper parameter tuning

In [150]:
# Initializing the Bag of Words model
bow = CountVectorizer()

# Testing different values of max_depth for Decision Tree
max_depth_values = range(1, 21)
accuracies = []


for max_depth in max_depth_values:
    dt = DecisionTreeClassifier(max_depth=max_depth)
    pipe = Pipeline([('bow', bow), ('dt', dt)])
    
    pipe.fit(train['sentence'], train['difficulty'])
    preds = pipe.predict(test['sentence'])
    
    accuracy = accuracy_score(test['difficulty'], preds)
    accuracies.append(accuracy)
    print(f"Accuracy for max_depth={max_depth}: {accuracy}")

# Plotting the accuracy for each max_depth value
fig = go.Figure(data=go.Scatter(x=list(max_depth_values), y=accuracies, mode='lines+markers'))
fig.update_layout(title='Decision Tree Accuracy over different values of max_depth',
                  xaxis_title='max_depth',
                  yaxis_title='Accuracy',
                  plot_bgcolor='white')
fig.show()







Accuracy for max_depth=1: 0.25729166666666664
Accuracy for max_depth=2: 0.26666666666666666
Accuracy for max_depth=3: 0.2864583333333333
Accuracy for max_depth=4: 0.290625
Accuracy for max_depth=5: 0.2947916666666667
Accuracy for max_depth=6: 0.3125
Accuracy for max_depth=7: 0.3229166666666667
Accuracy for max_depth=8: 0.31979166666666664
Accuracy for max_depth=9: 0.3177083333333333
Accuracy for max_depth=10: 0.33958333333333335
Accuracy for max_depth=11: 0.3177083333333333
Accuracy for max_depth=12: 0.3125
Accuracy for max_depth=13: 0.31666666666666665
Accuracy for max_depth=14: 0.3145833333333333
Accuracy for max_depth=15: 0.3177083333333333
Accuracy for max_depth=16: 0.3145833333333333
Accuracy for max_depth=17: 0.3145833333333333
Accuracy for max_depth=18: 0.3177083333333333
Accuracy for max_depth=19: 0.321875
Accuracy for max_depth=20: 0.325


In [151]:
# Initializing Bag of Words model
bow = CountVectorizer()
dt = DecisionTreeClassifier(max_depth=10)
pipe = Pipeline([('bow', bow), ('dt', dt)])

# Training and evaluating the model
pipe.fit(train['sentence'], train['difficulty'])
preds = pipe.predict(test['sentence'])
print(accuracy_score(test['difficulty'], preds))
print(classification_report(test['difficulty'], preds))

0.3375
              precision    recall  f1-score   support

          A1       0.46      0.58      0.52       166
          A2       0.23      0.34      0.27       158
          B1       0.43      0.16      0.24       166
          B2       0.28      0.40      0.33       153
          C1       0.31      0.34      0.33       152
          C2       0.46      0.20      0.28       165

    accuracy                           0.34       960
   macro avg       0.36      0.34      0.33       960
weighted avg       0.37      0.34      0.33       960



# Random Forest

#### without hyper parameter tuning

In [152]:
# Pipeline with Bag of Words and Random Forest Classifier
bow = CountVectorizer()
rf = RandomForestClassifier()
pipe = Pipeline([('bow', bow), ('rf', rf)])

# Training the model and making predictions
pipe.fit(train['sentence'], train['difficulty'])
preds = pipe.predict(test['sentence'])

# Output accuracy and classification report
print(accuracy_score(test['difficulty'], preds))
print(classification_report(test['difficulty'], preds))


0.37916666666666665
              precision    recall  f1-score   support

          A1       0.38      0.83      0.52       166
          A2       0.28      0.25      0.27       158
          B1       0.35      0.28      0.31       166
          B2       0.38      0.29      0.33       153
          C1       0.39      0.32      0.35       152
          C2       0.59      0.29      0.39       165

    accuracy                           0.38       960
   macro avg       0.40      0.38      0.36       960
weighted avg       0.40      0.38      0.36       960



#### with hyper parameter tuning

In [153]:
# Pipeline setup with Bag of Words and Random Forest Classifier
bow = CountVectorizer()
rf = RandomForestClassifier()

pipe = Pipeline([('bow', bow), ('rf', rf)])

# Defining the parameter grid for hyperparameter tuning
param_grid = {
    'bow__max_df': [0.5, 1.0],
    'bow__min_df': [1, 2, 3],
    'bow__ngram_range': [(1, 1), (1, 2)],
    'rf__n_estimators': [50, 100, 200],
    'rf__max_depth': [5, 10, 20],
}

# Using GridSearchCV to find the best model parameters
grid = GridSearchCV(pipe, param_grid, cv=3, n_jobs=-1)
grid.fit(train['sentence'], train['difficulty'])

# Outputting the best parameters and performance scores
print("Best Parameters: ", grid.best_params_)
print("Best Cross-Validation Score: ", grid.best_score_)
print("Best Estimator: ", grid.best_estimator_)
test_accuracy = grid.score(test['sentence'], test['difficulty'])
print("Test Accuracy: ", test_accuracy)

# Making predictions and printing evaluation metrics
preds = grid.predict(test['sentence'])
print("Accuracy Score: ", accuracy_score(test['difficulty'], preds))
print("Classification Report:\n", classification_report(test['difficulty'], preds))


Best Parameters:  {'bow__max_df': 1.0, 'bow__min_df': 2, 'bow__ngram_range': (1, 1), 'rf__max_depth': 20, 'rf__n_estimators': 200}
Best Cross-Validation Score:  0.38307291666666665
Best Estimator:  Pipeline(steps=[('bow', CountVectorizer(min_df=2)),
                ('rf', RandomForestClassifier(max_depth=20, n_estimators=200))])
Test Accuracy:  0.3770833333333333
Accuracy Score:  0.3770833333333333
Classification Report:
               precision    recall  f1-score   support

          A1       0.38      0.83      0.52       166
          A2       0.32      0.18      0.23       158
          B1       0.41      0.13      0.19       166
          B2       0.35      0.29      0.32       153
          C1       0.35      0.39      0.37       152
          C2       0.44      0.42      0.43       165

    accuracy                           0.38       960
   macro avg       0.38      0.37      0.34       960
weighted avg       0.38      0.38      0.35       960



# Results

In [154]:
# Defining metrics and classifies for performance comparison
metrics = ['precision', 'recall', 'f1-score', 'accuracy']
classifiers = ['Logistic Regression', 'KNN', 'DecisionTree', 'RandomForest']

# Creating a dataframe to store the performance scores
performance_df = pd.DataFrame(index=metrics, columns=classifiers)

# Populating the DataFrame with performance scores for each classifier
performance_df['Logistic Regression'] = ['0.47', '0.47', '0.47', '0.48']
performance_df['KNN'] = ['0.32', '0.24', '0.21', '0.25']
performance_df['DecisionTree'] = ['0.36', '0.34', '0.33', '0.34']
performance_df['RandomForest'] = ['0.38', '0.38', '0.34', '0.38']

# Displaying the performance DataFrame
performance_df


Unnamed: 0,Logistic Regression,KNN,DecisionTree,RandomForest
precision,0.47,0.32,0.36,0.38
recall,0.47,0.24,0.34,0.38
f1-score,0.47,0.21,0.33,0.34
accuracy,0.48,0.25,0.34,0.38
