In [56]:
# Suppressing all warnings for cleaner output
import warnings
warnings.filterwarnings('ignore')

# Importing essential libraries for data manipulation and machine learning
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC
import torch
from transformers import AutoTokenizer, AutoModel

In [57]:
# Loading the training data
data = pd.read_csv('training_data.csv')
data.head()

Unnamed: 0,id,sentence,difficulty
0,0,Les coûts kilométriques réels peuvent diverger...,C1
1,1,"Le bleu, c'est ma couleur préférée mais je n'a...",A1
2,2,Le test de niveau en français est sur le site ...,A1
3,3,Est-ce que ton mari est aussi de Boston?,A1
4,4,"Dans les écoles de commerce, dans les couloirs...",B1


In [58]:
# Initializing a pre-trained Sentence Transformer model for sentence embeddings
model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')

# Extracting sentences from the data and converting them to a list
sentences = data['sentence'].values.tolist()

# Generating embeddings for each sentence using the Sentence Transformer model
sentence_embeddings = model.encode(sentences)

# Adding the generated embeddings as a new column in the DataFrame
data['embedding'] = sentence_embeddings.tolist()
data.head()

Unnamed: 0,id,sentence,difficulty,embedding
0,0,Les coûts kilométriques réels peuvent diverger...,C1,"[-0.0463661290705204, -0.039946261793375015, -..."
1,1,"Le bleu, c'est ma couleur préférée mais je n'a...",A1,"[0.037579141557216644, -0.06782776117324829, -..."
2,2,Le test de niveau en français est sur le site ...,A1,"[-0.15025213360786438, 0.09958713501691818, -0..."
3,3,Est-ce que ton mari est aussi de Boston?,A1,"[-0.294254332780838, 0.12327229976654053, -0.0..."
4,4,"Dans les écoles de commerce, dans les couloirs...",B1,"[-0.08921276777982712, 0.24804086983203888, -0..."


In [59]:
# split data into train and test
from sklearn.model_selection import train_test_split
train, test = train_test_split(data, test_size=0.1, random_state=42)

# Logistic Regression with Multililingual Embedding

In [None]:
# Converting embeddings to numpy arrays for model training and testing
X_train = np.array(train['embedding'].values.tolist())
y_train = train['difficulty'].values.tolist()
X_test = np.array(test['embedding'].values.tolist())
y_test = test['difficulty'].values.tolist()

# Training Logistic Regression and making predictions on the test set
clf = LogisticRegression(random_state=0).fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Outputting accuracy and classification report
accuracy_score(y_test, y_pred)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          A1       0.69      0.65      0.67        91
          A2       0.40      0.47      0.43        72
          B1       0.46      0.38      0.42        92
          B2       0.51      0.50      0.50        70
          C1       0.46      0.46      0.46        71
          C2       0.52      0.57      0.55        84

    accuracy                           0.51       480
   macro avg       0.51      0.51      0.50       480
weighted avg       0.51      0.51      0.51       480



In [61]:
# Preparing embeddings and word count features for training and testing
train_embeddings = np.array(train['embedding'].tolist())
test_embeddings = np.array(test['embedding'].tolist())
train_word_counts = train['sentence'].apply(lambda x: len(x.split())).to_numpy().reshape(-1, 1)
test_word_counts = test['sentence'].apply(lambda x: len(x.split())).to_numpy().reshape(-1, 1)
train_features = np.hstack((train_embeddings, train_word_counts))
test_features = np.hstack((test_embeddings, test_word_counts))

# Preparing labels
train_labels = train['difficulty']
test_labels = test['difficulty']

# Training Logistic Regression model and making predictions
model = LogisticRegression()
model.fit(train_features, train_labels)
test_predictions = model.predict(test_features)

# Calculating and printing model accuracy and classification report
accuracy = model.score(test_features, test_labels)
print(f"Model Accuracy: {accuracy}")

report = classification_report(test_labels, test_predictions)
print("Classification Report:")
print(report)


Model Accuracy: 0.4791666666666667
Classification Report:
              precision    recall  f1-score   support

          A1       0.70      0.70      0.70        91
          A2       0.36      0.43      0.39        72
          B1       0.47      0.38      0.42        92
          B2       0.38      0.40      0.39        70
          C1       0.42      0.42      0.42        71
          C2       0.49      0.50      0.50        84

    accuracy                           0.48       480
   macro avg       0.47      0.47      0.47       480
weighted avg       0.48      0.48      0.48       480



In [62]:
# Importing necessary classes from the transformers library
from transformers import AutoModel, AutoTokenizer

# Identifier for the fine-tuned model
finetuned_model_id = 'MokaExpress/flaubert-french-difficulty'

# Loading the tokenizer for the specified fine-tuned model
tokenizer = AutoTokenizer.from_pretrained(finetuned_model_id)

# Loading the model itself
model = AutoModel.from_pretrained(finetuned_model_id)




Some weights of the model checkpoint at MokaExpress/flaubert-french-difficulty were not used when initializing FlaubertModel: ['sequence_summary.summary.weight', 'sequence_summary.summary.bias']
- This IS expected if you are initializing FlaubertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing FlaubertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Logistic Regression with Finetuned - Embedding

In [63]:
# Identifier for the fine-tuned Flaubert model
finetuned_model_id = 'MokaExpress/flaubert-french-difficulty'

# Loading the tokenizer and model for the specified fine-tuned Flaubert model
tokenizer = AutoTokenizer.from_pretrained(finetuned_model_id)
model = AutoModel.from_pretrained(finetuned_model_id)
model.eval()  # Set the model to evaluation mode

# Function to generate embeddings for a sentence using the Flaubert model
def embed_flaubert(sentence, model, tokenizer):
    inputs = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    cls_embedding = outputs.last_hidden_state[:, 0, :]
    return cls_embedding.numpy()




Some weights of the model checkpoint at MokaExpress/flaubert-french-difficulty were not used when initializing FlaubertModel: ['sequence_summary.summary.weight', 'sequence_summary.summary.bias']
- This IS expected if you are initializing FlaubertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing FlaubertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [64]:
# Importing tqdm for progress bar display during processing
from tqdm import tqdm
tqdm.pandas()

# Applying the embed_flaubert function to each sentence in the train set
train['flaubert_embedding'] = train['sentence'].progress_apply(lambda x: embed_flaubert(x, model, tokenizer))

# Applying the same to each sentence in the test set
test['flaubert_embedding'] = test['sentence'].progress_apply(lambda x: embed_flaubert(x, model, tokenizer))


100%|██████████| 4320/4320 [02:58<00:00, 24.20it/s]
100%|██████████| 480/480 [00:17<00:00, 27.42it/s]


In [65]:
# flatten the embeddings

train['flaubert_embedding_flatten'] = train['flaubert_embedding'].apply(lambda x: x.flatten())
test['flaubert_embedding_flatten'] = test['flaubert_embedding'].apply(lambda x: x.flatten())

In [66]:
# Converting Flaubert embeddings to numpy arrays for model input
X_train = np.array(train['flaubert_embedding_flatten'].values.tolist())
y_train = train['difficulty'].values.tolist()
X_test = np.array(test['flaubert_embedding_flatten'].values.tolist())
y_test = test['difficulty'].values.tolist()

# Training Logistic Regression with Flaubert embeddings as features
clf = LogisticRegression(random_state=0).fit(X_train, y_train)

# Making predictions on the test set
y_pred = clf.predict(X_test)

# Evaluating the model's performance
accuracy_score(y_test, y_pred)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          A1       0.77      0.74      0.75        91
          A2       0.55      0.57      0.56        72
          B1       0.73      0.66      0.70        92
          B2       0.58      0.73      0.65        70
          C1       0.65      0.69      0.67        71
          C2       0.74      0.64      0.69        84

    accuracy                           0.67       480
   macro avg       0.67      0.67      0.67       480
weighted avg       0.68      0.67      0.67       480



In [67]:
# Preparing Flaubert embeddings and word count features
train_embeddings = np.array(train['flaubert_embedding_flatten'].tolist())
test_embeddings = np.array(test['flaubert_embedding_flatten'].tolist())
train_word_counts = train['sentence'].apply(lambda x: len(x.split())).to_numpy().reshape(-1, 1)
test_word_counts = test['sentence'].apply(lambda x: len(x.split())).to_numpy().reshape(-1, 1)

# Combining embeddings and word counts for training and testing
train_features = np.hstack((train_embeddings, train_word_counts))
test_features = np.hstack((test_embeddings, test_word_counts))

# Preparing labels
train_labels = train['difficulty']
test_labels = test['difficulty']

# Training Logistic Regression model and making predictions
model = LogisticRegression()
model.fit(train_features, train_labels)

# Making predictions and evaluating the model
test_predictions = model.predict(test_features)
accuracy = model.score(test_features, test_labels)

print(f"Model Accuracy: {accuracy}")

report = classification_report(test_labels, test_predictions)
print("Classification Report:")
print(report)


Model Accuracy: 0.6604166666666667
Classification Report:
              precision    recall  f1-score   support

          A1       0.79      0.77      0.78        91
          A2       0.57      0.54      0.55        72
          B1       0.71      0.65      0.68        92
          B2       0.57      0.69      0.62        70
          C1       0.59      0.65      0.62        71
          C2       0.71      0.64      0.67        84

    accuracy                           0.66       480
   macro avg       0.66      0.66      0.65       480
weighted avg       0.67      0.66      0.66       480



# SVC with Finetuned Embedding

In [68]:
# Preparing data for the SVM model
X_train = np.array(train['flaubert_embedding_flatten'].values.tolist())
y_train = train['difficulty'].values.tolist()
X_test = np.array(test['flaubert_embedding_flatten'].values.tolist())
y_test = test['difficulty'].values.tolist()

# Training the Support Vector Machine classifier
svm_clf = SVC(random_state=0)
svm_clf.fit(X_train, y_train)

# Predicting and evaluating the SVM model
y_pred_svm = svm_clf.predict(X_test)
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print("SVM Accuracy:", accuracy_svm)
print(classification_report(y_test, y_pred_svm))


SVM Accuracy: 0.7395833333333334
              precision    recall  f1-score   support

          A1       0.85      0.81      0.83        91
          A2       0.62      0.74      0.67        72
          B1       0.80      0.64      0.71        92
          B2       0.67      0.81      0.74        70
          C1       0.70      0.68      0.69        71
          C2       0.81      0.76      0.79        84

    accuracy                           0.74       480
   macro avg       0.74      0.74      0.74       480
weighted avg       0.75      0.74      0.74       480



In [55]:
# save the model
import pickle
pickle.dump(svm_clf, open('svm_clf.pkl', 'wb'))


# XGBOOST with Finetuned model

In [53]:
# Importing XGBoost and relevant metrics
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report

# Encoding the 'difficulty' labels
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
train['difficulty_encoded'] = label_encoder.fit_transform(train['difficulty'])
test['difficulty_encoded'] = label_encoder.transform(test['difficulty'])

# Preparing the dataset for XGBoost model training and testing
X_train = np.array(train['flaubert_embedding_flatten'].values.tolist())
y_train = train['difficulty_encoded'].values.tolist()
X_test = np.array(test['flaubert_embedding_flatten'].values.tolist())
y_test = test['difficulty_encoded'].values.tolist()

# Training the XGBoost Classifier
gbm_clf = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=0)
gbm_clf.fit(X_train, y_train)

# Predictions and evaluation
y_pred_gbm = gbm_clf.predict(X_test)
accuracy_gbm = accuracy_score(y_test, y_pred_gbm)
print("GBM Accuracy:", accuracy_gbm)
print(classification_report(y_test, y_pred_gbm))


GBM Accuracy: 0.7125
              precision    recall  f1-score   support

           0       0.83      0.81      0.82        91
           1       0.59      0.67      0.63        72
           2       0.74      0.63      0.68        92
           3       0.65      0.76      0.70        70
           4       0.66      0.68      0.67        71
           5       0.78      0.73      0.75        84

    accuracy                           0.71       480
   macro avg       0.71      0.71      0.71       480
weighted avg       0.72      0.71      0.71       480

