In [None]:
# Importing necessary libraries: pandas, torch and Hugging Face Transformers. Installing sacremoses
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
!pip install sacremoses

Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sacremoses
Successfully installed sacremoses-0.1.1


In [None]:
# Reading unlabelled test data from a CSV file
test_data = pd.read_csv('unlabelled_test_data.csv')
# Fail fast if the file lacks the columns the rest of the notebook relies on: 'sentence' is embedded
# and 'id' is written to the submission; a missing 'id' would otherwise only surface after the
# embedding pass has finished.
assert {'id', 'sentence'} <= set(test_data.columns), "unlabelled_test_data.csv must contain 'id' and 'sentence' columns"


In [None]:
# Displaying the test_data DataFrame to check what was loaded: an 'id' column and the French
# 'sentence' text. As the last expression of the cell it is rendered as a table.
test_data

Unnamed: 0,id,sentence
0,0,Nous dûmes nous excuser des propos que nous eû...
1,1,Vous ne pouvez pas savoir le plaisir que j'ai ...
2,2,"Et, paradoxalement, boire froid n'est pas la b..."
3,3,"Ce n'est pas étonnant, car c'est une saison my..."
4,4,"Le corps de Golo lui-même, d'une essence aussi..."
...,...,...
1195,1195,C'est un phénomène qui trouve une accélération...
1196,1196,Je vais parler au serveur et voir si on peut d...
1197,1197,Il n'était pas comme tant de gens qui par pare...
1198,1198,Ils deviennent dangereux pour notre économie.


In [None]:
# Loading the tokenizer and encoder of a Flaubert checkpoint fine-tuned for difficulty classification.
# AutoModel returns the bare encoder; its hidden states are used as sentence features further down.
finetuned_model_id = 'MokaExpress/flaubert-french-difficulty'
tokenizer = AutoTokenizer.from_pretrained(finetuned_model_id)
# .eval() switches the module to inference mode and returns the module itself, so it can be chained
model = AutoModel.from_pretrained(finetuned_model_id).eval()
model

config.json:   0%|          | 0.00/1.94k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/553M [00:00<?, ?B/s]

FlaubertModel(
  (position_embeddings): Embedding(512, 768)
  (embeddings): Embedding(68729, 768, padding_idx=2)
  (layer_norm_emb): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  (attentions): ModuleList(
    (0-11): 12 x MultiHeadAttention(
      (q_lin): Linear(in_features=768, out_features=768, bias=True)
      (k_lin): Linear(in_features=768, out_features=768, bias=True)
      (v_lin): Linear(in_features=768, out_features=768, bias=True)
      (out_lin): Linear(in_features=768, out_features=768, bias=True)
    )
  )
  (layer_norm1): ModuleList(
    (0-11): 12 x LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  )
  (ffns): ModuleList(
    (0-11): 12 x TransformerFFN(
      (lin1): Linear(in_features=768, out_features=3072, bias=True)
      (lin2): Linear(in_features=3072, out_features=768, bias=True)
      (act): GELUActivation()
    )
  )
  (layer_norm2): ModuleList(
    (0-11): 12 x LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  )
)

In [None]:
# Function to embed a sentence using a Flaubert model
def embed_flaubert(sentence, model, tokenizer):
    """Return the first-token embedding of `sentence` as a NumPy array.

    Args:
        sentence: Text passed to `tokenizer` (padding and truncation are enabled).
        model: Encoder called on the tokenized tensors; its output must expose
            `last_hidden_state`.
        tokenizer: Callable that turns `sentence` into PyTorch tensors
            (`return_tensors='pt'`).

    Returns:
        numpy.ndarray holding position 0 of `last_hidden_state` for every item in
        the batch. The batch axis is kept, so the usual (batch, seq, hidden) hidden
        states give a (batch, hidden) result.
    """
    inputs = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True)
    # Send the inputs to wherever the model lives. Hugging Face models expose `.device`;
    # objects without that attribute are called with the tensors unchanged.
    device = getattr(model, 'device', None)
    if device is not None:
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
    # Inference only: no gradients are needed
    with torch.no_grad():
        outputs = model(**inputs)
    cls_embedding = outputs.last_hidden_state[:, 0, :]
    # Tensor.numpy() only works on CPU tensors, so bring the result back to the CPU first
    return cls_embedding.cpu().numpy()

In [None]:
# Applying Flaubert embedding to sentences in test_data
from tqdm import tqdm
# Registers progress_apply on pandas objects so the long-running apply below shows a progress bar
tqdm.pandas()
# One model forward pass per sentence; each cell of the new column holds the array returned by embed_flaubert
test_data['flaubert_embedding'] = test_data['sentence'].progress_apply(lambda x: embed_flaubert(x, model, tokenizer))
test_data

100%|██████████| 1200/1200 [03:25<00:00,  5.83it/s]


Unnamed: 0,id,sentence,flaubert_embedding
0,0,Nous dûmes nous excuser des propos que nous eû...,"[[-0.3248504, 0.73134893, -1.7739966, -2.85757..."
1,1,Vous ne pouvez pas savoir le plaisir que j'ai ...,"[[-1.5312365, -3.0805593, 0.8174979, -1.892102..."
2,2,"Et, paradoxalement, boire froid n'est pas la b...","[[-0.9070852, -2.0561767, 1.091047, -1.2783799..."
3,3,"Ce n'est pas étonnant, car c'est une saison my...","[[-1.25852, -1.79965, 0.70542955, -1.4450144, ..."
4,4,"Le corps de Golo lui-même, d'une essence aussi...","[[1.0797553, 0.9749571, -0.45053092, -3.803106..."
...,...,...,...
1195,1195,C'est un phénomène qui trouve une accélération...,"[[-0.5692037, -3.5233462, 0.96555924, -2.74066..."
1196,1196,Je vais parler au serveur et voir si on peut d...,"[[-1.0710199, -2.613129, 1.5394348, -1.3612888..."
1197,1197,Il n'était pas comme tant de gens qui par pare...,"[[1.0383526, 0.58540815, -1.2524678, -3.048275..."
1198,1198,Ils deviennent dangereux pour notre économie.,"[[0.5380863, -1.2467924, 0.035965957, -1.45615..."


In [None]:
# Each stored embedding still carries its leading batch axis; collapse it into one flat vector per
# sentence and keep the result in a separate column
test_data['flaubert_embedding_flatten'] = test_data['flaubert_embedding'].map(lambda emb: emb.flatten())

In [None]:
# Load the previously trained classifier from disk
import pickle
# NOTE(review): unpickling executes whatever code the file embeds, so 'svm_clf.pkl' must come from a
# trusted source (presumably this project's own training run -- confirm).
with open('svm_clf.pkl', 'rb') as f:
    # The object stored in the file becomes the classifier used for prediction below
    classifier = pickle.load(f)


In [None]:
# Predicting using the SVM classifier
import numpy as np
# The per-sentence vectors are gathered into a single array (one row per sentence) before predicting
y_pred_svm = classifier.predict(
    np.array(test_data['flaubert_embedding_flatten'].tolist())
)

In [None]:
# Adding predicted difficulty labels to the test_data DataFrame.
# y_pred_svm was computed from test_data's own rows in order, so this positional assignment lines up
# each label with its sentence.
test_data['difficulty'] = y_pred_svm

In [None]:
# Keeping only the two columns needed for the submission ('id' and 'difficulty') and displaying them
test_data = test_data.loc[:, ['id', 'difficulty']]
test_data

Unnamed: 0,id,difficulty
0,0,C2
1,1,A2
2,2,B1
3,3,A2
4,4,C2
...,...,...
1195,1195,B1
1196,1196,A2
1197,1197,C2
1198,1198,B2


In [None]:
# Saving test_data as a CSV file for submission.
# The columns are named explicitly so the file always contains exactly 'id' and 'difficulty', even if
# this cell is run while test_data still carries the embedding columns.
test_data.to_csv('submission.csv', columns=['id', 'difficulty'], index=False)

In [None]:
# Predict the difficulty level of one French sentence from its FlauBERT embedding and a classifier.
def predict_french_difficulty(sentence, model, tokenizer, classifier):
    """Classify a single sentence with `classifier` on top of FlauBERT features.

    Args:
        sentence: The sentence to classify.
        model: Passed through to `embed_flaubert`.
        tokenizer: Passed through to `embed_flaubert`.
        classifier: Object whose `predict` accepts a list of feature vectors.

    Returns:
        The first (and only) prediction returned by `classifier.predict`.
    """
    # The embedding is flattened into a single 1-D feature vector
    features = embed_flaubert(sentence, model, tokenizer).flatten()
    # predict works on batches: wrap the vector in a one-element list, then unwrap the answer
    predictions = classifier.predict([features])
    return predictions[0]

# Example usage: classify one short French sentence with the model, tokenizer and classifier loaded
# earlier in the notebook, and print the predicted label
example_sentence = "Ceci est une phrase simple"
predicted_difficulty = predict_french_difficulty(example_sentence, model, tokenizer, classifier)
print(f"Predicted difficulty level for the sentence: {predicted_difficulty}")

Predicted difficulty level for the sentence: B1
