In [4]:
# Importing necessary libraries: pandas, torch, and Hugging Face Transformers
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel

In [7]:
# Load the unlabelled test set from its CSV file into a DataFrame
test_data = pd.read_csv(filepath_or_buffer='unlabelled_test_data.csv')


In [8]:
# Display the freshly loaded DataFrame as a quick check of its columns
# (the rendered output shows `id` and `sentence`) and its size.
test_data

Unnamed: 0,id,sentence
0,0,Nous dûmes nous excuser des propos que nous eû...
1,1,Vous ne pouvez pas savoir le plaisir que j'ai ...
2,2,"Et, paradoxalement, boire froid n'est pas la b..."
3,3,"Ce n'est pas étonnant, car c'est une saison my..."
4,4,"Le corps de Golo lui-même, d'une essence aussi..."
...,...,...
1195,1195,C'est un phénomène qui trouve une accélération...
1196,1196,Je vais parler au serveur et voir si on peut d...
1197,1197,Il n'était pas comme tant de gens qui par pare...
1198,1198,Ils deviennent dangereux pour notre économie.


In [5]:
# Load the tokenizer and encoder from the fine-tuned Flaubert difficulty checkpoint.
# AutoModel builds the bare FlaubertModel, so the checkpoint's `sequence_summary`
# weights are reported as unused in the warning printed by this cell; the
# embedding function defined later only reads the encoder's last hidden state.
finetuned_model_id = 'MokaExpress/flaubert-french-difficulty'
tokenizer = AutoTokenizer.from_pretrained(finetuned_model_id)
# .eval() returns the module itself, so switching to inference mode can be chained.
model = AutoModel.from_pretrained(finetuned_model_id).eval()
model

Some weights of the model checkpoint at MokaExpress/flaubert-french-difficulty were not used when initializing FlaubertModel: ['sequence_summary.summary.bias', 'sequence_summary.summary.weight']
- This IS expected if you are initializing FlaubertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing FlaubertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


FlaubertModel(
  (position_embeddings): Embedding(512, 768)
  (embeddings): Embedding(68729, 768, padding_idx=2)
  (layer_norm_emb): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  (attentions): ModuleList(
    (0-11): 12 x MultiHeadAttention(
      (q_lin): Linear(in_features=768, out_features=768, bias=True)
      (k_lin): Linear(in_features=768, out_features=768, bias=True)
      (v_lin): Linear(in_features=768, out_features=768, bias=True)
      (out_lin): Linear(in_features=768, out_features=768, bias=True)
    )
  )
  (layer_norm1): ModuleList(
    (0-11): 12 x LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  )
  (ffns): ModuleList(
    (0-11): 12 x TransformerFFN(
      (lin1): Linear(in_features=768, out_features=3072, bias=True)
      (lin2): Linear(in_features=3072, out_features=768, bias=True)
      (act): GELUActivation()
    )
  )
  (layer_norm2): ModuleList(
    (0-11): 12 x LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  )
)

In [6]:
def embed_flaubert(sentence, model, tokenizer):
    """Embed text with a Flaubert encoder and return its first-token hidden state.

    Args:
        sentence: Text passed straight to ``tokenizer`` (the notebook passes
            one string at a time).
        model: Callable encoder whose output exposes ``last_hidden_state``.
            If it has a ``device`` attribute, the tokenized inputs are moved
            to that device before the forward pass.
        tokenizer: Callable that returns a mapping of PyTorch tensors when
            called with ``return_tensors='pt'``.

    Returns:
        A 2-D ``numpy.ndarray`` with one row per tokenized sequence, taken
        from position 0 of the model's last hidden state.
    """
    inputs = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True)

    # Keep the inputs on the same device as the model so the function also
    # works after the model has been moved to an accelerator.
    device = getattr(model, 'device', None)
    if device is not None:
        inputs = {
            key: value.to(device) if hasattr(value, 'to') else value
            for key, value in inputs.items()
        }

    # Inference only: skip gradient tracking for the forward pass.
    with torch.no_grad():
        outputs = model(**inputs)

    # Position 0 along the sequence axis is used as the sentence-level vector.
    cls_embedding = outputs.last_hidden_state[:, 0, :]

    # Tensor.numpy() only works on CPU tensors; .cpu() is a no-op when the
    # tensor already lives there.
    return cls_embedding.detach().cpu().numpy()

In [10]:
# Compute an embedding for every sentence, showing a tqdm progress bar
from tqdm import tqdm
tqdm.pandas()  # registers the progress_* methods on pandas objects
test_data['flaubert_embedding'] = test_data['sentence'].progress_apply(
    lambda text: embed_flaubert(text, model, tokenizer)
)
test_data

100%|██████████| 1200/1200 [01:00<00:00, 19.87it/s]


Unnamed: 0,id,sentence,flaubert_embedding
0,0,Nous dûmes nous excuser des propos que nous eû...,"[[-0.32488617, 0.73135746, -1.7740397, -2.8575..."
1,1,Vous ne pouvez pas savoir le plaisir que j'ai ...,"[[-1.5312389, -3.0805702, 0.81750405, -1.89209..."
2,2,"Et, paradoxalement, boire froid n'est pas la b...","[[-0.9071048, -2.0562153, 1.0910534, -1.278390..."
3,3,"Ce n'est pas étonnant, car c'est une saison my...","[[-1.2585075, -1.7996714, 0.7054241, -1.445060..."
4,4,"Le corps de Golo lui-même, d'une essence aussi...","[[1.0797147, 0.9749886, -0.45064196, -3.803045..."
...,...,...,...
1195,1195,C'est un phénomène qui trouve une accélération...,"[[-0.5692383, -3.5234427, 0.9656427, -2.740808..."
1196,1196,Je vais parler au serveur et voir si on peut d...,"[[-1.0711411, -2.613142, 1.5394385, -1.3612442..."
1197,1197,Il n'était pas comme tant de gens qui par pare...,"[[1.0383626, 0.5854192, -1.252559, -3.0483758,..."
1198,1198,Ils deviennent dangereux pour notre économie.,"[[0.53811425, -1.2468678, 0.036003873, -1.4562..."


In [12]:
# Store a flattened (1-D) copy of each embedding array in a new column
test_data['flaubert_embedding_flatten'] = test_data['flaubert_embedding'].apply(
    lambda emb: emb.flatten()
)


In [11]:
# Restore the pickled classifier from 'svm_clf.pkl'.
# NOTE(review): pickle.load can execute arbitrary code while unpickling --
# only load this file if it comes from a trusted source.
import pickle
with open('svm_clf.pkl', mode='rb') as clf_file:
    classifier = pickle.load(clf_file)


In [13]:
# Gather the flattened embeddings into one array (a row per sentence)
# and run the loaded classifier on it
import numpy as np
feature_matrix = np.array(test_data['flaubert_embedding_flatten'].values.tolist())
y_pred_svm = classifier.predict(feature_matrix)

In [17]:
# Attach the classifier's predictions to test_data as the 'difficulty' column
# (a later cell's output shows labels such as A2, B1, B2 and C2).
test_data['difficulty'] = y_pred_svm

In [20]:
# Keep only the 'id' and 'difficulty' columns, then show the result.
# Note: this rebinds test_data, so the sentence and embedding columns are dropped.
test_data = test_data.loc[:, ['id', 'difficulty']]
test_data

Unnamed: 0,id,difficulty
0,0,C2
1,1,A2
2,2,B1
3,3,A2
4,4,C2
...,...,...
1195,1195,B1
1196,1196,A2
1197,1197,C2
1198,1198,B2


In [21]:
# Write the predictions to 'submission.csv', leaving out the DataFrame index
test_data.to_csv(path_or_buf='submission.csv', index=False)