<a href="https://colab.research.google.com/github/GiammarcoBozzelli/DSML/blob/main/Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.preprocessing import StandardScaler
import numpy as np
import nltk
from nltk.corpus import wordnet
from random import randint
import random
import string


# Load the dataset
url = "https://raw.githubusercontent.com/GiammarcoBozzelli/DSML/main/DATA/training_data.csv"
data = pd.read_csv(url)

In [None]:
# augmented_df = pd.read_csv('https://raw.githubusercontent.com/GiammarcoBozzelli/DSML/main/DATA/augmented_dataset.csv')

In [None]:
# !pip install transformers[torch] accelerate -U

## Distilbert

In [None]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.preprocessing import StandardScaler
import numpy as np
import nltk
from nltk.corpus import wordnet
from random import randint
import random
import string

# Download necessary NLTK data
nltk.download('punkt')

# Load the dataset
url = "https://raw.githubusercontent.com/GiammarcoBozzelli/DSML/main/DATA/training_data.csv"
data = pd.read_csv(url)

# Map labels to integers
label_mapping = {'A1': 0, 'A2': 1, 'B1': 2, 'B2': 3, 'C1': 4, 'C2': 5}
data['difficulty'] = data['difficulty'].map(label_mapping)

# Feature Engineering Functions
def get_sentence_length(sentence):
    return len(sentence)

def get_word_count(sentence):
    words = nltk.word_tokenize(sentence)
    return len(words)

def get_avg_word_length(sentence):
    words = nltk.word_tokenize(sentence)
    return np.mean([len(word) for word in words])

def count_punctuation(sentence):
    return sum([1 for char in sentence if char in string.punctuation])

# Extract Features
data['sentence_length'] = data['sentence'].apply(get_sentence_length)
data['word_count'] = data['sentence'].apply(get_word_count)
data['avg_word_length'] = data['sentence'].apply(get_avg_word_length)
data['punctuation_count'] = data['sentence'].apply(count_punctuation)

# Standardize the features
scaler = StandardScaler()
feature_cols = ['sentence_length', 'word_count', 'avg_word_length', 'punctuation_count']
data[feature_cols] = scaler.fit_transform(data[feature_cols])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(data, data['difficulty'], test_size=0.2, random_state=42)

# Tokenization
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-multilingual-cased')

train_encodings = tokenizer(X_train['sentence'].tolist(), truncation=True, padding=True)
test_encodings = tokenizer(X_test['sentence'].tolist(), truncation=True, padding=True)

class ClassificationDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None, features=None):
        self.encodings = encodings
        self.labels = labels
        self.features = features

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx])
        if self.features is not None:
            item['features'] = torch.tensor(self.features[idx])
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])

# Set the device
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')



# Cross-validation with KFold
kf = KFold(n_splits=3, shuffle=True, random_state=42)
accuracy_list = []
precision_list = []
recall_list = []
f1_list = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train)):
    print(f"Training fold {fold + 1}")

    # Prepare training and validation datasets
    train_data = data.iloc[train_idx]
    val_data = data.iloc[val_idx]
    # Load pre-trained model
    model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-multilingual-cased', num_labels=6).to(device)

    train_encodings_fold = tokenizer(train_data['sentence'].tolist(), truncation=True, padding=True)
    val_encodings_fold = tokenizer(val_data['sentence'].tolist(), truncation=True, padding=True)

    train_dataset = ClassificationDataset(train_encodings_fold, train_data['difficulty'].tolist(), train_data[feature_cols].values)
    val_dataset = ClassificationDataset(val_encodings_fold, val_data['difficulty'].tolist(), val_data[feature_cols].values)

    # Define training arguments
    training_args = TrainingArguments(
        output_dir=f'./results_fold_{fold + 1}',
        num_train_epochs=8,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=16,
        warmup_steps=1000,
        weight_decay=0.0015,
        logging_dir=f'./logs_fold_{fold + 1}',
        logging_steps=20,
        evaluation_strategy="epoch",
        learning_rate=0.00005,
        fp16=True
    )

    # Define trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )

    # Train the model
    trainer.train()

    # Evaluate the model
    predictions, label_ids, metrics = trainer.predict(val_dataset)
    preds = predictions.argmax(-1)

    # Calculate metrics
    accuracy = accuracy_score(val_data['difficulty'], preds)
    precision = precision_score(val_data['difficulty'], preds, average='weighted')
    recall = recall_score(val_data['difficulty'], preds, average='weighted')
    f1 = f1_score(val_data['difficulty'], preds, average='weighted')

    # Store metrics
    accuracy_list.append(accuracy)
    precision_list.append(precision)
    recall_list.append(recall)
    f1_list.append(f1)

    # Print the results
    print(f'Fold {fold + 1} - Accuracy: {accuracy:.2f}')
    print(f'Fold {fold + 1} - Precision: {precision:.2f}')
    print(f'Fold {fold + 1} - Recall: {recall:.2f}')
    print(f'Fold {fold + 1} - F1 Score: {f1:.2f}')

# Print the average results across folds
print(f'Average Accuracy: {np.mean(accuracy_list):.2f}')
print(f'Average Precision: {np.mean(precision_list):.2f}')
print(f'Average Recall: {np.mean(recall_list):.2f}')
print(f'Average F1 Score: {np.mean(f1_list):.2f}')

# Alternatively, print a detailed classification report for the final fold
print(classification_report(val_data['difficulty'], preds, target_names=['A1', 'A2', 'B1', 'B2', 'C1', 'C2']))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Training fold 1


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,1.2996,1.238442
2,1.1893,1.190658
3,1.0663,1.236999
4,0.8021,1.295088
5,0.379,1.553977
6,0.0891,2.290314
7,0.1766,2.572154
8,0.0085,2.728797


Fold 1 - Accuracy: 0.55
Fold 1 - Precision: 0.58
Fold 1 - Recall: 0.55
Fold 1 - F1 Score: 0.55
Training fold 2


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,1.2433,1.292821
2,1.1259,1.190287
3,0.9775,1.269662
4,0.5956,1.421422
5,0.4088,1.986328
6,0.1608,2.612634
7,0.0519,2.999761
8,0.003,3.057564


Fold 2 - Accuracy: 0.50
Fold 2 - Precision: 0.52
Fold 2 - Recall: 0.50
Fold 2 - F1 Score: 0.51
Training fold 3


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,1.3117,1.314149
2,1.2194,1.2164
3,1.0658,1.159325
4,0.8573,1.320384
5,0.4238,1.665996
6,0.2584,2.245732
7,0.0912,2.60675
8,0.0523,2.633737


Fold 3 - Accuracy: 0.54
Fold 3 - Precision: 0.55
Fold 3 - Recall: 0.54
Fold 3 - F1 Score: 0.54
Average Accuracy: 0.53
Average Precision: 0.55
Average Recall: 0.53
Average F1 Score: 0.53
              precision    recall  f1-score   support

          A1       0.66      0.66      0.66       205
          A2       0.50      0.49      0.50       214
          B1       0.45      0.49      0.47       216
          B2       0.53      0.51      0.52       221
          C1       0.49      0.53      0.51       208
          C2       0.66      0.58      0.62       216

    accuracy                           0.54      1280
   macro avg       0.55      0.54      0.55      1280
weighted avg       0.55      0.54      0.54      1280



In [None]:
model.save_pretrained('./distilbert')
tokenizer.save_pretrained('./distilbert')

('./distilbert/tokenizer_config.json',
 './distilbert/special_tokens_map.json',
 './distilbert/vocab.txt',
 './distilbert/added_tokens.json')

In [None]:
!zip -r ./distilbert.zip ./distilbert/

  adding: distilbert/ (stored 0%)
  adding: distilbert/config.json (deflated 52%)
  adding: distilbert/vocab.txt (deflated 45%)
  adding: distilbert/special_tokens_map.json (deflated 42%)
  adding: distilbert/model.safetensors (deflated 7%)
  adding: distilbert/tokenizer_config.json (deflated 75%)


## Camembert

In [None]:
import pandas as pd
import torch
from transformers import CamembertTokenizer,CamembertForSequenceClassification, CamembertModel, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.preprocessing import StandardScaler
import numpy as np
import nltk
from nltk.corpus import wordnet
from random import randint
import random
import string

# Download necessary NLTK data
nltk.download('punkt')

# Load the dataset
url = "https://raw.githubusercontent.com/GiammarcoBozzelli/DSML/main/DATA/training_data.csv"
data = pd.read_csv(url)

# Map labels to integers
label_mapping = {'A1': 0, 'A2': 1, 'B1': 2, 'B2': 3, 'C1': 4, 'C2': 5}
data['difficulty'] = data['difficulty'].map(label_mapping)

# Feature Engineering Functions
def get_sentence_length(sentence):
    return len(sentence)

def get_word_count(sentence):
    words = nltk.word_tokenize(sentence)
    return len(words)

def get_avg_word_length(sentence):
    words = nltk.word_tokenize(sentence)
    return np.mean([len(word) for word in words])

def count_punctuation(sentence):
    return sum([1 for char in sentence if char in string.punctuation])

# Extract Features
data['sentence_length'] = data['sentence'].apply(get_sentence_length)
data['word_count'] = data['sentence'].apply(get_word_count)
data['avg_word_length'] = data['sentence'].apply(get_avg_word_length)
data['punctuation_count'] = data['sentence'].apply(count_punctuation)

# Standardize the features
scaler = StandardScaler()
feature_cols = ['sentence_length', 'word_count', 'avg_word_length', 'punctuation_count']
data[feature_cols] = scaler.fit_transform(data[feature_cols])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(data, data['difficulty'], test_size=0.2, random_state=42)

# Tokenization
tokenizer = CamembertTokenizer.from_pretrained('camembert-base')

train_encodings = tokenizer(X_train['sentence'].tolist(), truncation=True, padding=True)
test_encodings = tokenizer(X_test['sentence'].tolist(), truncation=True, padding=True)

class ClassificationDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None, features=None):
        self.encodings = encodings
        self.labels = labels
        self.features = features

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx])
        if self.features is not None:
            item['features'] = torch.tensor(self.features[idx])
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])

# Set the device
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Cross-validation with KFold
kf = KFold(n_splits=3, shuffle=True, random_state=42)
accuracy_list = []
precision_list = []
recall_list = []
f1_list = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train)):
    print(f"Training fold {fold + 1}")

    model = CamembertForSequenceClassification.from_pretrained('camembert-base', num_labels=6).to(device)

    # Prepare training and validation datasets
    train_data = data.iloc[train_idx]
    val_data = data.iloc[val_idx]

    train_encodings_fold = tokenizer(train_data['sentence'].tolist(), truncation=True, padding=True)
    val_encodings_fold = tokenizer(val_data['sentence'].tolist(), truncation=True, padding=True)

    train_dataset = ClassificationDataset(train_encodings_fold, train_data['difficulty'].tolist(), train_data[feature_cols].values)
    val_dataset = ClassificationDataset(val_encodings_fold, val_data['difficulty'].tolist(), val_data[feature_cols].values)

    # Define training arguments
    training_args = TrainingArguments(
        output_dir=f'./results_fold_{fold + 1}',
        num_train_epochs=7,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=16,
        warmup_steps=1000,
        weight_decay=0.0015,
        logging_dir=f'./logs_fold_{fold + 1}',
        logging_steps=20,
        evaluation_strategy="epoch",
        learning_rate=0.00001,
        fp16=True
    )

    # Define trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )

    # Train the model
    trainer.train()

    # Evaluate the model
    predictions, label_ids, metrics = trainer.predict(val_dataset)
    preds = predictions.argmax(-1)

    # Calculate metrics
    accuracy = accuracy_score(val_data['difficulty'], preds)
    precision = precision_score(val_data['difficulty'], preds, average='weighted')
    recall = recall_score(val_data['difficulty'], preds, average='weighted')
    f1 = f1_score(val_data['difficulty'], preds, average='weighted')

    # Store metrics
    accuracy_list.append(accuracy)
    precision_list.append(precision)
    recall_list.append(recall)
    f1_list.append(f1)

    # Print the results
    print(f'Fold {fold + 1} - Accuracy: {accuracy:.2f}')
    print(f'Fold {fold + 1} - Precision: {precision:.2f}')
    print(f'Fold {fold + 1} - Recall: {recall:.2f}')
    print(f'Fold {fold + 1} - F1 Score: {f1:.2f}')

# Print the average results across folds
print(f'Average Accuracy: {np.mean(accuracy_list):.2f}')
print(f'Average Precision: {np.mean(precision_list):.2f}')
print(f'Average Recall: {np.mean(recall_list):.2f}')
print(f'Average F1 Score: {np.mean(f1_list):.2f}')

# Alternatively, print a detailed classification report for the final fold
print(classification_report(val_data['difficulty'], preds, target_names=['A1', 'A2', 'B1', 'B2', 'C1', 'C2']))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Training fold 1


Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,1.7668,1.755382
2,1.5532,1.485113
3,1.3485,1.272319
4,1.1831,1.187115
5,1.0017,1.096363
6,0.9629,1.053249
7,0.9168,1.051031


Fold 1 - Accuracy: 0.57
Fold 1 - Precision: 0.57
Fold 1 - Recall: 0.57
Fold 1 - F1 Score: 0.56
Training fold 2


Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,1.767,1.769972
2,1.5372,1.496732
3,1.2934,1.270783
4,1.125,1.209515
5,0.922,1.176997
6,0.8726,1.182077
7,0.887,1.186372


Fold 2 - Accuracy: 0.49
Fold 2 - Precision: 0.50
Fold 2 - Recall: 0.49
Fold 2 - F1 Score: 0.48
Training fold 3


Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,1.7691,1.768578
2,1.564,1.5222
3,1.3217,1.313596
4,1.1992,1.166784
5,1.0553,1.098279
6,0.9394,1.097569
7,0.8252,1.109191


Fold 3 - Accuracy: 0.52
Fold 3 - Precision: 0.52
Fold 3 - Recall: 0.52
Fold 3 - F1 Score: 0.51
Average Accuracy: 0.53
Average Precision: 0.53
Average Recall: 0.53
Average F1 Score: 0.52
              precision    recall  f1-score   support

          A1       0.61      0.81      0.70       205
          A2       0.48      0.43      0.45       214
          B1       0.46      0.48      0.47       216
          B2       0.44      0.53      0.48       221
          C1       0.42      0.29      0.35       208
          C2       0.70      0.59      0.64       216

    accuracy                           0.52      1280
   macro avg       0.52      0.52      0.51      1280
weighted avg       0.52      0.52      0.51      1280



In [None]:
model.save_pretrained('./camembert')
tokenizer.save_pretrained('./camembert')

('./camembert/tokenizer_config.json',
 './camembert/special_tokens_map.json',
 './camembert/sentencepiece.bpe.model',
 './camembert/added_tokens.json')

In [None]:
!zip -r ./camembert.zip ./camembert/

  adding: camembert/ (stored 0%)
  adding: camembert/config.json (deflated 55%)
  adding: camembert/sentencepiece.bpe.model (deflated 49%)
  adding: camembert/special_tokens_map.json (deflated 52%)
  adding: camembert/model.safetensors (deflated 12%)
  adding: camembert/tokenizer_config.json (deflated 81%)
  adding: camembert/added_tokens.json (stored 0%)


## Flaubert

In [None]:
import pandas as pd
import torch
from transformers import FlaubertModel, FlaubertTokenizer,FlaubertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.preprocessing import StandardScaler
import numpy as np
import nltk
from nltk.corpus import wordnet
from random import randint
import random
import string

# Download necessary NLTK data
nltk.download('punkt')

# Load the dataset
url = "https://raw.githubusercontent.com/GiammarcoBozzelli/DSML/main/DATA/training_data.csv"
data = pd.read_csv(url)

# Map labels to integers
label_mapping = {'A1': 0, 'A2': 1, 'B1': 2, 'B2': 3, 'C1': 4, 'C2': 5}
data['difficulty'] = data['difficulty'].map(label_mapping)

# Feature Engineering Functions
def get_sentence_length(sentence):
    return len(sentence)

def get_word_count(sentence):
    words = nltk.word_tokenize(sentence)
    return len(words)

def get_avg_word_length(sentence):
    words = nltk.word_tokenize(sentence)
    return np.mean([len(word) for word in words])

def count_punctuation(sentence):
    return sum([1 for char in sentence if char in string.punctuation])

# Extract Features
data['sentence_length'] = data['sentence'].apply(get_sentence_length)
data['word_count'] = data['sentence'].apply(get_word_count)
data['avg_word_length'] = data['sentence'].apply(get_avg_word_length)
data['punctuation_count'] = data['sentence'].apply(count_punctuation)

# Standardize the features
scaler = StandardScaler()
feature_cols = ['sentence_length', 'word_count', 'avg_word_length', 'punctuation_count']
data[feature_cols] = scaler.fit_transform(data[feature_cols])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(data, data['difficulty'], test_size=0.2, random_state=42)

# Tokenization
tokenizer = FlaubertTokenizer.from_pretrained('flaubert/flaubert_base_cased')

train_encodings = tokenizer(X_train['sentence'].tolist(), truncation=True, padding=True)
test_encodings = tokenizer(X_test['sentence'].tolist(), truncation=True, padding=True)

class ClassificationDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None, features=None):
        self.encodings = encodings
        self.labels = labels
        self.features = features

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx])
        if self.features is not None:
            item['features'] = torch.tensor(self.features[idx])
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])

# Set the device
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Load pre-trained model


# Cross-validation with KFold
kf = KFold(n_splits=5, shuffle=True, random_state=42)
accuracy_list = []
precision_list = []
recall_list = []
f1_list = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train)):
    print(f"Training fold {fold + 1}")

    # Prepare training and validation datasets
    train_data = data.iloc[train_idx]
    val_data = data.iloc[val_idx]

    model = FlaubertForSequenceClassification.from_pretrained('flaubert/flaubert_base_cased', num_labels=6).to(device)

    train_encodings_fold = tokenizer(train_data['sentence'].tolist(), truncation=True, padding=True)
    val_encodings_fold = tokenizer(val_data['sentence'].tolist(), truncation=True, padding=True)

    train_dataset = ClassificationDataset(train_encodings_fold, train_data['difficulty'].tolist(), train_data[feature_cols].values)
    val_dataset = ClassificationDataset(val_encodings_fold, val_data['difficulty'].tolist(), val_data[feature_cols].values)

    # Define training arguments
    training_args = TrainingArguments(
        output_dir=f'./results_fold_{fold + 1}',
        num_train_epochs=10,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=16,
        warmup_steps=1000,
        weight_decay=0.0015,
        logging_dir=f'./logs_fold_{fold + 1}',
        logging_steps=20,
        evaluation_strategy="epoch",
        learning_rate=0.00001,
        fp16=True
    )

    # Define trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )

    # Train the model
    trainer.train()

    # Evaluate the model
    predictions, label_ids, metrics = trainer.predict(val_dataset)
    preds = predictions.argmax(-1)

    # Calculate metrics
    accuracy = accuracy_score(val_data['difficulty'], preds)
    precision = precision_score(val_data['difficulty'], preds, average='weighted')
    recall = recall_score(val_data['difficulty'], preds, average='weighted')
    f1 = f1_score(val_data['difficulty'], preds, average='weighted')

    # Store metrics
    accuracy_list.append(accuracy)
    precision_list.append(precision)
    recall_list.append(recall)
    f1_list.append(f1)

    # Print the results
    print(f'Fold {fold + 1} - Accuracy: {accuracy:.2f}')
    print(f'Fold {fold + 1} - Precision: {precision:.2f}')
    print(f'Fold {fold + 1} - Recall: {recall:.2f}')
    print(f'Fold {fold + 1} - F1 Score: {f1:.2f}')

# Print the average results across folds
print(f'Average Accuracy: {np.mean(accuracy_list):.2f}')
print(f'Average Precision: {np.mean(precision_list):.2f}')
print(f'Average Recall: {np.mean(recall_list):.2f}')
print(f'Average F1 Score: {np.mean(f1_list):.2f}')

# Alternatively, print a detailed classification report for the final fold
print(classification_report(val_data['difficulty'], preds, target_names=['A1', 'A2', 'B1', 'B2', 'C1', 'C2']))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Training fold 1


Some weights of FlaubertForSequenceClassification were not initialized from the model checkpoint at flaubert/flaubert_base_cased and are newly initialized: ['sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,1.7722,1.733193
2,1.6221,1.487104
3,1.1467,1.209727
4,1.239,1.133424
5,1.045,1.099472
6,1.027,1.054684
7,0.9398,1.057798
8,0.8835,1.077912
9,0.8036,1.094515
10,0.6519,1.101794


Fold 1 - Accuracy: 0.59
Fold 1 - Precision: 0.59
Fold 1 - Recall: 0.59
Fold 1 - F1 Score: 0.59
Training fold 2


Some weights of FlaubertForSequenceClassification were not initialized from the model checkpoint at flaubert/flaubert_base_cased and are newly initialized: ['sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,1.7433,1.676652
2,1.4282,1.421647
3,1.2097,1.169028
4,1.0917,1.065796
5,0.9508,1.059872
6,1.0056,1.081518
7,0.8699,1.08284
8,0.8394,1.130252
9,0.8365,1.099546
10,0.7157,1.112669


Fold 2 - Accuracy: 0.57
Fold 2 - Precision: 0.58
Fold 2 - Recall: 0.57
Fold 2 - F1 Score: 0.57
Training fold 3


Some weights of FlaubertForSequenceClassification were not initialized from the model checkpoint at flaubert/flaubert_base_cased and are newly initialized: ['sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,1.6937,1.699808
2,1.4933,1.452464
3,1.1922,1.273288
4,1.1461,1.200674
5,0.8204,1.244063
6,0.9447,1.16167
7,0.8354,1.226576
8,0.7247,1.256227
9,0.7793,1.217264
10,0.7161,1.209977


Fold 3 - Accuracy: 0.53
Fold 3 - Precision: 0.53
Fold 3 - Recall: 0.53
Fold 3 - F1 Score: 0.53
Training fold 4


Some weights of FlaubertForSequenceClassification were not initialized from the model checkpoint at flaubert/flaubert_base_cased and are newly initialized: ['sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,1.7757,1.671537
2,1.5538,1.471448
3,1.2434,1.244058
4,1.0481,1.152903
5,1.0846,1.112401
6,0.9028,1.119035
7,0.7403,1.150162
8,0.737,1.175533
9,0.7344,1.180974
10,0.7942,1.2064


Fold 4 - Accuracy: 0.54
Fold 4 - Precision: 0.55
Fold 4 - Recall: 0.54
Fold 4 - F1 Score: 0.54
Training fold 5


Some weights of FlaubertForSequenceClassification were not initialized from the model checkpoint at flaubert/flaubert_base_cased and are newly initialized: ['sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,1.7578,1.695904
2,1.4931,1.445942
3,1.3255,1.207282
4,1.2518,1.158775
5,1.1033,1.062548
6,0.9276,1.106098
7,0.9826,1.096968
8,0.8965,1.128776
9,0.7246,1.151782
10,0.686,1.103826


Fold 5 - Accuracy: 0.56
Fold 5 - Precision: 0.56
Fold 5 - Recall: 0.56
Fold 5 - F1 Score: 0.56
Average Accuracy: 0.56
Average Precision: 0.56
Average Recall: 0.56
Average F1 Score: 0.56
              precision    recall  f1-score   support

          A1       0.72      0.69      0.71       122
          A2       0.54      0.60      0.57       131
          B1       0.48      0.58      0.53       118
          B2       0.57      0.53      0.55       145
          C1       0.43      0.36      0.39       121
          C2       0.62      0.62      0.62       131

    accuracy                           0.56       768
   macro avg       0.56      0.56      0.56       768
weighted avg       0.56      0.56      0.56       768



##TRY WITH AUGMENTED DATA GPT2

In [None]:
augmented_df = pd.read_csv('https://raw.githubusercontent.com/GiammarcoBozzelli/DSML/main/DATA/augmented_dataset.csv')

In [None]:
def remove_duplicate_words(sentence):
  '''
  Function necessary to remove duplicated words in sentences generated by gpt-2 that made no sense.
  '''
  words = sentence.split()
  seen = set()
  unique_words = []
  for word in words:
      if word not in seen:
          unique_words.append(word)
          seen.add(word)
  return ' '.join(unique_words)

# Apply the function to the 'sentence' column
augmented_df.loc[4800:, 'sentence'] = augmented_df.loc[4800:, 'sentence'].apply(remove_duplicate_words)

In [None]:
# Function to fix the 'id' column
def fix_id_column(df):
    # Create a sequence of row numbers starting from 0
    correct_ids = pd.Series(range(len(df)))
    # Replace 'NaN' values and incorrect ids
    df['id'] = correct_ids
    return df

# Fix the 'id' column
augmented_df = fix_id_column(augmented_df)

In [None]:
import nltk
from nltk.corpus import wordnet
from random import randint
import random

nltk.download('wordnet')
nltk.download('omw-1.4')

def synonym_replacement(sentence, n):
    words = sentence.split()
    new_words = words.copy()
    random_word_list = list(set([word for word in words if wordnet.synsets(word)]))
    num_replaced = 0

    for random_word in random_word_list:
        synonyms = set()
        for syn in wordnet.synsets(random_word):
            for lemma in syn.lemmas():
                synonym = lemma.name().replace("_", " ").replace("-", " ").lower()
                synonyms.add(synonym)
        if len(synonyms) > 1:
            synonyms.discard(random_word)
            synonym = list(synonyms)[randint(0, len(synonyms) - 1)]
            new_words = [synonym if word == random_word else word for word in new_words]
            num_replaced += 1
        if num_replaced >= n:  # only replace up to n words
            break

    return ' '.join(new_words)

def shuffle_sentence(sentence):
    words = sentence.split()
    random.shuffle(words)
    return ' '.join(words)


augmented_data = []

# Augment data
for _, row in augmented_df.iterrows():
    original_sentence = row['sentence']
    difficulty = row['difficulty']

    # Generate augmented sentences
    augmented_sentence_synonym = synonym_replacement(original_sentence, 2)
    augmented_sentence_shuffled = shuffle_sentence(original_sentence) #idk


    # Append original and augmented sentences to the new list
    augmented_data.append({'sentence': original_sentence, 'difficulty': difficulty})
    augmented_data.append({'sentence': augmented_sentence_synonym, 'difficulty': difficulty})
    augmented_data.append({'sentence': augmented_sentence_shuffled, 'difficulty': difficulty})


# Create a new DataFrame from the augmented data
data = pd.DataFrame(augmented_data)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [None]:
# Map labels to integers
label_mapping = {'A1': 0, 'A2': 1, 'B1': 2, 'B2': 3, 'C1': 4, 'C2': 5}
data['difficulty'] = data['difficulty'].map(label_mapping)

# Feature Engineering Functions
def get_sentence_length(sentence):
    return len(sentence)

def get_word_count(sentence):
    words = nltk.word_tokenize(sentence)
    return len(words)

def get_avg_word_length(sentence):
    words = nltk.word_tokenize(sentence)
    return np.mean([len(word) for word in words])

def count_punctuation(sentence):
    return sum([1 for char in sentence if char in string.punctuation])

# Extract Features
data['sentence_length'] = data['sentence'].apply(get_sentence_length)
data['word_count'] = data['sentence'].apply(get_word_count)
data['avg_word_length'] = data['sentence'].apply(get_avg_word_length)
data['punctuation_count'] = data['sentence'].apply(count_punctuation)

# Standardize the features
scaler = StandardScaler()
feature_cols = ['sentence_length', 'word_count', 'avg_word_length', 'punctuation_count']
data[feature_cols] = scaler.fit_transform(data[feature_cols])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(data, data['difficulty'], test_size=0.2, random_state=42)

# Tokenization
tokenizer = FlaubertTokenizer.from_pretrained('flaubert/flaubert_base_cased')

train_encodings = tokenizer(X_train['sentence'].tolist(), truncation=True, padding=True)
test_encodings = tokenizer(X_test['sentence'].tolist(), truncation=True, padding=True)

class ClassificationDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None, features=None):
        self.encodings = encodings
        self.labels = labels
        self.features = features

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx])
        if self.features is not None:
            item['features'] = torch.tensor(self.features[idx])
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])

# Set the device
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Load pre-trained model


# Cross-validation with KFold
kf = KFold(n_splits=3, shuffle=True, random_state=42)
accuracy_list = []
precision_list = []
recall_list = []
f1_list = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train)):
    print(f"Training fold {fold + 1}")

    # Prepare training and validation datasets
    train_data = data.iloc[train_idx]
    val_data = data.iloc[val_idx]

    model = FlaubertForSequenceClassification.from_pretrained('flaubert/flaubert_base_cased', num_labels=6).to(device)

    train_encodings_fold = tokenizer(train_data['sentence'].tolist(), truncation=True, padding=True)
    val_encodings_fold = tokenizer(val_data['sentence'].tolist(), truncation=True, padding=True)

    train_dataset = ClassificationDataset(train_encodings_fold, train_data['difficulty'].tolist(), train_data[feature_cols].values)
    val_dataset = ClassificationDataset(val_encodings_fold, val_data['difficulty'].tolist(), val_data[feature_cols].values)

    # Define training arguments
    training_args = TrainingArguments(
        output_dir=f'./results_fold_{fold + 1}',
        num_train_epochs=8,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        warmup_steps=1000,
        weight_decay=0.005,
        logging_dir=f'./logs_fold_{fold + 1}',
        logging_steps=20,
        evaluation_strategy="epoch",
        learning_rate=0.000015,
        fp16=True
    )

    # Define trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )

    # Train the model
    trainer.train()

    # Evaluate the model
    predictions, label_ids, metrics = trainer.predict(val_dataset)
    preds = predictions.argmax(-1)

    # Calculate metrics
    accuracy = accuracy_score(val_data['difficulty'], preds)
    precision = precision_score(val_data['difficulty'], preds, average='weighted')
    recall = recall_score(val_data['difficulty'], preds, average='weighted')
    f1 = f1_score(val_data['difficulty'], preds, average='weighted')

    # Store metrics
    accuracy_list.append(accuracy)
    precision_list.append(precision)
    recall_list.append(recall)
    f1_list.append(f1)

    # Print the results
    print(f'Fold {fold + 1} - Accuracy: {accuracy:.2f}')
    print(f'Fold {fold + 1} - Precision: {precision:.2f}')
    print(f'Fold {fold + 1} - Recall: {recall:.2f}')
    print(f'Fold {fold + 1} - F1 Score: {f1:.2f}')

# Print the average results across folds
print(f'Average Accuracy: {np.mean(accuracy_list):.2f}')
print(f'Average Precision: {np.mean(precision_list):.2f}')
print(f'Average Recall: {np.mean(recall_list):.2f}')
print(f'Average F1 Score: {np.mean(f1_list):.2f}')

# Alternatively, print a detailed classification report for the final fold
print(classification_report(val_data['difficulty'], preds, target_names=['A1', 'A2', 'B1', 'B2', 'C1', 'C2']))

#Extend Df for A2 to C1


In [3]:
filtered_data = data[data['difficulty'].isin(['A2', 'B1', 'B2', 'C1'])]
new_df = pd.concat([data, filtered_data])
new_df = new_df.reset_index(drop=True)
new_df['id'] = new_df.index

In [4]:
new_df

Unnamed: 0,id,sentence,difficulty
0,0,Les coûts kilométriques réels peuvent diverger...,C1
1,1,"Le bleu, c'est ma couleur préférée mais je n'a...",A1
2,2,Le test de niveau en français est sur le site ...,A1
3,3,Est-ce que ton mari est aussi de Boston?,A1
4,4,"Dans les écoles de commerce, dans les couloirs...",B1
...,...,...,...
7975,7975,La réduction du dioxyde de carbone par l'eau n...,B2
7976,7976,Elle connaissait à présent la petitesse des pa...,C1
7977,7977,"C'est pourquoi, il décida de remplacer les hab...",B2
7978,7978,Il avait une de ces pâleurs splendides qui don...,C1


# DistilBERT with augmented df

In [5]:
import pandas as pd
import torch
import accelerate
from transformers import DistilBertModel, DistilBertTokenizer, DistilBertForSequenceClassification, DistilBertTokenizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.preprocessing import StandardScaler
import numpy as np
import nltk
from nltk.corpus import wordnet
import string

# Download necessary NLTK data
nltk.download('punkt')

data = new_df  # Use the provided new_df

# Map labels to integers
label_mapping = {'A1': 0, 'A2': 1, 'B1': 2, 'B2': 3, 'C1': 4, 'C2': 5}
data['difficulty'] = data['difficulty'].map(label_mapping)

# Feature Engineering Functions
def get_sentence_length(sentence):
    return len(sentence)

def get_word_count(sentence):
    words = nltk.word_tokenize(sentence)
    return len(words)

def get_avg_word_length(sentence):
    words = nltk.word_tokenize(sentence)
    return np.mean([len(word) for word in words])

def count_punctuation(sentence):
    return sum([1 for char in sentence if char in string.punctuation])

# Extract Features
data['sentence_length'] = data['sentence'].apply(get_sentence_length)
data['word_count'] = data['sentence'].apply(get_word_count)
data['avg_word_length'] = data['sentence'].apply(get_avg_word_length)
data['punctuation_count'] = data['sentence'].apply(count_punctuation)

# Standardize the features
scaler = StandardScaler()
feature_cols = ['sentence_length', 'word_count', 'avg_word_length', 'punctuation_count']
data[feature_cols] = scaler.fit_transform(data[feature_cols])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(data, data['difficulty'], test_size=0.2, random_state=42)

# Tokenization
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-multilingual-cased')

train_encodings = tokenizer(X_train['sentence'].tolist(), truncation=True, padding=True)
test_encodings = tokenizer(X_test['sentence'].tolist(), truncation=True, padding=True)

class ClassificationDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None, features=None):
        self.encodings = encodings
        self.labels = labels
        self.features = features

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx])
        if self.features is not None:
            item['features'] = torch.tensor(self.features[idx])
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])

# Set the device
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Load pre-trained model
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-multilingual-cased', num_labels=6).to(device)

# Prepare the datasets
train_dataset = ClassificationDataset(train_encodings, y_train.tolist(), X_train[feature_cols].values)
test_dataset = ClassificationDataset(test_encodings, y_test.tolist(), X_test[feature_cols].values)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=1000,
    weight_decay=0.0005,
    logging_dir='./logs',
    logging_steps=20,
    evaluation_strategy="epoch",
    learning_rate=0.000005,
    fp16=True
)

# Define trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

# Train the model
trainer.train()

# Evaluate the model
predictions, label_ids, metrics = trainer.predict(test_dataset)
preds = predictions.argmax(-1)

# Calculate metrics
accuracy = accuracy_score(y_test, preds)
precision = precision_score(y_test, preds, average='weighted')
recall = recall_score(y_test, preds, average='weighted')
f1 = f1_score(y_test, preds, average='weighted')

# Print the results
print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')

# Print a detailed classification report
print(classification_report(y_test, preds, target_names=['A1', 'A2', 'B1', 'B2', 'C1', 'C2']))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,1.3643,1.308142
2,1.1394,1.121433
3,1.0126,0.996074
4,0.7812,1.032601
5,0.8486,0.914891
6,0.6775,0.870037
7,0.6914,0.8435
8,0.5553,0.834816
9,0.6045,0.824424
10,0.3964,0.813546


Accuracy: 0.72
Precision: 0.72
Recall: 0.72
F1 Score: 0.72
              precision    recall  f1-score   support

          A1       0.71      0.57      0.63       169
          A2       0.70      0.77      0.74       318
          B1       0.74      0.78      0.76       329
          B2       0.73      0.80      0.76       310
          C1       0.75      0.76      0.76       312
          C2       0.65      0.45      0.53       158

    accuracy                           0.72      1596
   macro avg       0.71      0.69      0.70      1596
weighted avg       0.72      0.72      0.72      1596



In [6]:
model.save_pretrained('./distilbert')
tokenizer.save_pretrained('./distilbert')

('./distilbert/tokenizer_config.json',
 './distilbert/special_tokens_map.json',
 './distilbert/vocab.txt',
 './distilbert/added_tokens.json')

In [7]:
!zip -r ./distilbert.zip ./distilbert/

  adding: distilbert/ (stored 0%)
  adding: distilbert/tokenizer_config.json (deflated 75%)
  adding: distilbert/vocab.txt (deflated 45%)
  adding: distilbert/model.safetensors (deflated 7%)
  adding: distilbert/special_tokens_map.json (deflated 42%)
  adding: distilbert/config.json (deflated 52%)


# CamemBERT with augmented df

In [None]:
import pandas as pd
import torch
from transformers import CamembertModel, CamembertTokenizer, CamembertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.preprocessing import StandardScaler
import numpy as np
import nltk
from nltk.corpus import wordnet
import string

# Download necessary NLTK data
nltk.download('punkt')

data = new_df  # Use the provided new_df

# Map labels to integers
label_mapping = {'A1': 0, 'A2': 1, 'B1': 2, 'B2': 3, 'C1': 4, 'C2': 5}
data['difficulty'] = data['difficulty'].map(label_mapping)

# Feature Engineering Functions
def get_sentence_length(sentence):
    return len(sentence)

def get_word_count(sentence):
    words = nltk.word_tokenize(sentence)
    return len(words)

def get_avg_word_length(sentence):
    words = nltk.word_tokenize(sentence)
    return np.mean([len(word) for word in words])

def count_punctuation(sentence):
    return sum([1 for char in sentence if char in string.punctuation])

# Extract Features
data['sentence_length'] = data['sentence'].apply(get_sentence_length)
data['word_count'] = data['sentence'].apply(get_word_count)
data['avg_word_length'] = data['sentence'].apply(get_avg_word_length)
data['punctuation_count'] = data['sentence'].apply(count_punctuation)

# Standardize the features
scaler = StandardScaler()
feature_cols = ['sentence_length', 'word_count', 'avg_word_length', 'punctuation_count']
data[feature_cols] = scaler.fit_transform(data[feature_cols])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(data, data['difficulty'], test_size=0.2, random_state=42)

# Tokenization
tokenizer = CamembertTokenizer.from_pretrained('camembert-base')

train_encodings = tokenizer(X_train['sentence'].tolist(), truncation=True, padding=True)
test_encodings = tokenizer(X_test['sentence'].tolist(), truncation=True, padding=True)

class ClassificationDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None, features=None):
        self.encodings = encodings
        self.labels = labels
        self.features = features

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx])
        if self.features is not None:
            item['features'] = torch.tensor(self.features[idx])
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])

# Set the device
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Load pre-trained model
model = CamembertForSequenceClassification.from_pretrained('camembert-base', num_labels=6).to(device)

# Prepare the datasets
train_dataset = ClassificationDataset(train_encodings, y_train.tolist(), X_train[feature_cols].values)
test_dataset = ClassificationDataset(test_encodings, y_test.tolist(), X_test[feature_cols].values)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=1000,
    weight_decay=0.0005,
    logging_dir='./logs',
    logging_steps=20,
    evaluation_strategy="epoch",
    learning_rate=0.000005,
    fp16=True
)

# Define trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

# Train the model
trainer.train()

# Evaluate the model
predictions, label_ids, metrics = trainer.predict(test_dataset)
preds = predictions.argmax(-1)

# Calculate metrics
accuracy = accuracy_score(y_test, preds)
precision = precision_score(y_test, preds, average='weighted')
recall = recall_score(y_test, preds, average='weighted')
f1 = f1_score(y_test, preds, average='weighted')

# Print the results
print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')

# Print a detailed classification report
print(classification_report(y_test, preds, target_names=['A1', 'A2', 'B1', 'B2', 'C1', 'C2']))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infer

Epoch,Training Loss,Validation Loss
1,1.6508,1.601327
2,1.301,1.238415
3,1.1384,1.101928
4,0.9082,1.12226
5,0.9776,0.982874
6,0.7885,0.983709
7,0.7985,0.998185
8,0.7962,1.008876
9,0.6925,0.955878
10,0.6308,0.954595


Accuracy: 0.63
Precision: 0.64
Recall: 0.63
F1 Score: 0.61
              precision    recall  f1-score   support

          A1       0.73      0.64      0.68       169
          A2       0.65      0.80      0.71       318
          B1       0.67      0.69      0.68       329
          B2       0.54      0.73      0.62       310
          C1       0.61      0.50      0.55       312
          C2       0.75      0.19      0.30       158

    accuracy                           0.63      1596
   macro avg       0.66      0.59      0.59      1596
weighted avg       0.64      0.63      0.61      1596



# FlauBERT with augmented df

In [None]:
import pandas as pd
import torch
from transformers import FlaubertModel, FlaubertTokenizer, FlaubertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.preprocessing import StandardScaler
import numpy as np
import nltk
from nltk.corpus import wordnet
import string

# Download necessary NLTK data
nltk.download('punkt')

data = new_df  # Use the provided new_df

# Map labels to integers
label_mapping = {'A1': 0, 'A2': 1, 'B1': 2, 'B2': 3, 'C1': 4, 'C2': 5}
data['difficulty'] = data['difficulty'].map(label_mapping)

# Feature Engineering Functions
def get_sentence_length(sentence):
    return len(sentence)

def get_word_count(sentence):
    words = nltk.word_tokenize(sentence)
    return len(words)

def get_avg_word_length(sentence):
    words = nltk.word_tokenize(sentence)
    return np.mean([len(word) for word in words])

def count_punctuation(sentence):
    return sum([1 for char in sentence if char in string.punctuation])

# Extract Features
data['sentence_length'] = data['sentence'].apply(get_sentence_length)
data['word_count'] = data['sentence'].apply(get_word_count)
data['avg_word_length'] = data['sentence'].apply(get_avg_word_length)
data['punctuation_count'] = data['sentence'].apply(count_punctuation)

# Standardize the features
scaler = StandardScaler()
feature_cols = ['sentence_length', 'word_count', 'avg_word_length', 'punctuation_count']
data[feature_cols] = scaler.fit_transform(data[feature_cols])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(data, data['difficulty'], test_size=0.2, random_state=42)

# Tokenization
tokenizer = FlaubertTokenizer.from_pretrained('flaubert/flaubert_base_cased')

train_encodings = tokenizer(X_train['sentence'].tolist(), truncation=True, padding=True)
test_encodings = tokenizer(X_test['sentence'].tolist(), truncation=True, padding=True)

class ClassificationDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None, features=None):
        self.encodings = encodings
        self.labels = labels
        self.features = features

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx])
        if self.features is not None:
            item['features'] = torch.tensor(self.features[idx])
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])

# Set the device
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Load pre-trained model
model = FlaubertForSequenceClassification.from_pretrained('flaubert/flaubert_base_cased', num_labels=6).to(device)

# Prepare the datasets
train_dataset = ClassificationDataset(train_encodings, y_train.tolist(), X_train[feature_cols].values)
test_dataset = ClassificationDataset(test_encodings, y_test.tolist(), X_test[feature_cols].values)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=16,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=1000,
    weight_decay=0.0005,
    logging_dir='./logs',
    logging_steps=20,
    evaluation_strategy="epoch",
    learning_rate=0.000005,
    fp16=True
)

# Define trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

# Train the model
trainer.train()

# Evaluate the model
predictions, label_ids, metrics = trainer.predict(test_dataset)
preds = predictions.argmax(-1)

# Calculate metrics
accuracy = accuracy_score(y_test, preds)
precision = precision_score(y_test, preds, average='weighted')
recall = recall_score(y_test, preds, average='weighted')
f1 = f1_score(y_test, preds, average='weighted')

# Print the results
print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')

# Print a detailed classification report
print(classification_report(y_test, preds, target_names=['A1', 'A2', 'B1', 'B2', 'C1', 'C2']))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of FlaubertForSequenceClassification were not initialized from the model checkpoint at flaubert/flaubert_base_cased and are newly initialized: ['sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,1.588,1.525918
2,1.2739,1.153823
3,1.0686,1.035337
4,0.7769,0.95785
5,0.7809,0.938287
6,0.5852,0.891768
7,0.5986,0.855803
8,0.4364,0.776548
9,0.3513,0.871536
10,0.2638,0.743021


Epoch,Training Loss,Validation Loss
1,1.588,1.525918
2,1.2739,1.153823
3,1.0686,1.035337
4,0.7769,0.95785
5,0.7809,0.938287
6,0.5852,0.891768
7,0.5986,0.855803
8,0.4364,0.776548
9,0.3513,0.871536
10,0.2638,0.743021


Accuracy: 0.82
Precision: 0.83
Recall: 0.82
F1 Score: 0.82
              precision    recall  f1-score   support

          A1       0.75      0.67      0.71       169
          A2       0.75      0.89      0.81       318
          B1       0.86      0.83      0.85       329
          B2       0.86      0.86      0.86       310
          C1       0.87      0.89      0.88       312
          C2       0.85      0.62      0.72       158

    accuracy                           0.82      1596
   macro avg       0.82      0.79      0.80      1596
weighted avg       0.83      0.82      0.82      1596



## Pipeline for final prediction

In [None]:
from transformers import pipeline, FlaubertForSequenceClassification, FlaubertTokenizer
unlabelled_url = "https://raw.githubusercontent.com/GiammarcoBozzelli/DSML/main/DATA/unlabelled_test_data.csv"
unlabelled_data = pd.read_csv(unlabelled_url)

# Load the saved CamemBERT model and tokenizer
distilbert_path = './distilbert'
distilbert_model = DistilBertForSequenceClassification.from_pretrained(distilbert_path)
distilbert_tokenizer = DistilBertTokenizer.from_pretrained(distilbert_path)

# Load the saved CamemBERT model and tokenizer
camembert_path = './camembert'
camembert_model = CamembertForSequenceClassification.from_pretrained(camembert_path)
camembert_tokenizer = CamembertTokenizer.from_pretrained(camembert_path)

# Load the saved Flaubert model and tokenizer
flaubert_path = './flaubert'
flaubert_model = FlaubertForSequenceClassification.from_pretrained(flaubert_path)
flaubert_tokenizer = FlaubertTokenizer.from_pretrained(flaubert_path)

# Create prediction pipelines to get probabilities
device = 0 if torch.cuda.is_available() else -1
distilbert_classifier = pipeline('text-classification', model=distilbert_model, tokenizer=distilbert_tokenizer, framework='pt', device=device, return_all_scores=True)
camembert_classifier = pipeline('text-classification', model=camembert_model, tokenizer=camembert_tokenizer, framework='pt', device=device, return_all_scores=True)
flaubert_classifier = pipeline('text-classification', model=flaubert_model, tokenizer=flaubert_tokenizer, framework='pt', device=device, return_all_scores=True)

# Predict probabilities for the unlabelled data using both models
distilbert_probs = distilbert_classifier(unlabelled_data['sentence'].tolist())
camembert_probs = camembert_classifier(unlabelled_data['sentence'].tolist())
flaubert_probs = flaubert_classifier(unlabelled_data['sentence'].tolist())

# Convert the predictions to numpy arrays
distilbertprobs_array = np.array([[prob['score'] for prob in probs] for probs in distilbert_probs])
camembert_probs_array = np.array([[prob['score'] for prob in probs] for probs in camembert_probs])
flaubert_probs_array = np.array([[prob['score'] for prob in probs] for probs in flaubert_probs])

# Combine predictions using soft voting (average probabilities)
average_probs = (distilbertprobs_array + camembert_probs_array + flaubert_probs_array) / 3
# average_probs = ( camembert_probs_array + flaubert_probs_array) / 2
final_preds = np.argmax(average_probs, axis=1)
S
inverse_label_mapping = {v: k for k, v in label_mapping.items()}
final_labels = [inverse_label_mapping[pred] for pred in final_preds]

# Create a DataFrame to export
results_df = pd.DataFrame({
    'id': unlabelled_data['id'],
    'difficulty': final_labels
})

# Save the results to a new CSV file
results_df.to_csv('s.csv', index=False)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
