## Loading data



In [3]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns

# Lecture du dataframe
def load_data(path=r'./train.txt'):
    df = pd.read_csv(path, header=None, sep=":::",engine='python')
    df['text']=df[0].str.extract(r'^\([A-Z]{3}\) (.*)$')
    df['category']=df[0].str.extract(r'^\(([A-Z]{3})\)')
    df = df.drop([0], axis=1)
    return df
    
df = load_data()
df.tail()

Unnamed: 0,text,category
9895,"Nowadays , more and more people go abroad , no...",CHI
9896,In accomplishing something that is risky comes...,KOR
9897,"At the beginning of the 21st century , the inc...",SPA
9898,The number of cars in use across the world has...,HIN
9899,Many people think it is betters to have borad ...,CHI


## Split data

In [4]:
def get_splitted_df(df, x_col='text', y_col='category', test_size=0.2, random_state=42, verbose=True):
  np.random.seed(random_state)
  df_train, df_dev, df_test = np.split(df.sample(frac=1, random_state=random_state),
                                      [int((1-test_size)*len(df)), int((1-test_size/2)*len(df))])

  X_train = df_train[x_col]
  X_dev = df_dev[x_col]
  X_test = df_test[x_col]
  y_train = df_train[y_col]
  y_dev = df_dev[y_col]
  y_test = df_test[y_col]

  if verbose:
    # Affichage du shape de chaque variable
    print("Shape de X_train :", X_train.shape)
    print("Shape de y_train :", y_train.shape)
    print("Shape de X_dev :", X_dev.shape)
    print("Shape de y_dev :", y_dev.shape)
    print("Shape de X_test :", X_test.shape)
    print("Shape de y_test :", y_test.shape)

    # Final data check
    print("\ny_train split:\n", y_train.value_counts())
    print("\ny_dev split:\n", y_dev.value_counts())
    print("\ny_test split:\n", y_test.value_counts())

  return X_train, X_dev, X_test, y_train, y_dev, y_test

# load data
df = load_data()

# variables et target
#X = df['text']
#y = df['category']

X_train, X_dev, X_test, y_train, y_dev, y_test = get_splitted_df(df, x_col='text', y_col='category', test_size=0.2, random_state=42, verbose=True)

set_X_train = set(X_train)
set_X_test = set(X_test)
print("\nlength of intersection set_X_train X set_X_test =", len(set_X_train.intersection(set_X_test)))

Shape de X_train : (7920,)
Shape de y_train : (7920,)
Shape de X_dev : (990,)
Shape de y_dev : (990,)
Shape de X_test : (990,)
Shape de y_test : (990,)

y_train split:
 KOR    728
ITA    726
CHI    726
SPA    722
GER    721
JPN    721
TEL    721
TUR    718
ARA    717
FRE    711
HIN    709
Name: category, dtype: int64

y_dev split:
 HIN    96
FRE    94
JPN    94
GER    94
SPA    93
ARA    91
TUR    91
ITA    86
TEL    84
KOR    84
CHI    83
Name: category, dtype: int64

y_test split:
 HIN    95
TEL    95
FRE    95
ARA    92
CHI    91
TUR    91
ITA    88
KOR    88
SPA    85
JPN    85
GER    85
Name: category, dtype: int64

length of intersection set_X_train X set_X_test = 0


### Stratified shuffle split

In [5]:
"""
from sklearn.model_selection import StratifiedShuffleSplit

def split_data(X_to_split, y_to_split, test_size=0.2, random_state=42):
    # Séparation des données en train et test avec Stratified Shuffle Split
    sss = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state)
    train_index, test_index = next(sss.split(X_to_split, y_to_split))
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]
    ## debug
    print("train_index>>>", train_index)
    print("test_index>>>", test_index)
    return X_train, X_test, y_train, y_test

def get_train_dev_test_data(X_to_split, y_to_split, test_size=0.2, random_state=42, verbose=True):
  # Split data to Train, Test (step 1)
  print("\n----- step 1 -----")
  X_train, X_test, y_train, y_test = split_data(X, y, test_size, random_state)
  # Split test data to Test, Dev (step 2)
  print("\n----- step 2 -----")
  X_test, X_dev, y_test, y_dev = split_data(X_test, y_test, test_size=0.5, random_state=42)

  if verbose:
    # Affichage du shape de chaque variable
    print("Shape de X_train :", X_train.shape)
    print("Shape de y_train :", y_train.shape)
    print("Shape de X_dev :", X_dev.shape)
    print("Shape de y_dev :", y_dev.shape)
    print("Shape de X_test :", X_test.shape)
    print("Shape de y_test :", y_test.shape)

    # Final data check
    print("\ny_train split:\n", y_train.value_counts())
    print("\ny_dev split:\n", y_dev.value_counts())
    print("\ny_test split:\n", y_test.value_counts())
  
  return X_train, X_dev, X_test, y_train, y_dev, y_test

# load data
df = load_data()

# variables et target
X = df['text']
y = df['category']

# split
X_train, X_dev, X_test, y_train, y_dev, y_test = get_train_dev_test_data(X, y, test_size=0.2, random_state=42, verbose=True)

set_X_train = set(X_train)
set_X_test = set(X_test)
print("length of intersection set_X_train X set_X_test =", len(set_X_train.intersection(set_X_test)))
"""

'\nfrom sklearn.model_selection import StratifiedShuffleSplit\n\ndef split_data(X_to_split, y_to_split, test_size=0.2, random_state=42):\n    # Séparation des données en train et test avec Stratified Shuffle Split\n    sss = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state)\n    train_index, test_index = next(sss.split(X_to_split, y_to_split))\n    X_train, y_train = X.iloc[train_index], y.iloc[train_index]\n    X_test, y_test = X.iloc[test_index], y.iloc[test_index]\n    ## debug\n    print("train_index>>>", train_index)\n    print("test_index>>>", test_index)\n    return X_train, X_test, y_train, y_test\n\ndef get_train_dev_test_data(X_to_split, y_to_split, test_size=0.2, random_state=42, verbose=True):\n  # Split data to Train, Test (step 1)\n  print("\n----- step 1 -----")\n  X_train, X_test, y_train, y_test = split_data(X, y, test_size, random_state)\n  # Split test data to Test, Dev (step 2)\n  print("\n----- step 2 -----")\n  X_test, X_dev, y_tes

## Label encoder

In [6]:
from sklearn.preprocessing import LabelEncoder

def get_label_encoder_for(y_to_encode):
  label_encoder = LabelEncoder()
  label_encoder.fit(y_to_encode) # y = df['category'] or df['group'] ...

  # to inverse transform we use :
  #label_encoder.inverse_transform([0, 1, 2, 3, 4])
  d = zip([i for i in range(len(y_to_encode.unique()))], label_encoder.inverse_transform([i for i in range(len(y_to_encode.unique()))]))
  class_labels_dict = {}
  for k,v in d:
      class_labels_dict[k] = v
  
  return label_encoder, class_labels_dict

label_encoder, class_labels_dict = get_label_encoder_for(df['category'])
print("Labelled classes : ", class_labels_dict)

Labelled classes :  {0: 'ARA', 1: 'CHI', 2: 'FRE', 3: 'GER', 4: 'HIN', 5: 'ITA', 6: 'JPN', 7: 'KOR', 8: 'SPA', 9: 'TEL', 10: 'TUR'}


## Fonctions d'évaluation

In [7]:

def print_metrics(y_test, y_pred):
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
    # micro : Calculate metrics globally by counting the total true positives, false negatives and false positives.
    # macro : faire le calcul étiquette par étiquette, puis faire la moyenne
    labels_list=['ARA','CHI','FRE','GER','HIN','ITA','JPN','KOR','SPA','TEL','TUR']
    average_param = "macro"
    print("\nAccuracy ", accuracy_score(y_test, y_pred))

    print("\n- Precision -", average_param)
    tmp = "\n"+str(list(zip(labels_list, precision_score(y_test, y_pred, average=None, zero_division=1))))
    print("precision by language :", tmp.replace("),",")\n").replace("[",'').replace("]",''))
    print("\nprecision score ", precision_score(y_test, y_pred, average=average_param, zero_division=1))

    print("\n- Recall -", average_param)
    tmp = "\n"+str(list(zip(labels_list, recall_score(y_test, y_pred, average=None, zero_division=1))))
    print("recall by language ", tmp.replace("),",")\n").replace("[",'').replace("]",''))
    print("\nrecall score ",   recall_score(y_test, y_pred, average=average_param, zero_division=1))
    tmp = "\n"+str(list(zip(labels_list, f1_score(y_test, y_pred, average=None, zero_division=1))))
    
    print("\n- F1 -", average_param)
    print("f1 by language ", tmp.replace("),",")\n").replace("[",'').replace("]",''))
    print("\nf1 score ", f1_score(y_test, y_pred, average=average_param, zero_division=1))


def draw_confusion_matrix(y_test, y_pred, langs_label=['ARA',\
 'CHI',
 'FRE',
 'GER',
 'HIN',
 'ITA',
 'JPN',
 'KOR',
 'SPA',
 'TEL',
 'TUR'], 
                          class_labels_dict=class_labels_dict):
  
  from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
  import plotly.express as px
  lang2num = {v: k for k, v in class_labels_dict.items()}
  print(">>>", lang2num)
  print(">>", langs_label)
  lang_list =  [ lang2num[k] for k in langs_label] 
  data = confusion_matrix(y_true=y_test, y_pred=y_pred, labels=lang_list)
  fig = px.imshow(data,
                  labels=dict(x="Languages y_test", y="Languages y_pred", color="confusion"),
                  x=langs_label,
                  y=langs_label,
                  text_auto=True,
                  aspect="auto")
  fig.update_xaxes(side="top")
  fig.show()


def evaluation(y_test, y_pred, labels_list=['ARA',\
'CHI',
'FRE',
'GER',
'HIN',
'ITA',
'JPN',
'KOR',
'SPA',
'TEL',
'TUR'],
              class_labels_dict:dict=class_labels_dict, 
              exclude:list=[""]):
  print("\n-- EVALUATION --\n")
  draw_confusion_matrix(y_test, y_pred, langs_label=labels_list, class_labels_dict=class_labels_dict)

  print_metrics(y_test, y_pred)


## Fonction de test de plusiurs modèles et plusieurs méthodes de vectorisation du texte


In [8]:
# Fonction de test de plusiurs modèles et plusieurs méthodes de vectorisation du texte
def train_and_evaluate_model(model_name, vectorizer_type, X_train, X_dev, X_test, y_train, y_dev, y_test, label_encoder, labels_list=['ARA',\
'CHI',
'FRE',
'GER',
'HIN',
'ITA',
'JPN',
'KOR',
'SPA',
'TEL',
'TUR'],
              class_labels_dict:dict=class_labels_dict):
  from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
  from sklearn.linear_model import LogisticRegression
  from sklearn.svm import SVC
  from sklearn.metrics import accuracy_score

  # Vectorisation des données textuelles
  if vectorizer_type == 'count_vectorizer':
      vectorizer = CountVectorizer()
  elif vectorizer_type == 'tfidf_vectorizer':
      vectorizer = TfidfVectorizer()
  else:
      raise ValueError("Vectorizer type should be 'count_vectorizer' or 'tfidf_vectorizer'.")
      
  X_train_vectors = vectorizer.fit_transform(X_train)
  X_dev_vectors = vectorizer.transform(X_dev)
  X_test_vectors = vectorizer.transform(X_test)

  # Transformer les catégories en nombres entiers avec label encoder déjà entrainé 
  y_train_labels = label_encoder.transform(y_train)
  y_dev_labels = label_encoder.transform(y_dev)
  y_test_labels = label_encoder.transform(y_test)

  # Entraîner le modèle correspondant au nom spécifié
  if model_name == 'logistic_regression':
      clf = LogisticRegression()
  elif model_name == 'svm':
      clf = SVC()
  else:
      raise ValueError("Model name should be 'logistic_regression' or 'svm'.")
      
  clf.fit(X_train_vectors, y_train_labels)

  # Prédiction sur les données de développement (dev)
  y_pred_dev = clf.predict(X_dev_vectors)

  # Calcul de l'accuracy sur les données de développement (dev)
  accuracy_dev = accuracy_score(y_dev_labels, y_pred_dev)
  print(f"Accuracy sur les données de dev avec {vectorizer_type} et {model_name}: {accuracy_dev:.3f}")

  # Prédiction sur les données de test
  y_pred_test = clf.predict(X_test_vectors)

  # Calcul de l'accuracy sur les données de test
  accuracy_test = accuracy_score(y_test_labels, y_pred_test)
  print(f"Accuracy sur les données de test avec {vectorizer_type} et {model_name}: {accuracy_test:.3f}")

  # Evaluation function
  evaluation(y_test_labels, y_pred_test, labels_list, class_labels_dict)

  # return the trained model
  return clf, vectorizer


## Logistic regression
### Logistic regression avec Bag of Words

In [None]:
# Logistic regression avec Bag of Words 
train_and_evaluate_model('logistic_regression', 'count_vectorizer', X_train, X_dev, X_test, y_train, y_dev, y_test, label_encoder)

Accuracy sur les données de dev avec count_vectorizer et logistic_regression: 0.680
Accuracy sur les données de test avec count_vectorizer et logistic_regression: 0.660

-- EVALUATION --




lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression




Accuracy  0.6595959595959596

- Precision - macro
precision by language : 
('ARA', 0.6597938144329897)
 ('CHI', 0.6666666666666666)
 ('FRE', 0.7261904761904762)
 ('GER', 0.6804123711340206)
 ('HIN', 0.6288659793814433)
 ('ITA', 0.7045454545454546)
 ('JPN', 0.654320987654321)
 ('KOR', 0.5918367346938775)
 ('SPA', 0.5384615384615384)
 ('TEL', 0.7439024390243902)
 ('TUR', 0.6818181818181818)

precision score  0.6615286040003054

- Recall - macro
recall by language  
('ARA', 0.6956521739130435)
 ('CHI', 0.6373626373626373)
 ('FRE', 0.6421052631578947)
 ('GER', 0.7764705882352941)
 ('HIN', 0.6421052631578947)
 ('ITA', 0.7045454545454546)
 ('JPN', 0.6235294117647059)
 ('KOR', 0.6590909090909091)
 ('SPA', 0.5764705882352941)
 ('TEL', 0.6421052631578947)
 ('TUR', 0.6593406593406593)

recall score  0.6598889283601529

- F1 - macro
f1 by language  
('ARA', 0.6772486772486773)
 ('CHI', 0.651685393258427)
 ('FRE', 0.6815642458100559)
 ('GER', 0.7252747252747253)
 ('HIN', 0.6354166666666667)
 ('IT

### Logistic regression avec TF-IDF

In [35]:
# Logistic regression avec TF IDF 
train_and_evaluate_model('logistic_regression', 'tfidf_vectorizer', X_train, X_dev, X_test, y_train, y_dev, y_test, label_encoder)

Accuracy sur les données de dev avec tfidf_vectorizer et logistic_regression: 0.681
Accuracy sur les données de test avec tfidf_vectorizer et logistic_regression: 0.647

-- EVALUATION --

>>> {'ARA': 0, 'CHI': 1, 'FRE': 2, 'GER': 3, 'HIN': 4, 'ITA': 5, 'JPN': 6, 'KOR': 7, 'SPA': 8, 'TEL': 9, 'TUR': 10}
>> ['ARA', 'CHI', 'FRE', 'GER', 'HIN', 'ITA', 'JPN', 'KOR', 'SPA', 'TEL', 'TUR']



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression




Accuracy  0.6474747474747474

- Precision - macro
precision by language : 
('ARA', 0.6962025316455697)
 ('CHI', 0.5980392156862745)
 ('FRE', 0.7160493827160493)
 ('GER', 0.6634615384615384)
 ('HIN', 0.5794392523364486)
 ('ITA', 0.7777777777777778)
 ('JPN', 0.6582278481012658)
 ('KOR', 0.6195652173913043)
 ('SPA', 0.5306122448979592)
 ('TEL', 0.6333333333333333)
 ('TUR', 0.7142857142857143)

precision score  0.6533630960575669

- Recall - macro
recall by language  
('ARA', 0.5978260869565217)
 ('CHI', 0.6703296703296703)
 ('FRE', 0.6105263157894737)
 ('GER', 0.8117647058823529)
 ('HIN', 0.6526315789473685)
 ('ITA', 0.7159090909090909)
 ('JPN', 0.611764705882353)
 ('KOR', 0.6477272727272727)
 ('SPA', 0.611764705882353)
 ('TEL', 0.6)
 ('TUR', 0.6043956043956044)

recall score  0.6486036125183692

- F1 - macro
f1 by language  
('ARA', 0.6432748538011697)
 ('CHI', 0.6321243523316062)
 ('FRE', 0.6590909090909091)
 ('GER', 0.7301587301587301)
 ('HIN', 0.6138613861386139)
 ('ITA', 0.745562130

## SVM
### SVM avec Bag of Words

In [None]:
# SVM avec Bag of Words
train_and_evaluate_model('svm', 'count_vectorizer', X_train, X_dev, X_test, y_train, y_dev, y_test, label_encoder)

Accuracy sur les données de dev avec count_vectorizer et svm: 0.576
Accuracy sur les données de test avec count_vectorizer et svm: 0.534

-- EVALUATION --




Accuracy  0.5343434343434343

- Precision - macro
precision by language : 
('ARA', 0.5)
 ('CHI', 0.53125)
 ('FRE', 0.5053763440860215)
 ('GER', 0.5247524752475248)
 ('HIN', 0.5)
 ('ITA', 0.5853658536585366)
 ('JPN', 0.5111111111111111)
 ('KOR', 0.6266666666666667)
 ('SPA', 0.47619047619047616)
 ('TEL', 0.5698924731182796)
 ('TUR', 0.581081081081081)

precision score  0.5374260437417907

- Recall - macro
recall by language  
('ARA', 0.4891304347826087)
 ('CHI', 0.5604395604395604)
 ('FRE', 0.49473684210526314)
 ('GER', 0.6235294117647059)
 ('HIN', 0.5894736842105263)
 ('ITA', 0.5454545454545454)
 ('JPN', 0.5411764705882353)
 ('KOR', 0.5340909090909091)
 ('SPA', 0.47058823529411764)
 ('TEL', 0.5578947368421052)
 ('TUR', 0.4725274725274725)

recall score  0.5344583911909137

- F1 - macro
f1 by language  
('ARA', 0.4945054945054945)
 ('CHI', 0.5454545454545454)
 ('FRE', 0.5)
 ('GER', 0.5698924731182795)
 ('HIN', 0.5410628019323671)
 ('ITA', 0.5647058823529411)
 ('JPN', 0.5257142857142858)

### SVM avec TF IDF

In [None]:
# SVM avec TF IDF
train_and_evaluate_model('svm', 'tfidf_vectorizer', X_train, X_dev, X_test, y_train, y_dev, y_test, label_encoder)

Accuracy sur les données de dev avec tfidf_vectorizer et svm: 0.684
Accuracy sur les données de test avec tfidf_vectorizer et svm: 0.657

-- EVALUATION --




Accuracy  0.6565656565656566

- Precision - macro
precision by language : 
('ARA', 0.6436781609195402)
 ('CHI', 0.6037735849056604)
 ('FRE', 0.75)
 ('GER', 0.6666666666666666)
 ('HIN', 0.5517241379310345)
 ('ITA', 0.84)
 ('JPN', 0.6973684210526315)
 ('KOR', 0.625)
 ('SPA', 0.5252525252525253)
 ('TEL', 0.7183098591549296)
 ('TUR', 0.7215189873417721)

precision score  0.667572031202251

- Recall - macro
recall by language  
('ARA', 0.6086956521739131)
 ('CHI', 0.7032967032967034)
 ('FRE', 0.631578947368421)
 ('GER', 0.8235294117647058)
 ('HIN', 0.6736842105263158)
 ('ITA', 0.7159090909090909)
 ('JPN', 0.6235294117647059)
 ('KOR', 0.6818181818181818)
 ('SPA', 0.611764705882353)
 ('TEL', 0.5368421052631579)
 ('TUR', 0.6263736263736264)

recall score  0.6579110951946522

- F1 - macro
f1 by language  
('ARA', 0.6256983240223463)
 ('CHI', 0.6497461928934011)
 ('FRE', 0.6857142857142857)
 ('GER', 0.7368421052631577)
 ('HIN', 0.6066350710900473)
 ('ITA', 0.7730061349693251)
 ('JPN', 0.6583850

## Modèles en cascade
Grouper les catégories qui se ressemblent en se référent à la matrice de confusion, on prend les catégories que le modèle confond et on les mets dans un même groupe. Des modèles spécialisés seront créés par la suite pour chaque groupe afin de diminuer la confusion.

In [36]:
"""
0:[ARA,TUR]; 1:[KOR,CHI,JPN] 2:[FRE,SPA,ITA,GER]; 3:[HIN,TEL] 
"""
label_encoder, class_labels_dict = get_label_encoder_for(df['category']) # df['group'] au lieu de df['category']

lang_to_group = {'ARA':0, 'CHI':1, 'FRE':2, 'GER':2, 'HIN':3, 'ITA':2, 'JPN':1, 'KOR':1, 'SPA':2, 'TEL':3, 'TUR':0}
label_to_group = {i:lang_to_group[class_labels_dict[i]] for i in range(len(class_labels_dict))}

print(lang_to_group)
print(label_to_group)

# load data
df = load_data()

# créer les groupes
df['group'] = df['category'].apply(lambda x: lang_to_group[x])

# variables et target
#X = df['text']
#y = df['group'] # target = 'group' aulieu de category !

# split
X_train, X_dev, X_test, y_train, y_dev, y_test = get_splitted_df(df, x_col='text', y_col='group', test_size=0.2, random_state=42, verbose=True)


{'ARA': 0, 'CHI': 1, 'FRE': 2, 'GER': 2, 'HIN': 3, 'ITA': 2, 'JPN': 1, 'KOR': 1, 'SPA': 2, 'TEL': 3, 'TUR': 0}
{0: 0, 1: 1, 2: 2, 3: 2, 4: 3, 5: 2, 6: 1, 7: 1, 8: 2, 9: 3, 10: 0}
Shape de X_train : (7920,)
Shape de y_train : (7920,)
Shape de X_dev : (990,)
Shape de y_dev : (990,)
Shape de X_test : (990,)
Shape de y_test : (990,)

y_train split:
 2    2880
1    2175
0    1435
3    1430
Name: group, dtype: int64

y_dev split:
 2    367
1    261
0    182
3    180
Name: group, dtype: int64

y_test split:
 2    353
1    264
3    190
0    183
Name: group, dtype: int64


In [37]:
# label pour les groupes
label_encoder, class_labels_dict = get_label_encoder_for(df['group']) # df['group'] au lieu de df['category']
print("Labelled classes : ", class_labels_dict)

# LR avec BOW
train_and_evaluate_model('logistic_regression', 'count_vectorizer', X_train, X_dev, X_test, y_train, y_dev, y_test, label_encoder, labels_list=list(class_labels_dict.values()), class_labels_dict=class_labels_dict)

Labelled classes :  {0: 0, 1: 1, 2: 2, 3: 3}
Accuracy sur les données de dev avec count_vectorizer et logistic_regression: 0.832
Accuracy sur les données de test avec count_vectorizer et logistic_regression: 0.841

-- EVALUATION --

>>> {0: 0, 1: 1, 2: 2, 3: 3}
>> [0, 1, 2, 3]



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression




Accuracy  0.8414141414141414

- Precision - macro
precision by language : 
('ARA', 0.7857142857142857)
 ('CHI', 0.8643410852713178)
 ('FRE', 0.8272251308900523)
 ('GER', 0.8901098901098901)

precision score  0.8418475979963865

- Recall - macro
recall by language  
('ARA', 0.7213114754098361)
 ('CHI', 0.8446969696969697)
 ('FRE', 0.8951841359773371)
 ('GER', 0.8526315789473684)

recall score  0.8284560400078779

- F1 - macro
f1 by language  
('ARA', 0.7521367521367521)
 ('CHI', 0.8544061302681992)
 ('FRE', 0.8598639455782313)
 ('GER', 0.8709677419354839)

f1 score  0.8343436424796666


In [None]:
# label pour les groupes
label_encoder, class_labels_dict = get_label_encoder_for(df['group']) # df['group'] au lieu de df['category']
print("Labelled classes : ", class_labels_dict)

# LR avec TF IDF
train_and_evaluate_model('logistic_regression', 'tfidf_vectorizer', X_train, X_dev, X_test, y_train, y_dev, y_test, label_encoder, labels_list=list(class_labels_dict.values()), class_labels_dict=class_labels_dict)

Labelled classes :  {0: 0, 1: 1, 2: 2, 3: 3}
Accuracy sur les données de dev avec tfidf_vectorizer et logistic_regression: 0.831
Accuracy sur les données de test avec tfidf_vectorizer et logistic_regression: 0.823

-- EVALUATION --




lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression




Accuracy  0.8232323232323232

- Precision - macro
precision by language : 
('ARA', 0.8230088495575221)
 ('CHI', 0.8245614035087719)
 ('FRE', 0.7913669064748201)
 ('GER', 0.8971428571428571)

precision score  0.8340200041709929

- Recall - macro
recall by language  
('ARA', 0.5081967213114754)
 ('CHI', 0.8901515151515151)
 ('FRE', 0.9348441926345609)
 ('GER', 0.8263157894736842)

recall score  0.7898770546428089

- F1 - macro
f1 by language  
('ARA', 0.6283783783783784)
 ('CHI', 0.8561020036429873)
 ('FRE', 0.8571428571428571)
 ('GER', 0.8602739726027397)

f1 score  0.8004743029417407


In [None]:
# label pour les groupes
label_encoder, class_labels_dict = get_label_encoder_for(df['group']) # df['group'] au lieu de df['category']
print("Labelled classes : ", class_labels_dict)

# SVM avec BOW
train_and_evaluate_model('svm', 'count_vectorizer', X_train, X_dev, X_test, y_train, y_dev, y_test, label_encoder, labels_list=list(class_labels_dict.values()), class_labels_dict=class_labels_dict)

Labelled classes :  {0: 0, 1: 1, 2: 2, 3: 3}
Accuracy sur les données de dev avec count_vectorizer et svm: 0.779
Accuracy sur les données de test avec count_vectorizer et svm: 0.761

-- EVALUATION --




Accuracy  0.7606060606060606

- Precision - macro
precision by language : 
('ARA', 0.6554621848739496)
 ('CHI', 0.7665505226480837)
 ('FRE', 0.7555555555555555)
 ('GER', 0.8324022346368715)

precision score  0.7524926244286151

- Recall - macro
recall by language  
('ARA', 0.4262295081967213)
 ('CHI', 0.8333333333333334)
 ('FRE', 0.8668555240793201)
 ('GER', 0.7842105263157895)

recall score  0.7276572229812911

- F1 - macro
f1 by language  
('ARA', 0.5165562913907285)
 ('CHI', 0.7985480943738658)
 ('FRE', 0.8073878627968337)
 ('GER', 0.8075880758807589)

f1 score  0.7325200811105468


In [None]:
# label pour les groupes
label_encoder, class_labels_dict = get_label_encoder_for(df['group']) # df['group'] au lieu de df['category']
print("Labelled classes : ", class_labels_dict)

# SVM avec TF IDF
train_and_evaluate_model('svm', 'tfidf_vectorizer', X_train, X_dev, X_test, y_train, y_dev, y_test, label_encoder, labels_list=list(class_labels_dict.values()), class_labels_dict=class_labels_dict)

Labelled classes :  {0: 0, 1: 1, 2: 2, 3: 3}
Accuracy sur les données de dev avec tfidf_vectorizer et svm: 0.839
Accuracy sur les données de test avec tfidf_vectorizer et svm: 0.838

-- EVALUATION --




Accuracy  0.8383838383838383

- Precision - macro
precision by language : 
('ARA', 0.828125)
 ('CHI', 0.8404255319148937)
 ('FRE', 0.8074074074074075)
 ('GER', 0.9142857142857143)

precision score  0.8475609134020038

- Recall - macro
recall by language  
('ARA', 0.5792349726775956)
 ('CHI', 0.8977272727272727)
 ('FRE', 0.9263456090651558)
 ('GER', 0.8421052631578947)

recall score  0.8113532794069798

- F1 - macro
f1 by language  
('ARA', 0.6816720257234726)
 ('CHI', 0.8681318681318682)
 ('FRE', 0.8627968337730871)
 ('GER', 0.8767123287671234)

f1 score  0.8223282640988878


## Choix du modèle pour la première étape de classification par groupes :
La régression logistique avec une vectorisation BOW donne les meilleurs résultats en général;
Même si le SVM avec TF-IDF est légèrement meilleur sur les groupes 1,2.
La différence se fait en grande majorité sur le groupe 1.

## Conclusion :
On peut essayer de séparer ARA et TUR chacune dans un goupe différent.
Car c'est le seul goupe qui pause un soucis.

## Nouveaux groupes

In [81]:
"""
0:[ARA]; 1:[KOR,CHI,JPN] 2:[FRE,SPA,ITA,GER]; 3:[HIN,TEL]; 4:[TUR]
"""
label_encoder, class_labels_dict = get_label_encoder_for(df['category']) # df['group'] au lieu de df['category']

lang_to_group = {'ARA':0, 'CHI':1, 'FRE':2, 'GER':2, 'HIN':3, 'ITA':2, 'JPN':1, 'KOR':1, 'SPA':2, 'TEL':3, 'TUR':4}
label_to_group = {i:lang_to_group[class_labels_dict[i]] for i in range(len(class_labels_dict))}

print(lang_to_group)
print(label_to_group)

# load data
df = load_data()

# créer les groupes
df['group'] = df['category'].apply(lambda x: lang_to_group[x])

# variables et target
#X = df['text']
#y = df['group'] # target = 'group' aulieu de category !

# split
X_train, X_dev, X_test, y_train, y_dev, y_test = get_splitted_df(df, x_col='text', y_col='group', test_size=0.2, random_state=42, verbose=True)


{'ARA': 0, 'CHI': 1, 'FRE': 2, 'GER': 2, 'HIN': 3, 'ITA': 2, 'JPN': 1, 'KOR': 1, 'SPA': 2, 'TEL': 3, 'TUR': 4}
{0: 0, 1: 1, 2: 2, 3: 2, 4: 3, 5: 2, 6: 1, 7: 1, 8: 2, 9: 3, 10: 4}
Shape de X_train : (7920,)
Shape de y_train : (7920,)
Shape de X_dev : (990,)
Shape de y_dev : (990,)
Shape de X_test : (990,)
Shape de y_test : (990,)

y_train split:
 2    2880
1    2175
3    1430
4     718
0     717
Name: group, dtype: int64

y_dev split:
 2    367
1    261
3    180
0     91
4     91
Name: group, dtype: int64

y_test split:
 2    353
1    264
3    190
0     92
4     91
Name: group, dtype: int64


In [65]:
# label pour les groupes
label_encoder, class_labels_dict = get_label_encoder_for(df['group']) # df['group'] au lieu de df['category']
print("Labelled classes : ", class_labels_dict)

# LR avec BOW
model_5_groups, vectorizer_5_groups = train_and_evaluate_model('logistic_regression', 'count_vectorizer', X_train, X_dev, X_test, y_train, y_dev, y_test, label_encoder, labels_list=list(class_labels_dict.values()), class_labels_dict=class_labels_dict)

# LR avec TF IDF
train_and_evaluate_model('logistic_regression', 'tfidf_vectorizer', X_train, X_dev, X_test, y_train, y_dev, y_test, label_encoder, labels_list=list(class_labels_dict.values()), class_labels_dict=class_labels_dict)


Labelled classes :  {0: 0, 1: 1, 2: 2, 3: 3, 4: 4}
Accuracy sur les données de dev avec count_vectorizer et logistic_regression: 0.827
Accuracy sur les données de test avec count_vectorizer et logistic_regression: 0.831

-- EVALUATION --

>>> {0: 0, 1: 1, 2: 2, 3: 3, 4: 4}
>> [0, 1, 2, 3, 4]



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression




Accuracy  0.8313131313131313

- Precision - macro
precision by language : 
('ARA', 0.6923076923076923)
 ('CHI', 0.8513011152416357)
 ('FRE', 0.8232189973614775)
 ('GER', 0.9147727272727273)
 ('HIN', 0.7733333333333333)

precision score  0.8109867731033733

- Recall - macro
recall by language  
('ARA', 0.6847826086956522)
 ('CHI', 0.8674242424242424)
 ('FRE', 0.8838526912181303)
 ('GER', 0.8473684210526315)
 ('HIN', 0.6373626373626373)

recall score  0.7841581201506588

- F1 - macro
f1 by language  
('ARA', 0.6885245901639345)
 ('CHI', 0.8592870544090057)
 ('FRE', 0.8524590163934426)
 ('GER', 0.8797814207650274)
 ('HIN', 0.6987951807228915)

f1 score  0.7957694524908604
Accuracy sur les données de dev avec tfidf_vectorizer et logistic_regression: 0.803
Accuracy sur les données de test avec tfidf_vectorizer et logistic_regression: 0.808

-- EVALUATION --

>>> {0: 0, 1: 1, 2: 2, 3: 3, 4: 4}
>> [0, 1, 2, 3, 4]



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression




Accuracy  0.8080808080808081

- Precision - macro
precision by language : 
('ARA', 0.8780487804878049)
 ('CHI', 0.7953020134228188)
 ('FRE', 0.7655172413793103)
 ('GER', 0.8888888888888888)
 ('HIN', 0.9444444444444444)

precision score  0.8544402737246534

- Recall - macro
recall by language  
('ARA', 0.391304347826087)
 ('CHI', 0.8977272727272727)
 ('FRE', 0.943342776203966)
 ('GER', 0.8421052631578947)
 ('HIN', 0.37362637362637363)

recall score  0.6896212067083188

- F1 - macro
f1 by language  
('ARA', 0.5413533834586466)
 ('CHI', 0.8434163701067615)
 ('FRE', 0.8451776649746193)
 ('GER', 0.8648648648648649)
 ('HIN', 0.5354330708661418)

f1 score  0.7260490708542069


(LogisticRegression(), TfidfVectorizer())

In [17]:
# label pour les groupes
label_encoder, class_labels_dict = get_label_encoder_for(df['group']) # df['group'] au lieu de df['category']
print("Labelled classes : ", class_labels_dict)

# SVM avec BOW
train_and_evaluate_model('svm', 'count_vectorizer', X_train, X_dev, X_test, y_train, y_dev, y_test, label_encoder, labels_list=list(class_labels_dict.values()), class_labels_dict=class_labels_dict)

# SVM avec TF IDF
train_and_evaluate_model('svm', 'tfidf_vectorizer', X_train, X_dev, X_test, y_train, y_dev, y_test, label_encoder, labels_list=list(class_labels_dict.values()), class_labels_dict=class_labels_dict)


Labelled classes :  {0: 0, 1: 1, 2: 2, 3: 3, 4: 4}
Accuracy sur les données de dev avec count_vectorizer et svm: 0.757
Accuracy sur les données de test avec count_vectorizer et svm: 0.745

-- EVALUATION --




Accuracy  0.7454545454545455

- Precision - macro
precision by language : 
('ARA', 0.725)
 ('CHI', 0.7287581699346405)
 ('FRE', 0.7245370370370371)
 ('GER', 0.8118279569892473)
 ('HIN', 0.8461538461538461)

precision score  0.7672554020229542

- Recall - macro
recall by language  
('ARA', 0.31521739130434784)
 ('CHI', 0.8446969696969697)
 ('FRE', 0.886685552407932)
 ('GER', 0.7947368421052632)
 ('HIN', 0.24175824175824176)

recall score  0.6166189994545509

- F1 - macro
f1 by language  
('ARA', 0.4393939393939394)
 ('CHI', 0.7824561403508772)
 ('FRE', 0.7974522292993631)
 ('GER', 0.8031914893617021)
 ('HIN', 0.3760683760683761)

f1 score  0.6397124348948516
Accuracy sur les données de dev avec tfidf_vectorizer et svm: 0.821
Accuracy sur les données de test avec tfidf_vectorizer et svm: 0.820

-- EVALUATION --




Accuracy  0.8202020202020202

- Precision - macro
precision by language : 
('ARA', 0.8723404255319149)
 ('CHI', 0.8169491525423729)
 ('FRE', 0.765661252900232)
 ('GER', 0.9096045197740112)
 ('HIN', 0.975)

precision score  0.8679110701497061

- Recall - macro
recall by language  
('ARA', 0.44565217391304346)
 ('CHI', 0.9128787878787878)
 ('FRE', 0.9348441926345609)
 ('GER', 0.8473684210526315)
 ('HIN', 0.42857142857142855)

recall score  0.7138630008100905

- F1 - macro
f1 by language  
('ARA', 0.5899280575539568)
 ('CHI', 0.8622540250447227)
 ('FRE', 0.8418367346938777)
 ('GER', 0.8773841961852861)
 ('HIN', 0.5954198473282443)

f1 score  0.7533645721612174


## Conclusion après séparation en 5 groupes
La régression logistique avec BOW l'emporte.
On garde les 5 groupes pour économiser la création d'un autre modèle spécialisé.

In [66]:
import pickle
# save the model to disk
filename = 'model_5_groups.sav'
pickle.dump(model_5_groups, open(filename, 'wb'))

# save the vectorizer to disk
filename = 'vectorizer_5_groups.pkl'
pickle.dump(vectorizer_5_groups, open(filename, 'wb'))

# load the model from disk
#loaded_model = pickle.load(open(filename, 'rb'))

## Modèle spécialisé Groupe 1 [KOR,CHI,JPN]
Passons maintenant aux modèles spécialisés pour séparer les classes du groupe 1

In [82]:
df_g1 = df[df['group'].isin([1])]


In [83]:
# split df_g1 par category
X_train, X_dev, X_test, y_train, y_dev, y_test = get_splitted_df(df_g1, x_col='text', y_col='category', test_size=0.2, random_state=42, verbose=True)


Shape de X_train : (2160,)
Shape de y_train : (2160,)
Shape de X_dev : (270,)
Shape de y_dev : (270,)
Shape de X_test : (270,)
Shape de y_test : (270,)

y_train split:
 JPN    723
CHI    719
KOR    718
Name: category, dtype: int64

y_dev split:
 KOR    98
CHI    93
JPN    79
Name: category, dtype: int64

y_test split:
 JPN    98
CHI    88
KOR    84
Name: category, dtype: int64


In [84]:
# label pour le groupe 1
label_encoder, class_labels_dict = get_label_encoder_for(df_g1['category']) # category de df_g1
print("Labelled classes : ", class_labels_dict)

# LR avec BOW
train_and_evaluate_model('logistic_regression', 'count_vectorizer', X_train, X_dev, X_test, y_train, y_dev, y_test, label_encoder, labels_list=list(class_labels_dict.values()), class_labels_dict=class_labels_dict)

# LR avec TF IDF
model_g1, vectorizer_g1 = train_and_evaluate_model('logistic_regression', 'tfidf_vectorizer', X_train, X_dev, X_test, y_train, y_dev, y_test, label_encoder, labels_list=list(class_labels_dict.values()), class_labels_dict=class_labels_dict)


Labelled classes :  {0: 'CHI', 1: 'JPN', 2: 'KOR'}
Accuracy sur les données de dev avec count_vectorizer et logistic_regression: 0.748
Accuracy sur les données de test avec count_vectorizer et logistic_regression: 0.711

-- EVALUATION --

>>> {'CHI': 0, 'JPN': 1, 'KOR': 2}
>> ['CHI', 'JPN', 'KOR']



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression




Accuracy  0.7111111111111111

- Precision - macro
precision by language : 
('ARA', 0.7386363636363636)
 ('CHI', 0.7244897959183674)
 ('FRE', 0.6666666666666666)

precision score  0.7099309420737993

- Recall - macro
recall by language  
('ARA', 0.7386363636363636)
 ('CHI', 0.7244897959183674)
 ('FRE', 0.6666666666666666)

recall score  0.7099309420737993

- F1 - macro
f1 by language  
('ARA', 0.7386363636363636)
 ('CHI', 0.7244897959183674)
 ('FRE', 0.6666666666666666)

f1 score  0.7099309420737993
Accuracy sur les données de dev avec tfidf_vectorizer et logistic_regression: 0.744
Accuracy sur les données de test avec tfidf_vectorizer et logistic_regression: 0.741

-- EVALUATION --

>>> {'CHI': 0, 'JPN': 1, 'KOR': 2}
>> ['CHI', 'JPN', 'KOR']



Accuracy  0.7407407407407407

- Precision - macro
precision by language : 
('ARA', 0.7528089887640449)
 ('CHI', 0.8)
 ('FRE', 0.6770833333333334)

precision score  0.7432974406991262

- Recall - macro
recall by language  
('ARA', 0.7613636363636364)
 ('CHI', 0.6938775510204082)
 ('FRE', 0.7738095238095238)

recall score  0.7430169037311894

- F1 - macro
f1 by language  
('ARA', 0.7570621468926553)
 ('CHI', 0.7431693989071038)
 ('FRE', 0.7222222222222222)

f1 score  0.7408179226739938


In [43]:
# label pour le groupe 1
label_encoder, class_labels_dict = get_label_encoder_for(df_g1['category']) # category de df_g1
print("Labelled classes : ", class_labels_dict)

# SVM avec BOW
train_and_evaluate_model('svm', 'count_vectorizer', X_train, X_dev, X_test, y_train, y_dev, y_test, label_encoder, labels_list=list(class_labels_dict.values()), class_labels_dict=class_labels_dict)

# SVM avec TF IDF
train_and_evaluate_model('svm', 'tfidf_vectorizer', X_train, X_dev, X_test, y_train, y_dev, y_test, label_encoder, labels_list=list(class_labels_dict.values()), class_labels_dict=class_labels_dict)


Labelled classes :  {0: 'CHI', 1: 'JPN', 2: 'KOR'}
Accuracy sur les données de dev avec count_vectorizer et svm: 0.685
Accuracy sur les données de test avec count_vectorizer et svm: 0.674

-- EVALUATION --

>>> {'CHI': 0, 'JPN': 1, 'KOR': 2}
>> ['CHI', 'JPN', 'KOR']



Accuracy  0.674074074074074

- Precision - macro
precision by language : 
('ARA', 0.6931818181818182)
 ('CHI', 0.6770833333333334)
 ('FRE', 0.6511627906976745)

precision score  0.673809314070942

- Recall - macro
recall by language  
('ARA', 0.6931818181818182)
 ('CHI', 0.6632653061224489)
 ('FRE', 0.6666666666666666)

recall score  0.6743712636569779

- F1 - macro
f1 by language  
('ARA', 0.6931818181818182)
 ('CHI', 0.6701030927835052)
 ('FRE', 0.6588235294117646)

f1 score  0.6740361467923627
Accuracy sur les données de dev avec tfidf_vectorizer et svm: 0.726
Accuracy sur les données de test avec tfidf_vectorizer et svm: 0.737

-- EVALUATION --

>>> {'CHI': 0, 'JPN': 1, 'KOR': 2}
>> ['CHI', 'JPN', 'KOR']



Accuracy  0.737037037037037

- Precision - macro
precision by language : 
('ARA', 0.7613636363636364)
 ('CHI', 0.7752808988764045)
 ('FRE', 0.6774193548387096)

precision score  0.7380212966929168

- Recall - macro
recall by language  
('ARA', 0.7613636363636364)
 ('CHI', 0.7040816326530612)
 ('FRE', 0.75)

recall score  0.7384817563388992

- F1 - macro
f1 by language  
('ARA', 0.7613636363636364)
 ('CHI', 0.7379679144385026)
 ('FRE', 0.7118644067796611)

f1 score  0.7370653191939334


## Conclusion pour le groupe 1 ['CHI', 'JPN', 'KOR']
On prend logistic regression avec tf-idf

In [85]:
import pickle
# save the model to disk
filename = 'model_g1.sav'
pickle.dump(model_g1, open(filename, 'wb'))

# save the vectorizer to disk
filename = 'vectorizer_g1.pkl'
pickle.dump(vectorizer_g1, open(filename, 'wb'))

## Groupe 2 : [FRE,SPA,ITA,GER]

In [86]:
df_g2 = df[df['group'].isin([2])]

# split df_g2 par category
X_train, X_dev, X_test, y_train, y_dev, y_test = get_splitted_df(df_g2, x_col='text', y_col='category', test_size=0.2, random_state=42, verbose=True)

# label pour le groupe 2
label_encoder, class_labels_dict = get_label_encoder_for(df_g2['category']) # category de df_g2
print("Labelled classes : ", class_labels_dict)

# LR avec BOW
train_and_evaluate_model('logistic_regression', 'count_vectorizer', X_train, X_dev, X_test, y_train, y_dev, y_test, label_encoder, labels_list=list(class_labels_dict.values()), class_labels_dict=class_labels_dict)

# LR avec TF IDF
model_g2, vectorizer_g2 = train_and_evaluate_model('logistic_regression', 'tfidf_vectorizer', X_train, X_dev, X_test, y_train, y_dev, y_test, label_encoder, labels_list=list(class_labels_dict.values()), class_labels_dict=class_labels_dict)

# SVM avec BOW
train_and_evaluate_model('svm', 'count_vectorizer', X_train, X_dev, X_test, y_train, y_dev, y_test, label_encoder, labels_list=list(class_labels_dict.values()), class_labels_dict=class_labels_dict)

# SVM avec TF IDF
train_and_evaluate_model('svm', 'tfidf_vectorizer', X_train, X_dev, X_test, y_train, y_dev, y_test, label_encoder, labels_list=list(class_labels_dict.values()), class_labels_dict=class_labels_dict)


Shape de X_train : (2880,)
Shape de y_train : (2880,)
Shape de X_dev : (360,)
Shape de y_dev : (360,)
Shape de X_test : (360,)
Shape de y_test : (360,)

y_train split:
 SPA    732
GER    725
ITA    712
FRE    711
Name: category, dtype: int64

y_dev split:
 ITA    103
FRE     98
GER     85
SPA     74
Name: category, dtype: int64

y_test split:
 SPA    94
FRE    91
GER    90
ITA    85
Name: category, dtype: int64
Labelled classes :  {0: 'FRE', 1: 'GER', 2: 'ITA', 3: 'SPA'}
Accuracy sur les données de dev avec count_vectorizer et logistic_regression: 0.756
Accuracy sur les données de test avec count_vectorizer et logistic_regression: 0.781

-- EVALUATION --

>>> {'FRE': 0, 'GER': 1, 'ITA': 2, 'SPA': 3}
>> ['FRE', 'GER', 'ITA', 'SPA']



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression




Accuracy  0.7805555555555556

- Precision - macro
precision by language : 
('ARA', 0.7582417582417582)
 ('CHI', 0.8536585365853658)
 ('FRE', 0.7608695652173914)
 ('GER', 0.7578947368421053)

precision score  0.7826661492216552

- Recall - macro
recall by language  
('ARA', 0.7582417582417582)
 ('CHI', 0.7777777777777778)
 ('FRE', 0.8235294117647058)
 ('GER', 0.7659574468085106)

recall score  0.781376598648188

- F1 - macro
f1 by language  
('ARA', 0.7582417582417583)
 ('CHI', 0.8139534883720929)
 ('FRE', 0.7909604519774012)
 ('GER', 0.761904761904762)

f1 score  0.7812651151240035
Accuracy sur les données de dev avec tfidf_vectorizer et logistic_regression: 0.761
Accuracy sur les données de test avec tfidf_vectorizer et logistic_regression: 0.814

-- EVALUATION --

>>> {'FRE': 0, 'GER': 1, 'ITA': 2, 'SPA': 3}
>> ['FRE', 'GER', 'ITA', 'SPA']



Accuracy  0.8138888888888889

- Precision - macro
precision by language : 
('ARA', 0.8111111111111111)
 ('CHI', 0.8210526315789474)
 ('FRE', 0.8160919540229885)
 ('GER', 0.8068181818181818)

precision score  0.8137684696328071

- Recall - macro
recall by language  
('ARA', 0.8021978021978022)
 ('CHI', 0.8666666666666667)
 ('FRE', 0.8352941176470589)
 ('GER', 0.7553191489361702)

recall score  0.8148694338619245

- F1 - macro
f1 by language  
('ARA', 0.8066298342541437)
 ('CHI', 0.8432432432432433)
 ('FRE', 0.8255813953488372)
 ('GER', 0.7802197802197802)

f1 score  0.8139185632665011
Accuracy sur les données de dev avec count_vectorizer et svm: 0.650
Accuracy sur les données de test avec count_vectorizer et svm: 0.678

-- EVALUATION --

>>> {'FRE': 0, 'GER': 1, 'ITA': 2, 'SPA': 3}
>> ['FRE', 'GER', 'ITA', 'SPA']



Accuracy  0.6777777777777778

- Precision - macro
precision by language : 
('ARA', 0.6744186046511628)
 ('CHI', 0.7216494845360825)
 ('FRE', 0.5979381443298969)
 ('GER', 0.725)

precision score  0.6797515583792856

- Recall - macro
recall by language  
('ARA', 0.6373626373626373)
 ('CHI', 0.7777777777777778)
 ('FRE', 0.6823529411764706)
 ('GER', 0.6170212765957447)

recall score  0.6786286582281575

- F1 - macro
f1 by language  
('ARA', 0.655367231638418)
 ('CHI', 0.7486631016042781)
 ('FRE', 0.6373626373626374)
 ('GER', 0.6666666666666666)

f1 score  0.677014909318
Accuracy sur les données de dev avec tfidf_vectorizer et svm: 0.758
Accuracy sur les données de test avec tfidf_vectorizer et svm: 0.814

-- EVALUATION --

>>> {'FRE': 0, 'GER': 1, 'ITA': 2, 'SPA': 3}
>> ['FRE', 'GER', 'ITA', 'SPA']



Accuracy  0.8138888888888889

- Precision - macro
precision by language : 
('ARA', 0.8390804597701149)
 ('CHI', 0.8387096774193549)
 ('FRE', 0.8192771084337349)
 ('GER', 0.7628865979381443)

precision score  0.8149884608903373

- Recall - macro
recall by language  
('ARA', 0.8021978021978022)
 ('CHI', 0.8666666666666667)
 ('FRE', 0.8)
 ('GER', 0.7872340425531915)

recall score  0.8140246278544151

- F1 - macro
f1 by language  
('ARA', 0.8202247191011236)
 ('CHI', 0.8524590163934426)
 ('FRE', 0.8095238095238095)
 ('GER', 0.774869109947644)

f1 score  0.814269163741505


(SVC(), TfidfVectorizer())

## Conclusion Groupe 2 Groupe 2 : [FRE,SPA,ITA,GER]
On prend la régression logistique avec tf-idf

In [87]:
import pickle
# save the model to disk
filename = 'model_g2.sav'
pickle.dump(model_g2, open(filename, 'wb'))

# save the vectorizer to disk
filename = 'vectorizer_g2.pkl'
pickle.dump(vectorizer_g2, open(filename, 'wb'))

## Groupe 3 [HIN,TEL]

In [88]:
df_g3 = df[df['group'].isin([3])]

# split df_g3 par category
X_train, X_dev, X_test, y_train, y_dev, y_test = get_splitted_df(df_g3, x_col='text', y_col='category', test_size=0.2, random_state=42, verbose=True)

# label pour le groupe 2
label_encoder, class_labels_dict = get_label_encoder_for(df_g3['category']) # category de df_g3
print("Labelled classes : ", class_labels_dict)

# LR avec BOW
train_and_evaluate_model('logistic_regression', 'count_vectorizer', X_train, X_dev, X_test, y_train, y_dev, y_test, label_encoder, labels_list=list(class_labels_dict.values()), class_labels_dict=class_labels_dict)

# LR avec TF IDF
train_and_evaluate_model('logistic_regression', 'tfidf_vectorizer', X_train, X_dev, X_test, y_train, y_dev, y_test, label_encoder, labels_list=list(class_labels_dict.values()), class_labels_dict=class_labels_dict)

# SVM avec BOW
train_and_evaluate_model('svm', 'count_vectorizer', X_train, X_dev, X_test, y_train, y_dev, y_test, label_encoder, labels_list=list(class_labels_dict.values()), class_labels_dict=class_labels_dict)

# SVM avec TF IDF
model_g3, vectorizer_g3 = train_and_evaluate_model('svm', 'tfidf_vectorizer', X_train, X_dev, X_test, y_train, y_dev, y_test, label_encoder, labels_list=list(class_labels_dict.values()), class_labels_dict=class_labels_dict)


Shape de X_train : (1440,)
Shape de y_train : (1440,)
Shape de X_dev : (180,)
Shape de y_dev : (180,)
Shape de X_test : (180,)
Shape de y_test : (180,)

y_train split:
 HIN    724
TEL    716
Name: category, dtype: int64

y_dev split:
 HIN    90
TEL    90
Name: category, dtype: int64

y_test split:
 TEL    94
HIN    86
Name: category, dtype: int64
Labelled classes :  {0: 'HIN', 1: 'TEL'}
Accuracy sur les données de dev avec count_vectorizer et logistic_regression: 0.706
Accuracy sur les données de test avec count_vectorizer et logistic_regression: 0.772

-- EVALUATION --

>>> {'HIN': 0, 'TEL': 1}
>> ['HIN', 'TEL']



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression




Accuracy  0.7722222222222223

- Precision - macro
precision by language : 
('ARA', 0.7647058823529411)
 ('CHI', 0.7789473684210526)

precision score  0.7718266253869969

- Recall - macro
recall by language  
('ARA', 0.7558139534883721)
 ('CHI', 0.7872340425531915)

recall score  0.7715239980207818

- F1 - macro
f1 by language  
('ARA', 0.7602339181286549)
 ('CHI', 0.783068783068783)

f1 score  0.771651350598719
Accuracy sur les données de dev avec tfidf_vectorizer et logistic_regression: 0.761
Accuracy sur les données de test avec tfidf_vectorizer et logistic_regression: 0.750

-- EVALUATION --

>>> {'HIN': 0, 'TEL': 1}
>> ['HIN', 'TEL']



Accuracy  0.75

- Precision - macro
precision by language : 
('ARA', 0.7252747252747253)
 ('CHI', 0.7752808988764045)

precision score  0.7502778120755649

- Recall - macro
recall by language  
('ARA', 0.7674418604651163)
 ('CHI', 0.7340425531914894)

recall score  0.7507422068283028

- F1 - macro
f1 by language  
('ARA', 0.7457627118644067)
 ('CHI', 0.7540983606557377)

f1 score  0.7499305362600721
Accuracy sur les données de dev avec count_vectorizer et svm: 0.711
Accuracy sur les données de test avec count_vectorizer et svm: 0.722

-- EVALUATION --

>>> {'HIN': 0, 'TEL': 1}
>> ['HIN', 'TEL']



Accuracy  0.7222222222222222

- Precision - macro
precision by language : 
('ARA', 0.6956521739130435)
 ('CHI', 0.75)

precision score  0.7228260869565217

- Recall - macro
recall by language  
('ARA', 0.7441860465116279)
 ('CHI', 0.7021276595744681)

recall score  0.723156853043048

- F1 - macro
f1 by language  
('ARA', 0.7191011235955057)
 ('CHI', 0.7252747252747254)

f1 score  0.7221879244351155
Accuracy sur les données de dev avec tfidf_vectorizer et svm: 0.756
Accuracy sur les données de test avec tfidf_vectorizer et svm: 0.767

-- EVALUATION --

>>> {'HIN': 0, 'TEL': 1}
>> ['HIN', 'TEL']



Accuracy  0.7666666666666667

- Precision - macro
precision by language : 
('ARA', 0.7156862745098039)
 ('CHI', 0.8333333333333334)

precision score  0.7745098039215687

- Recall - macro
recall by language  
('ARA', 0.8488372093023255)
 ('CHI', 0.6914893617021277)

recall score  0.7701632855022267

- F1 - macro
f1 by language  
('ARA', 0.7765957446808511)
 ('CHI', 0.7558139534883721)

f1 score  0.7662048490846116


## Conclusion groupe 3 [HIN,TEL]
On prend un SVM avec TF-IDF

In [89]:
import pickle
# save the model to disk
filename = 'model_g3.sav'
pickle.dump(model_g3, open(filename, 'wb'))

# save the vectorizer to disk
filename = 'vectorizer_g3.pkl'
pickle.dump(vectorizer_g3, open(filename, 'wb'))

## Creating cascade model

In [60]:
def create_cascade_model_5_groups(X_to_classify):
  """
  0:[ARA]; 1:[KOR,CHI,JPN] 2:[FRE,SPA,ITA,GER]; 3:[HIN,TEL]; 4:[TUR]
  """
  import pickle

  label_encoder, class_labels_dict = get_label_encoder_for(df['category']) # df['group'] au lieu de df['category']

  lang_to_group = {'ARA':0, 'CHI':1, 'FRE':2, 'GER':2, 'HIN':3, 'ITA':2, 'JPN':1, 'KOR':1, 'SPA':2, 'TEL':3, 'TUR':4}
  label_to_group = {i:lang_to_group[class_labels_dict[i]] for i in range(len(class_labels_dict))}

  #print(lang_to_group)
  #print(label_to_group)

  model_5_groups = pickle.load(open("model_5_groups.sav", 'rb'))
  vectorizer_5_groups = pickle.load(open("vectorizer_5_groups.pkl", 'rb'))
  X_to_classify_vectors = vectorizer_5_groups.transform(X_to_classify)
  predicted_groups = model_5_groups.predict(X_to_classify_vectors)
  df_X_to_classify = pd.DataFrame({"id":range(len(X_to_classify)), "text":X_to_classify, "group":predicted_groups})
  
  # modèles spécialisés
  df_X_to_classify['category'] = [None for _ in range(len(df_X_to_classify))]

  # groupe 0 : ARA
  df_X_to_classify.loc[df_X_to_classify['group'] == 0, 'category'] = 'ARA'

  # groupe 4 : TUR
  df_X_to_classify.loc[df_X_to_classify['group'] == 4, 'category'] = 'TUR'

  # groupe 1
  df_g1 = df_X_to_classify[df_X_to_classify['group'].isin([1])]
  class_labels_dict_g1 = {0: 'CHI', 1: 'JPN', 2: 'KOR'}
  label_encoder_g1 = {v: k for k, v in class_labels_dict_g1.items()}
  X_g1 = np.asarray(df_g1['text'])
  model_g1 = pickle.load(open("model_g1.sav", 'rb'))
  vectorizer_g1 = pickle.load(open("vectorizer_g1.pkl", 'rb'))
  g1_vectors = vectorizer_g1.transform(X_g1)
  g1_predictions = model_g1.predict(g1_vectors)
  g1_predictions = [class_labels_dict_g1[i] for i in g1_predictions]
  df_g1['category'] = g1_predictions
  mapping_dict = {row['id']: row['category'] for idx, row in df_g1.iterrows()}
  df_X_to_classify['category'] = df_X_to_classify.apply(lambda row: mapping_dict[row['id']] if row['id'] in mapping_dict else row['category'], axis=1)

  # groupe 2
  df_g2 = df_X_to_classify[df_X_to_classify['group'].isin([2])]
  label_encoder_g2 = {'FRE': 0, 'GER': 1, 'ITA': 2, 'SPA': 3}
  class_labels_dict_g2 = {v: k for k, v in label_encoder_g2.items()}
  X_g2 = np.asarray(df_g2['text'])
  model_g2 = pickle.load(open("model_g2.sav", 'rb'))
  vectorizer_g2 = pickle.load(open("vectorizer_g2.pkl", 'rb'))
  g2_vectors = vectorizer_g2.transform(X_g2)
  g2_predictions = model_g2.predict(g2_vectors)
  g2_predictions = [class_labels_dict_g2[i] for i in g2_predictions]
  df_g2['category'] = g2_predictions
  mapping_dict = {row['id']: row['category'] for idx, row in df_g2.iterrows()}
  df_X_to_classify['category'] = df_X_to_classify.apply(lambda row: mapping_dict[row['id']] if row['id'] in mapping_dict else row['category'], axis=1)

  # groupe 3
  df_g3 = df_X_to_classify[df_X_to_classify['group'].isin([3])]
  class_labels_dict_g3 = {0: 'HIN', 1: 'TEL'}
  label_encoder_g3 = {v: k for k, v in class_labels_dict_g3.items()}
  X_g3 = np.asarray(df_g3['text'])
  model_g3 = pickle.load(open("model_g3.sav", 'rb'))
  vectorizer_g3 = pickle.load(open("vectorizer_g3.pkl", 'rb'))
  g3_vectors = vectorizer_g3.transform(X_g3)
  g3_predictions = model_g3.predict(g3_vectors)
  g3_predictions = [class_labels_dict_g3[i] for i in g3_predictions]
  df_g3['category'] = g3_predictions
  mapping_dict = {row['id']: row['category'] for idx, row in df_g3.iterrows()}
  df_X_to_classify['category'] = df_X_to_classify.apply(lambda row: mapping_dict[row['id']] if row['id'] in mapping_dict else row['category'], axis=1)

  # final result
  #print(df_X_to_classify)
  
  return list(df_X_to_classify['category'])


In [62]:
## predict using cascade model
# load data
df = load_data()

# definir les groupes
lang_to_group = {'ARA':0, 'CHI':1, 'FRE':2, 'GER':2, 'HIN':3, 'ITA':2, 'JPN':1, 'KOR':1, 'SPA':2, 'TEL':3, 'TUR':4}
label_to_group = {i:lang_to_group[class_labels_dict[i]] for i in range(len(class_labels_dict))}

# créer les groupes (pour vérifier)
df['group'] = df['category'].apply(lambda x: lang_to_group[x])

# split
X_train, X_dev, X_test, y_train, y_dev, y_test = get_splitted_df(df, x_col='text', y_col='category', test_size=0.2, random_state=42, verbose=True)

predictions = create_cascade_model_5_groups(X_test)

label_encoder, class_labels_dict = get_label_encoder_for(df['category'])
predictions_labels = label_encoder.transform(predictions) # pour la fonction d'évaluation
y_test_labels = label_encoder.transform(y_test) # pour la fonction d'évaluation

# evaluation
evaluation(y_test=y_test_labels, y_pred=predictions_labels, labels_list=list(class_labels_dict.values()), class_labels_dict=class_labels_dict)

Shape de X_train : (7920,)
Shape de y_train : (7920,)
Shape de X_dev : (990,)
Shape de y_dev : (990,)
Shape de X_test : (990,)
Shape de y_test : (990,)

y_train split:
 KOR    728
ITA    726
CHI    726
SPA    722
GER    721
JPN    721
TEL    721
TUR    718
ARA    717
FRE    711
HIN    709
Name: category, dtype: int64

y_dev split:
 HIN    96
FRE    94
JPN    94
GER    94
SPA    93
ARA    91
TUR    91
ITA    86
TEL    84
KOR    84
CHI    83
Name: category, dtype: int64

y_test split:
 HIN    95
TEL    95
FRE    95
ARA    92
CHI    91
TUR    91
ITA    88
KOR    88
SPA    85
JPN    85
GER    85
Name: category, dtype: int64




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy




-- EVALUATION --

>>> {'ARA': 0, 'CHI': 1, 'FRE': 2, 'GER': 3, 'HIN': 4, 'ITA': 5, 'JPN': 6, 'KOR': 7, 'SPA': 8, 'TEL': 9, 'TUR': 10}
>> ['ARA', 'CHI', 'FRE', 'GER', 'HIN', 'ITA', 'JPN', 'KOR', 'SPA', 'TEL', 'TUR']




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy




Accuracy  0.7767676767676768

- Precision - macro
precision by language : 
('ARA', 0.6923076923076923)
 ('CHI', 0.6836734693877551)
 ('FRE', 0.8241758241758241)
 ('GER', 0.7029702970297029)
 ('HIN', 0.8072289156626506)
 ('ITA', 0.8620689655172413)
 ('JPN', 0.8607594936708861)
 ('KOR', 0.8369565217391305)
 ('SPA', 0.63)
 ('TEL', 0.9139784946236559)
 ('TUR', 0.7733333333333333)

precision score  0.7806775461316247

- Recall - macro
recall by language  
('ARA', 0.6847826086956522)
 ('CHI', 0.7362637362637363)
 ('FRE', 0.7894736842105263)
 ('GER', 0.8352941176470589)
 ('HIN', 0.7052631578947368)
 ('ITA', 0.8522727272727273)
 ('JPN', 0.8)
 ('KOR', 0.875)
 ('SPA', 0.7411764705882353)
 ('TEL', 0.8947368421052632)
 ('TUR', 0.6373626373626373)

recall score  0.7774205438218703

- F1 - macro
f1 by language  
('ARA', 0.6885245901639345)
 ('CHI', 0.7089947089947091)
 ('FRE', 0.8064516129032259)
 ('GER', 0.7634408602150539)
 ('HIN', 0.752808988764045)
 ('ITA', 0.8571428571428572)
 ('JPN', 0.829268

## Conclusion modèle en cascade
Nous avons gagné 11 points par rapport au meilleure modèle entrainé sur toutes les données !

0.66 -> 0.77

## LSTM avec Tokenizer

Techniquement, il est possible d'entraîner un modèle LSTM avec CountVectorizer. Cependant, les modèles LSTM sont généralement plus performants lorsqu'ils sont entraînés avec des données qui ont été traitées avec des méthodes de prétraitement telles que Tokenizer.

L'une des raisons pour lesquelles Tokenizer est plus efficace pour l'entraînement de modèles LSTM est qu'il est capable de conserver la séquence des mots dans une phrase, tandis que CountVectorizer ne prend en compte que la fréquence des mots dans un document. Les modèles LSTM sont particulièrement utiles pour comprendre la séquence des mots et des phrases dans un texte.

En outre, CountVectorizer est plus efficace pour l'entraînement de modèles de classification traditionnels tels que les SVM.

In [None]:

def LSTM_train_and_evaluate_model(model_name, vectorizer_type, X_train, X_dev, X_test, y_train, y_dev, y_test, label_encoder):
    import tensorflow as tf
    from tensorflow.keras.layers import LSTM, Dense, Dropout
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.preprocessing.text import Tokenizer
    from tensorflow.keras.preprocessing.sequence import pad_sequences
    from sklearn.metrics import accuracy_score
    import numpy as np

    # Configuration du modèle LSTM
    max_words = 10000
    max_len = 200
    embedding_dim = 100
    lstm_units = 16
    dropout_rate = 0.2

    epochs = 5
    batch_size = 8
    
    # Tokenisation des données textuelles
    tokenizer = Tokenizer(num_words=max_words)
    if vectorizer_type == 'tokenizer':
        pass
    else:
        raise ValueError("Vectorizer type should be 'tokenizer'.")
    tokenizer.fit_on_texts(X_train)
    
    X_train_sequences = tokenizer.texts_to_sequences(X_train)
    X_dev_sequences = tokenizer.texts_to_sequences(X_dev)
    X_test_sequences = tokenizer.texts_to_sequences(X_test)
    
    X_train_padded = pad_sequences(X_train_sequences, maxlen=max_len)
    X_dev_padded = pad_sequences(X_dev_sequences, maxlen=max_len)
    X_test_padded = pad_sequences(X_test_sequences, maxlen=max_len)
    
    # Transformer les catégories en nombres entiers avec label encoder déjà entrainé 
    y_train_labels = label_encoder.transform(y_train)
    y_dev_labels = label_encoder.transform(y_dev)
    y_test_labels = label_encoder.transform(y_test)
    
    # Entraîner le modèle correspondant au nom spécifié
    if model_name == 'lstm':
      model = Sequential()
      model.add(tf.keras.layers.Embedding(max_words, embedding_dim, input_length=max_len))
      #model.add(LSTM(units=lstm_units, dropout=dropout_rate, return_sequences=True))
      model.add(LSTM(units=lstm_units, dropout=dropout_rate))
      model.add(Dense(len(label_encoder.classes_), activation='softmax'))
      model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
      model.fit(X_train_padded, y_train_labels, validation_data=(X_dev_padded, y_dev_labels), epochs=epochs, batch_size=batch_size)
    else:
      raise ValueError("Model name should be 'lstm'.")
    
    # Prédiction sur les données de développement (dev)
    y_pred_dev_proba = model.predict(X_dev_padded)
    y_pred_dev = np.argmax(y_pred_dev_proba, axis=1)
    # Calcul de l'accuracy sur les données de développement (dev)
    accuracy_dev = accuracy_score(y_dev_labels, y_pred_dev)
    print(f"Accuracy sur les données de dev avec {vectorizer_type} et {model_name}: {accuracy_dev:.3f}")

    # Prédiction sur les données de test
    y_pred_test_proba = model.predict(X_test_padded)
    y_pred_test = np.argmax(y_pred_test_proba, axis=1)
    # Calcul de l'accuracy sur les données de test
    accuracy_test = accuracy_score(y_test_labels, y_pred_test)
    print(f"Accuracy sur les données de test avec {vectorizer_type} et {model_name}: {accuracy_test:.3f}")

    # Evaluation function
    evaluation(y_test_labels, y_pred_test)


In [None]:
#LSTM_train_and_evaluate_model('lstm', 'tokenizer', X_train[:100], X_dev[:100], X_test[:100], y_train[:100], y_dev[:100], y_test[:100], label_encoder) # test avec peu de data
LSTM_train_and_evaluate_model('lstm', 'tokenizer', X_train, X_dev, X_test, y_train, y_dev, y_test, label_encoder) # test avec peu de data

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy sur les données de dev avec tokenizer et lstm: 0.437
Accuracy sur les données de test avec tokenizer et lstm: 0.431

-- EVALUATION --




Accuracy  0.4313131313131313

- Precision - macro
precision by language : 
('ARA', 0.28)
 ('CHI', 0.44642857142857145)
 ('FRE', 0.4129032258064516)
 ('GER', 0.42857142857142855)
 ('HIN', 0.4423076923076923)
 ('ITA', 0.32222222222222224)
 ('JPN', 0.4336283185840708)
 ('KOR', 0.38738738738738737)
 ('SPA', 0.4636363636363636)
 ('TEL', 0.5555555555555556)
 ('TUR', 0.46236559139784944)

precision score  0.42136421426341747

- Recall - macro
recall by language  
('ARA', 0.08139534883720931)
 ('CHI', 0.2777777777777778)
 ('FRE', 0.64)
 ('GER', 0.3707865168539326)
 ('HIN', 0.27710843373493976)
 ('ITA', 0.3918918918918919)
 ('JPN', 0.5384615384615384)
 ('KOR', 0.4673913043478261)
 ('SPA', 0.4766355140186916)
 ('TEL', 0.7058823529411765)
 ('TUR', 0.46236559139784944)

recall score  0.42633602456934844

- F1 - macro
f1 by language  
('ARA', 0.12612612612612614)
 ('CHI', 0.3424657534246575)
 ('FRE', 0.5019607843137255)
 ('GER', 0.39759036144578314)
 ('HIN', 0.34074074074074073)
 ('ITA', 0.3536585

## BERT
BERT pour la tâche de classification de texte en catégorie. En fait, BERT a été entraîné sur des tâches de compréhension de langage naturel similaires, et il a démontré d'excellentes performances dans diverses tâches de NLP, y compris la classification de texte.
L'utilisation de BERT pour la classification de texte nécessite une grande quantité de données d'entraînement et de ressources informatiques pour entraîner le modèle.

In [None]:
import tensorflow as tf
!pip install 'transformers[tf-cpu]' ##### to install
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.metrics import accuracy_score
import numpy as np

def BERT_train_and_evaluate_model(model_name, vectorizer_type, X_train, X_dev, X_test, y_train, y_dev, y_test, label_encoder):
    #vectorizer_type='bert_tokenizer'
    
    # Configuration du modèle BERT
    max_len = 200
    bert_model_name = 'bert-base-uncased'
    tokenizer = BertTokenizer.from_pretrained(bert_model_name)
    bert_model = TFBertForSequenceClassification.from_pretrained(bert_model_name, num_labels=len(label_encoder.classes_))

    # Encodage des données textuelles en utilisant le tokenizer de BERT
    X_train_encodings = tokenizer(list(X_train), truncation=True, padding=True, max_length=max_len)
    X_dev_encodings = tokenizer(list(X_dev), truncation=True, padding=True, max_length=max_len)
    X_test_encodings = tokenizer(list(X_test), truncation=True, padding=True, max_length=max_len)

    # Convertir en numpy.array
    X_train_encodings = np.array(X_train_encodings['input_ids'])
    X_dev_encodings = np.array(X_dev_encodings['input_ids'])
    X_test_encodings = np.array(X_test_encodings['input_ids'])

    # Transformer les catégories en nombres entiers avec label encoder déjà entrainé 
    y_train_labels = label_encoder.transform(y_train)
    y_dev_labels = label_encoder.transform(y_dev)
    y_test_labels = label_encoder.transform(y_test)
    #print(">>>>>>>>>", type(y_train_labels))
    
    # Entraîner le modèle correspondant au nom spécifié
    if model_name == 'bert':
        bert_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5), loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), metrics=['accuracy'])
        bert_model.fit(X_train_encodings, y_train_labels, validation_data=(X_dev_encodings, y_dev_labels), epochs=5, batch_size=64)
    else:
        raise ValueError("Model name should be 'bert'.")

    # Prédiction sur les données de développement (dev)
    y_pred_dev_logits = bert_model.predict(X_dev_encodings).logits
    y_pred_dev = np.argmax(y_pred_dev_logits, axis=1)
    # Calcul de l'accuracy sur les données de développement (dev)
    accuracy_dev = accuracy_score(y_dev_labels, y_pred_dev)
    print(f"Accuracy sur les données de dev avec BERT: {accuracy_dev:.3f}")

    # Prédiction sur les données de test
    y_pred_test_logits = bert_model.predict(X_test_encodings).logits
    y_pred_test = np.argmax(y_pred_test_logits, axis=1)
    # Calcul de l'accuracy sur les données de test
    accuracy_test = accuracy_score(y_test_labels, y_pred_test)
    print(f"Accuracy sur les données de test avec BERT: {accuracy_test:.3f}")

    # Evaluation function
    evaluation(y_test_labels, y_pred_test)


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
#BERT_train_and_evaluate_model('bert', 'bert_tokenizer', X_train[:100], X_dev[:100], X_test[:100], y_train[:100], y_dev[:100], y_test[:100], label_encoder) # test avec peu de data
#BERT_train_and_evaluate_model('bert', 'bert_tokenizer', X_train, X_dev, X_test, y_train, y_dev, y_test, label_encoder) # test avec peu de data

NameError: ignored