Utiliser un random forest car non binaire

In [1]:
import numpy as np 
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import torch
import transformers as transf
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv('/Users/nathansornet/Documents/Cours/INGE_3/ESME/NLP/TP_transf_finance/data.csv')
df.columns = df.columns.str.strip()

In [3]:
df.head()
df.columns

Index(['Sentence', 'Sentiment'], dtype='object')

In [4]:
df.Sentiment.value_counts()

Sentiment
neutral     3130
positive    1852
negative     860
Name: count, dtype: int64

In [5]:
df_sample = df.sample(1000)

In [6]:
df_sample.Sentiment.value_counts()

Sentiment
neutral     541
positive    307
negative    152
Name: count, dtype: int64

In [7]:
model_class, tokenizer_c, weights_pretrained = (transf.DistilBertModel, transf.DistilBertTokenizer, 'distilbert-base-uncased')

tokenizer = tokenizer_c.from_pretrained(weights_pretrained)
model = model_class.from_pretrained(weights_pretrained)

In [8]:
exemple = df_sample.iloc[4,][0]
exemple

'Operating profit totaled EUR 37,7 mn , up slightly from EUR 37.2 mn in the corresponding period in 2006 .                                                                                                                                                                                                                   '

In [9]:
print(tokenizer.encode(exemple, add_special_tokens=True))

[101, 4082, 5618, 23596, 7327, 2099, 4261, 1010, 1021, 24098, 1010, 2039, 3621, 2013, 7327, 2099, 4261, 1012, 1016, 24098, 1999, 1996, 7978, 2558, 1999, 2294, 1012, 102]


In [10]:
tokenizer.tokenize(exemple)

['operating',
 'profit',
 'totaled',
 'eu',
 '##r',
 '37',
 ',',
 '7',
 'mn',
 ',',
 'up',
 'slightly',
 'from',
 'eu',
 '##r',
 '37',
 '.',
 '2',
 'mn',
 'in',
 'the',
 'corresponding',
 'period',
 'in',
 '2006',
 '.']

In [11]:
tokenized = df_sample["Sentence"].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

In [12]:
tokenized

1367    [101, 5658, 5618, 1999, 1996, 2093, 2706, 2083...
4822     [101, 1002, 9779, 24759, 23612, 5012, 2181, 102]
4581    [101, 1996, 2047, 2171, 1997, 1996, 2624, 9626...
5580    [101, 2429, 2000, 2032, 1010, 2810, 2147, 2097...
243     [101, 4082, 5618, 23596, 7327, 2099, 4261, 101...
                              ...                        
3380    [101, 1002, 4705, 2278, 8299, 1024, 1013, 1013...
1252    [101, 1048, 3366, 1011, 11605, 8670, 1086, 126...
332     [101, 6327, 5658, 4341, 2013, 1996, 8021, 2449...
3269    [101, 1996, 4696, 2001, 4417, 2011, 16736, 199...
905     [101, 6983, 20196, 4518, 5804, 2038, 2180, 622...
Name: Sentence, Length: 1000, dtype: object

In [13]:
max_len = 0

for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

max_len

76

In [14]:
tokenized_zeroes = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])

In [15]:
np.array(tokenized_zeroes).shape

(1000, 76)

In [16]:
attention_mask = np.where(tokenized_zeroes !=0,1,0)
attention_mask

array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])

In [17]:
inputs_ids = torch.tensor(tokenized_zeroes)
attention_mask = torch.tensor(attention_mask)
inputs_ids

tensor([[  101,  5658,  5618,  ...,     0,     0,     0],
        [  101,  1002,  9779,  ...,     0,     0,     0],
        [  101,  1996,  2047,  ...,     0,     0,     0],
        ...,
        [  101,  6327,  5658,  ...,     0,     0,     0],
        [  101,  1996,  4696,  ...,     0,     0,     0],
        [  101,  6983, 20196,  ...,     0,     0,     0]])

In [18]:
with torch.no_grad():
    last_hidden_states = model(inputs_ids, attention_mask=attention_mask)

In [19]:
features = last_hidden_states[0][:,0,:]

In [20]:
features.shape

torch.Size([1000, 768])

In [21]:
labels = df_sample['Sentiment']
labels

1367     neutral
4822     neutral
4581     neutral
5580     neutral
243     positive
          ...   
3380    positive
1252    negative
332      neutral
3269     neutral
905     positive
Name: Sentiment, Length: 1000, dtype: object

In [22]:
train_features, test_features, train_labels, test_labels = train_test_split(features,labels)

In [None]:
rf_classifier = RandomForestClassifier(n_estimators=500, random_state=42)
rf_classifier.fit(train_features, train_labels)

In [46]:
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV

param_dist = {
    'n_estimators': randint(100, 500),           # Tirage aléatoire du nombre d'arbres entre 100 et 500
    'max_depth': [None, 10, 20, 30, 40, 50],      # Profondeur des arbres
    'min_samples_split': randint(2, 21),          # Tirage aléatoire pour les splits (entre 2 et 20)
    'min_samples_leaf': randint(1, 11),           # Tirage aléatoire pour les feuilles (entre 1 et 10)
    'bootstrap': [True, False]                     # Bootstrap (avec ou sans remplacement)
}

# Recherche avec RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=rf_classifier, param_distributions=param_dist, n_iter=50, cv=3, n_jobs=-1, verbose=2, random_state=42)
random_search.fit(train_features, train_labels)

# Affichage des meilleurs paramètres
print("Best parameters found: ", random_search.best_params_)

Fitting 3 folds for each of 50 candidates, totalling 150 fits
[CV] END bootstrap=True, max_depth=30, min_samples_leaf=8, min_samples_split=8, n_estimators=221; total time=   0.9s
[CV] END bootstrap=True, max_depth=30, min_samples_leaf=8, min_samples_split=8, n_estimators=221; total time=   1.0s
[CV] END bootstrap=True, max_depth=30, min_samples_leaf=8, min_samples_split=8, n_estimators=221; total time=   1.1s
[CV] END bootstrap=True, max_depth=20, min_samples_leaf=8, min_samples_split=5, n_estimators=459; total time=   2.3s
[CV] END bootstrap=True, max_depth=20, min_samples_leaf=8, min_samples_split=5, n_estimators=459; total time=   2.4s
[CV] END bootstrap=True, max_depth=20, min_samples_leaf=8, min_samples_split=5, n_estimators=459; total time=   2.3s
[CV] END bootstrap=False, max_depth=50, min_samples_leaf=6, min_samples_split=3, n_estimators=291; total time=   2.7s
[CV] END bootstrap=False, max_depth=50, min_samples_leaf=6, min_samples_split=3, n_estimators=291; total time=   2.6s


In [47]:
rf_classifier = RandomForestClassifier(bootstrap=False, max_depth=30, min_samples_leaf=2, min_samples_split=7, n_estimators=153)
rf_classifier.fit(train_features, train_labels)

In [48]:
rf_classifier.score(test_features,test_labels)

0.648

In [49]:
text = tokenizer.encode("it's good product, but i don't like it", add_special_tokens=True)
tokenized_zeroes = np.array([text + [0]*(max_len-len(text))])
inputs_ids = torch.tensor(tokenized_zeroes)
attention_mask = torch.tensor(np.where(tokenized_zeroes !=0,1,0))
with torch.no_grad():
    pred_hidden_states = model(inputs_ids, attention_mask=attention_mask)
features = pred_hidden_states[0][:,0,:]

In [41]:
y_pred = rf_classifier.predict(features)

In [42]:
rf_classifier.predict_proba(features)

array([[0.16066667, 0.54533333, 0.294     ]])

In [44]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Faire les prédictions sur les données de test
y_test_pred = rf_classifier.predict(test_features)

# Calcul des métriques de performance
accuracy = accuracy_score(test_labels, y_test_pred)
precision = precision_score(test_labels, y_test_pred, average='weighted')
recall = recall_score(test_labels, y_test_pred, average='weighted')
f1 = f1_score(test_labels, y_test_pred, average='weighted')

# Affichage des résultats
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("\nClassification Report:\n", classification_report(test_labels, y_test_pred))

Accuracy: 0.64
Precision: 0.5898692307692307
Recall: 0.64
F1 Score: 0.5954731296101159

Classification Report:
               precision    recall  f1-score   support

    negative       0.12      0.03      0.05        31
     neutral       0.66      0.90      0.76       133
    positive       0.65      0.45      0.53        86

    accuracy                           0.64       250
   macro avg       0.48      0.46      0.45       250
weighted avg       0.59      0.64      0.60       250

