In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import GridSearchCV

# Data tokenization

In [2]:
def get_boolean_representation(corpus, stop_words_list='english'):
    vectorizer = CountVectorizer(binary=True, stop_words=stop_words_list)
    X = vectorizer.fit_transform(corpus)
    voc = vectorizer.get_feature_names_out()
    return X.toarray(), voc

def get_tf_representation(corpus, stop_words_list='english'):
    vectorizer = CountVectorizer(binary=False, stop_words=stop_words_list)
    X = vectorizer.fit_transform(corpus)
    voc = vectorizer.get_feature_names_out()
    return X.toarray(), voc

def get_tf_idf_representation(corpus, stop_words_list='english'):
    vectorizer = TfidfVectorizer(stop_words=stop_words_list)
    X = vectorizer.fit_transform(corpus)
    voc = vectorizer.get_feature_names_out()
    return X.toarray(), voc

# Random forest

In [3]:
index_train = np.load('../data/array_idx_train.npy')
index_val = np.load('../data/array_idx_val.npy')

df = pd.read_csv('../data/DataFrame_train_preprocessing_V2.csv')

representation_boolean, vocabulary = get_boolean_representation(df['text'].apply(lambda x : x.lower()))
representation_tf, _ = get_tf_representation(df['text'].apply(lambda x : x.lower()))
representation_tf_idf, _ = get_tf_idf_representation(df['text'].apply(lambda x : x.lower()))

y_train = df.loc[index_train, 'label']
y_val = df.loc[index_val, 'label']

del df

x_train_boolean = representation_boolean[index_train]
x_val_boolean = representation_boolean[index_val]

x_train_tf = representation_tf[index_train]
x_val_tf = representation_tf[index_val]

x_train_tf_idf = representation_tf_idf[index_train]
x_val_tf_idf = representation_tf_idf[index_val]

# Cross validation

In [5]:
parameters_to_tune = {'criterion' : ['gini'],
                      'n_estimators' : [2, 4, 5, 10],
                      'max_depth' : [3, 4, 5],
                      'max_features' : [0.25, 0.75],
                      'class_weight' : ['balanced']
                      }

model_to_tune = RandomForestClassifier(random_state=0)
model = GridSearchCV(model_to_tune, parameters_to_tune, scoring='roc_auc', verbose=2).fit(x_train_boolean, y_train)
print('Best score', model.best_score_, 'for parameters', model.best_params_)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV] END class_weight=balanced, criterion=gini, max_depth=3, max_features=0.25, n_estimators=2; total time=   1.0s
[CV] END class_weight=balanced, criterion=gini, max_depth=3, max_features=0.25, n_estimators=2; total time=   0.7s
[CV] END class_weight=balanced, criterion=gini, max_depth=3, max_features=0.25, n_estimators=2; total time=   0.6s
[CV] END class_weight=balanced, criterion=gini, max_depth=3, max_features=0.25, n_estimators=2; total time=   0.5s
[CV] END class_weight=balanced, criterion=gini, max_depth=3, max_features=0.25, n_estimators=2; total time=   0.5s
[CV] END class_weight=balanced, criterion=gini, max_depth=3, max_features=0.25, n_estimators=4; total time=   0.8s
[CV] END class_weight=balanced, criterion=gini, max_depth=3, max_features=0.25, n_estimators=4; total time=   0.8s
[CV] END class_weight=balanced, criterion=gini, max_depth=3, max_features=0.25, n_estimators=4; total time=   0.8s
[CV] END class_wei

# Boolean representation

In [12]:
model = RandomForestClassifier(random_state=0,  criterion = 'gini', max_depth = 5, max_features = 0.25, class_weight= 'balanced', n_estimators=20).fit(x_train_boolean, y_train)
y_predict = model.predict(x_val_boolean)

acc = accuracy_score(y_val, y_predict)
aucroc = roc_auc_score(y_val, y_predict)

print('**Boolean representation**')
print('accuracy', acc, 'auc-roc', aucroc)

**Boolean representation**
accuracy 0.97 auc-roc 0.9508870214752567


# TF representation

In [22]:
model = RandomForestClassifier(random_state=0,  criterion = 'gini', max_depth = 5, max_features = 0.25, class_weight= 'balanced', n_estimators=20).fit(x_train_tf, y_train)
y_predict = model.predict(x_val_tf)

acc = accuracy_score(y_val, y_predict)
aucroc = roc_auc_score(y_val, y_predict)

print('**TF representation**')
print('accuracy', acc, 'auc-roc', aucroc)

**TF representation**
accuracy 0.975 auc-roc 0.9583566760037349


# TF-IDF representation

In [30]:
model = RandomForestClassifier(random_state=0,  criterion = 'gini', max_depth = 5, max_features = 0.25, class_weight= 'balanced', n_estimators=20).fit(x_train_tf_idf, y_train)
y_predict = model.predict(x_val_tf_idf)

acc = accuracy_score(y_val, y_predict)
aucroc = roc_auc_score(y_val, y_predict)

print('**TF-IDF representation**')
print('accuracy', acc, 'auc-roc', aucroc)

**TF-IDF representation**
accuracy 0.97 auc-roc 0.9551820728291316
