# 08 - Choix des modèles de classification textuel


In [1]:
import os
current_dir = %pwd
project_dir = os.path.dirname(current_dir)
%cd $project_dir

c:\Users\Bryan Fernandez\Desktop\DataScientest\doc-classifier


In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from lazypredict.Supervised import LazyClassifier
from joblib import dump, load

In [3]:
df = pd.read_csv('data\processed\words_structure.csv')

In [4]:
df['words'] = df['words'].fillna('')

target = df['category']
features = df.drop('category', axis=1)

X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2)

# 1. Modèle sur les données words


In [5]:
# Encoding the categories
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_test_encoded = le.transform(y_test)

# Building the corpus of the train and test data for the TF-IDF vectorizer
X_train_corpus = X_train['words'].tolist()
X_test_corpus = X_test['words'].tolist()

# Learns the vocabulary and the IDF on training data only
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_corpus)

# We save the TF-IDF of the train data
dump(X_train_tfidf, 'models\\train_tfidf.joblib')

# Convert the training data to array because the model doesn't take csr_matrix
X_train_tfidf = X_train_tfidf.toarray()

# Transform and convert the test data
X_test_tfidf = tfidf_vectorizer.transform(X_test_corpus)
X_test_tfidf = X_test_tfidf.toarray()

In [6]:
# Approximatly 6h40
clf_tfidf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf_tfidf.fit(
    X_train_tfidf, X_test_tfidf, y_train_encoded, y_test_encoded)
display(models)

 97%|█████████▋| 28/29 [6:43:13<11:28, 688.14s/it]   

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.027384 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 68260
[LightGBM] [Info] Number of data points in the train set: 5687, number of used features: 2664
[LightGBM] [Info] Start training from score -1.712515
[LightGBM] [Info] Start training from score -1.733195
[LightGBM] [Info] Start training from score -1.384713
[LightGBM] [Info] Start training from score -3.622058
[LightGBM] [Info] Start training from score -1.654761
[LightGBM] [Info] Start training from score -1.745207


100%|██████████| 29/29 [6:44:14<00:00, 836.37s/it]


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ExtraTreesClassifier,0.84,0.78,,0.84,471.4
LinearSVC,0.83,0.78,,0.82,3432.67
LogisticRegression,0.84,0.77,,0.83,30.98
LGBMClassifier,0.82,0.77,,0.82,60.65
NearestCentroid,0.83,0.77,,0.83,23.29
RandomForestClassifier,0.83,0.77,,0.83,206.46
XGBClassifier,0.82,0.77,,0.82,224.29
PassiveAggressiveClassifier,0.82,0.77,,0.82,151.29
BaggingClassifier,0.78,0.73,,0.78,522.7
RidgeClassifierCV,0.8,0.73,,0.8,58.24


In [7]:
dump(clf_tfidf, 'models\lazypredict_tfidf.joblib', compress=5)

['models\\lazypredict_tfidf.joblib']

# 2. Modèle sur les données de structure


In [8]:
X_train_structure = X_train.drop('words', axis=1)
X_test_structure = X_test.drop('words', axis=1)

# Scaling the features on training data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_structure)
X_test_scaled = scaler.transform(X_test_structure)

dump(X_train_scaled, 'models\\train_standardscaled_words_structure.joblib')

['models\\train_standardscaled_words_structure.joblib']

In [9]:
# Approximately 5min

clf_structure = LazyClassifier(
    verbose=0, ignore_warnings=True, custom_metric=None)

models, predictions = clf_structure.fit(
    X_train_scaled, X_test_scaled, y_train_encoded, y_test_encoded)
display(models)

 97%|█████████▋| 28/29 [00:11<00:00,  2.48it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000621 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6382
[LightGBM] [Info] Number of data points in the train set: 5687, number of used features: 26
[LightGBM] [Info] Start training from score -1.712515
[LightGBM] [Info] Start training from score -1.733195
[LightGBM] [Info] Start training from score -1.384713
[LightGBM] [Info] Start training from score -3.622058
[LightGBM] [Info] Start training from score -1.654761
[LightGBM] [Info] Start training from score -1.745207


100%|██████████| 29/29 [00:11<00:00,  2.42it/s]


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LGBMClassifier,0.82,0.77,,0.82,0.78
XGBClassifier,0.82,0.77,,0.82,0.9
LogisticRegression,0.81,0.76,,0.81,0.16
CalibratedClassifierCV,0.8,0.76,,0.81,0.79
ExtraTreesClassifier,0.81,0.76,,0.81,0.5
LinearSVC,0.8,0.76,,0.81,1.34
RandomForestClassifier,0.81,0.75,,0.81,1.1
BaggingClassifier,0.8,0.75,,0.8,0.49
RidgeClassifierCV,0.79,0.74,,0.79,0.03
RidgeClassifier,0.79,0.74,,0.79,0.03


In [10]:
dump(clf_structure, 'models\lazypredict_structure.joblib')

['models\\lazypredict_structure.joblib']

# 3. Modèle sur l'ensemble word_structure


In [11]:
# Rebuilding the dataframe while keeping the transformations
# Converting to np.array to optimize the performances of the model
X_train_word_structure = pd.concat(
    (pd.DataFrame(X_train_tfidf), pd.DataFrame(X_train_scaled)), axis=1).to_numpy()
X_test_word_structure = pd.concat(
    (pd.DataFrame(X_test_tfidf), pd.DataFrame(X_test_scaled)), axis=1).to_numpy()

In [12]:
# Approximately 7h00

clf_word_structure = LazyClassifier(
    verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf_word_structure.fit(
    X_train_word_structure, X_test_word_structure, y_train_encoded, y_test_encoded)
display(models)

 97%|█████████▋| 28/29 [6:22:44<11:53, 713.72s/it]   

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.028488 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 74642
[LightGBM] [Info] Number of data points in the train set: 5687, number of used features: 2690
[LightGBM] [Info] Start training from score -1.712515
[LightGBM] [Info] Start training from score -1.733195
[LightGBM] [Info] Start training from score -1.384713
[LightGBM] [Info] Start training from score -3.622058
[LightGBM] [Info] Start training from score -1.654761
[LightGBM] [Info] Start training from score -1.745207


100%|██████████| 29/29 [6:24:03<00:00, 794.60s/it]


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
PassiveAggressiveClassifier,0.85,0.8,,0.85,139.04
LinearSVC,0.85,0.8,,0.85,3465.06
XGBClassifier,0.85,0.8,,0.85,270.16
ExtraTreesClassifier,0.85,0.8,,0.85,102.46
LGBMClassifier,0.85,0.8,,0.85,78.6
LogisticRegression,0.85,0.79,,0.85,40.66
NearestCentroid,0.83,0.79,,0.84,23.46
RandomForestClassifier,0.84,0.79,,0.84,68.51
RidgeClassifierCV,0.84,0.78,,0.84,54.45
Perceptron,0.82,0.78,,0.82,55.12


In [13]:
dump(clf_word_structure, 'models\lazypredict_words_structure.joblib')

['models\\lazypredict_words_structure.joblib']