# 08 - Choix des modèles de classification textuelle


In [1]:
import os
current_dir = %pwd
project_dir = os.path.dirname(current_dir)
%cd $project_dir

/data/dhryniewski/DataScientest/doc-classifier


In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from lazypredict.Supervised import LazyClassifier
from joblib import dump, load

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [4]:
# Seed shared by the train/test split and the LazyClassifier runs
seed = 42

# Preprocessed dataset: text in `words`, label in `category`,
# the remaining columns are used as structure features
df = pd.read_csv(filepath_or_buffer='data/processed/words_structure.csv')

In [5]:
# Missing values in `words` become empty strings so the column only holds text
df['words'] = df['words'].fillna('')

# Separate the label column from the explanatory columns
features = df.drop(columns='category')
target = df['category']

# Hold out 20% of the rows for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=seed,
)

# 1. Modèle sur les données words


In [9]:
# Map the category labels to integers; the encoder is fitted on the train labels
le = LabelEncoder().fit(y_train)
y_train_encoded = le.transform(y_train)
y_test_encoded = le.transform(y_test)

# Text column of each split as a plain list, the input format used for TF-IDF
X_train_corpus = X_train['words'].to_list()
X_test_corpus = X_test['words'].to_list()

# Vocabulary and IDF weights are learned from the training corpus only
tfidf_vectorizer = TfidfVectorizer()
train_tfidf_sparse = tfidf_vectorizer.fit_transform(X_train_corpus)

# Persist the sparse TF-IDF matrix of the training data
dump(train_tfidf_sparse, 'models/train_tfidf.joblib')

# Dense copies for the benchmark, because the model doesn't take csr_matrix
X_train_tfidf = train_tfidf_sparse.toarray()
X_test_tfidf = tfidf_vectorizer.transform(X_test_corpus).toarray()

In [10]:
# Benchmark LazyPredict's default set of classifiers on the TF-IDF features.
# Approximately 6h40 of compute.
clf_tfidf = LazyClassifier(
    verbose=0,
    ignore_warnings=True,
    custom_metric=None,
    random_state=seed,
)
models, predictions = clf_tfidf.fit(
    X_train=X_train_tfidf,
    X_test=X_test_tfidf,
    y_train=y_train_encoded,
    y_test=y_test_encoded,
)

# Score table, one row per classifier
display(models)

 97%|█████████▋| 28/29 [7:06:15<07:48, 468.97s/it]   

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.037508 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 67454
[LightGBM] [Info] Number of data points in the train set: 5687, number of used features: 2628
[LightGBM] [Info] Start training from score -1.751268
[LightGBM] [Info] Start training from score -1.714466
[LightGBM] [Info] Start training from score -1.406723
[LightGBM] [Info] Start training from score -3.648726
[LightGBM] [Info] Start training from score -1.608032
[LightGBM] [Info] Start training from score -1.742191


100%|██████████| 29/29 [7:07:53<00:00, 885.29s/it]


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LogisticRegression,0.86,0.82,,0.86,95.99
LinearSVC,0.85,0.82,,0.85,4094.08
NearestCentroid,0.85,0.81,,0.85,95.6
ExtraTreesClassifier,0.86,0.81,,0.86,256.07
PassiveAggressiveClassifier,0.84,0.8,,0.84,247.81
RandomForestClassifier,0.84,0.79,,0.84,190.07
RidgeClassifierCV,0.82,0.78,,0.83,132.24
RidgeClassifier,0.82,0.78,,0.83,87.57
XGBClassifier,0.82,0.77,,0.82,322.53
LGBMClassifier,0.82,0.76,,0.82,98.41


In [11]:
# Persist the fitted TF-IDF benchmark object (joblib compression level 5)
dump(value=clf_tfidf, filename='models/lazypredict_tfidf.joblib', compress=5)

['models/lazypredict_tfidf.joblib']

# 2. Modèle sur les données de structure


In [7]:
# Structure features: every feature column except the raw text
X_train_structure = X_train.drop(columns='words')
X_test_structure = X_test.drop(columns='words')

# Standardise with statistics learned on the training split only
scaler = StandardScaler().fit(X_train_structure)
X_train_scaled = scaler.transform(X_train_structure)
X_test_scaled = scaler.transform(X_test_structure)

# Persist the scaled training matrix
dump(X_train_scaled, 'models/train_standardscaled_words_structure.joblib')

['models\\train_standardscaled_words_structure.joblib']

In [8]:
# Benchmark LazyPredict's default set of classifiers on the scaled structure
# features. Approximately 5min according to the author's note.
clf_structure = LazyClassifier(
    verbose=0,
    ignore_warnings=True,
    custom_metric=None,
    random_state=seed,
)
models, predictions = clf_structure.fit(
    X_train=X_train_scaled,
    X_test=X_test_scaled,
    y_train=y_train_encoded,
    y_test=y_test_encoded,
)

# Score table, one row per classifier
display(models)

 97%|█████████▋| 28/29 [00:09<00:00,  2.74it/s]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.055701 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6374
[LightGBM] [Info] Number of data points in the train set: 5687, number of used features: 26
[LightGBM] [Info] Start training from score -1.751268
[LightGBM] [Info] Start training from score -1.714466
[LightGBM] [Info] Start training from score -1.406723
[LightGBM] [Info] Start training from score -3.648726
[LightGBM] [Info] Start training from score -1.608032
[LightGBM] [Info] Start training from score -1.742191


100%|██████████| 29/29 [00:10<00:00,  2.77it/s]


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LGBMClassifier,0.83,0.79,,0.83,1.12
XGBClassifier,0.82,0.78,,0.83,0.89
RandomForestClassifier,0.82,0.78,,0.82,0.93
LogisticRegression,0.82,0.78,,0.82,0.11
ExtraTreesClassifier,0.82,0.77,,0.82,0.39
CalibratedClassifierCV,0.81,0.77,,0.82,0.88
LinearSVC,0.81,0.77,,0.81,1.65
SGDClassifier,0.81,0.77,,0.81,0.13
SVC,0.82,0.77,,0.82,0.65
BaggingClassifier,0.8,0.76,,0.8,0.4


In [10]:
# Persist the fitted structure-features benchmark object
dump(value=clf_structure, filename='models/lazypredict_structure.joblib')

['models\\lazypredict_structure.joblib']

# 3. Modèle sur l'ensemble word_structure


In [14]:
# Combined feature matrix: the TF-IDF columns followed by the scaled structure
# columns, for the same rows in the same order.
# Both inputs are already NumPy arrays, so they are stacked column-wise
# directly instead of being wrapped in DataFrames, concatenated and converted
# back; the resulting values and shape are the same.
X_train_word_structure = np.hstack((X_train_tfidf, X_train_scaled))
X_test_word_structure = np.hstack((X_test_tfidf, X_test_scaled))

In [2]:
from lazypredict.Supervised import CLASSIFIERS

# Classifiers excluded from the combined run: they take too long to train
# and do not give great results.
remove = ["CalibratedClassifierCV", "SVC"]

# Keep the estimator of every registered classifier whose name is not excluded
selected_models = [
    model for model_name, model in CLASSIFIERS if model_name not in remove
]

In [24]:
# Benchmark the selected classifiers on the combined TF-IDF + structure
# features. Approximately 2h00 of compute.
clf_word_structure = LazyClassifier(
    verbose=0,
    ignore_warnings=True,
    custom_metric=None,
    random_state=seed,
    classifiers=selected_models,
)
models, predictions = clf_word_structure.fit(
    X_train=X_train_word_structure,
    X_test=X_test_word_structure,
    y_train=y_train_encoded,
    y_test=y_test_encoded,
)

# Score table, one row per classifier
display(models)

 96%|█████████▋| 26/27 [2:07:32<02:56, 176.16s/it]   

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.037050 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 73828
[LightGBM] [Info] Number of data points in the train set: 5687, number of used features: 2654
[LightGBM] [Info] Start training from score -1.751268
[LightGBM] [Info] Start training from score -1.714466
[LightGBM] [Info] Start training from score -1.406723
[LightGBM] [Info] Start training from score -3.648726
[LightGBM] [Info] Start training from score -1.608032
[LightGBM] [Info] Start training from score -1.742191


100%|██████████| 27/27 [2:09:11<00:00, 287.11s/it]


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LinearSVC,0.87,0.84,,0.87,3918.09
NearestCentroid,0.86,0.83,,0.86,86.98
LogisticRegression,0.87,0.82,,0.87,107.65
ExtraTreesClassifier,0.87,0.82,,0.87,162.51
LGBMClassifier,0.85,0.81,,0.85,99.61
XGBClassifier,0.85,0.81,,0.85,350.9
RidgeClassifier,0.86,0.81,,0.86,109.8
RidgeClassifierCV,0.85,0.81,,0.86,142.02
RandomForestClassifier,0.85,0.8,,0.85,125.92
PassiveAggressiveClassifier,0.8,0.79,,0.81,199.41


In [13]:
# Persist the fitted combined-features benchmark object
dump(value=clf_word_structure, filename='models/lazypredict_words_structure.joblib')

['models\\lazypredict_words_structure.joblib']