# 08 - Choix du modèle de classification textuel


In [1]:
import os
# Switch the kernel's working directory to the parent of the directory it is
# currently in (presumably the project root, so that the relative `data` and
# `models` paths used further down resolve -- TODO confirm).
# NOTE(review): this cell is not idempotent -- each additional run moves one
# more level up, because it always starts from the current directory.
current_dir = %pwd
project_dir = os.path.dirname(current_dir)
%cd $project_dir

c:\Users\Bryan Fernandez\Desktop\DataScientest\doc-classifier


In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from lazypredict.Supervised import LazyClassifier
from joblib import dump, load

In [3]:
# Load the processed dataset (one row per document).
# The path is built with os.path.join rather than a backslash literal:
# in a normal string "\p" and "\w" are invalid escape sequences (a warning
# today, an error in a future Python), and a hard-coded "\" separator only
# works on Windows. On Windows the resulting path is unchanged.
df = pd.read_csv(os.path.join('data', 'processed', 'words_structure.csv'))

In [4]:
# Fixed seed: without it every run produces a different train/test split, so
# the artifacts saved below and the benchmark tables cannot be reproduced.
RANDOM_STATE = 42

# Rows with no extracted text get an empty string instead of NaN, so the
# 'words' column only contains strings when it is vectorized later on.
df['words'] = df['words'].fillna('')

# 'category' is the label to predict; every other column is a feature.
target = df['category']
features = df.drop('category', axis=1)

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=RANDOM_STATE)

# 1. Modèle sur les données words


In [5]:
# Encode the category labels as integers; the encoder is fitted on the
# training labels only and then applied to the test labels.
# NOTE(review): transform() is expected to fail if y_test contains a category
# that never appears in y_train -- confirm every class is present in both.
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_test_encoded = le.transform(y_test)

# The corpus is the 'words' column as a plain list, one entry per row.
X_train_corpus = X_train['words'].tolist()
X_test_corpus = X_test['words'].tolist()

# Vocabulary and IDF weights are learned from the training corpus only.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_corpus)

# Save the TF-IDF matrix of the training data (still in its sparse form).
# os.path.join keeps the separator portable; on Windows the path is the same
# as the former 'models\\train_tfidf.joblib'.
dump(X_train_tfidf, os.path.join('models', 'train_tfidf.joblib'))

# The model downstream does not accept a csr_matrix, so densify the features.
X_train_tfidf = X_train_tfidf.toarray()

# The test corpus is only transformed with the already-fitted vectorizer.
X_test_tfidf = tfidf_vectorizer.transform(X_test_corpus).toarray()

In [None]:
# Fit LazyClassifier on the TF-IDF features and show the resulting table.
# Run time: approximately 6h40.
clf_tfidf = LazyClassifier(
    verbose=0,
    ignore_warnings=True,
    custom_metric=None,
)
models, predictions = clf_tfidf.fit(
    X_train=X_train_tfidf,
    X_test=X_test_tfidf,
    y_train=y_train_encoded,
    y_test=y_test_encoded,
)
display(models)

In [None]:
# Persist the LazyClassifier fitted on the TF-IDF features (compression 5).
# The path is built with os.path.join: "\l" inside a normal string literal is
# an invalid escape sequence, and a hard-coded backslash is Windows-only.
dump(clf_tfidf, os.path.join('models', 'lazypredict_tfidf.joblib'), compress=5)

# 2. Modèle sur les données de structure


In [6]:
# Structure features = every feature column except the raw text.
X_train_structure = X_train.drop('words', axis=1)
X_test_structure = X_test.drop('words', axis=1)

# Standardize the features: the scaler is fitted on the training data only
# and then applied unchanged to the test data.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_structure)
X_test_scaled = scaler.transform(X_test_structure)

# Save the scaled training matrix. os.path.join keeps the separator portable;
# on Windows the path (and the value echoed by the cell) is unchanged.
dump(X_train_scaled,
     os.path.join('models', 'train_standardscaled_words_structure.joblib'))

['models\\train_standardscaled_words_structure.joblib']

In [None]:
# Fit LazyClassifier on the scaled structure features and show the table.
# NOTE: the variable name is misspelled ("strucure") but is kept as is,
# because the cell that saves the classifier refers to it by this name.
clf_strucure = LazyClassifier(
    verbose=0,
    ignore_warnings=True,
    custom_metric=None,
)
models, predictions = clf_strucure.fit(
    X_train=X_train_scaled,
    X_test=X_test_scaled,
    y_train=y_train_encoded,
    y_test=y_test_encoded,
)
display(models)

In [None]:
# Persist the LazyClassifier fitted on the structure features.
# The path is built with os.path.join: "\l" inside a normal string literal is
# an invalid escape sequence, and a hard-coded backslash is Windows-only.
dump(clf_strucure, os.path.join('models', 'lazypredict_structure.joblib'))

# 3. Modèle sur l'ensemble


In [62]:
# Rebuilding the dataframe while keeping the transformations

# Put the column names back on the scaled structure features (the scaler
# returned bare arrays), so they remain available as labelled DataFrames.
X_train_scaled = pd.DataFrame(
    X_train_scaled, columns=X_train_structure.columns)
X_test_scaled = pd.DataFrame(
    X_test_scaled, columns=X_test_structure.columns)

# Combined feature matrix: TF-IDF columns first, then the scaled structure
# columns. The arrays are stacked directly with NumPy instead of wrapping the
# dense TF-IDF matrix in a very wide DataFrame only to convert it back, which
# avoids building those large intermediate frames.
X_train_word_structure = np.hstack(
    (X_train_tfidf, X_train_scaled.to_numpy()))
X_test_word_structure = np.hstack(
    (X_test_tfidf, X_test_scaled.to_numpy()))


In [None]:
# Fit LazyClassifier on the combined TF-IDF + structure features and show
# the resulting table.
clf_word_structure = LazyClassifier(
    verbose=0,
    ignore_warnings=True,
    custom_metric=None,
)
models, predictions = clf_word_structure.fit(
    X_train=X_train_word_structure,
    X_test=X_test_word_structure,
    y_train=y_train_encoded,
    y_test=y_test_encoded,
)
display(models)

In [None]:
# Persist the LazyClassifier fitted on the combined features.
# The path is built with os.path.join: "\l" inside a normal string literal is
# an invalid escape sequence, and a hard-coded backslash is Windows-only.
dump(clf_word_structure,
     os.path.join('models', 'lazypredict_words_structure.joblib'))