# NLP with Ensemble Method

Les scores obtenus avec la regression logistique pour les différentes catégories étaient les suivants :
- f1 score pour la category_1 : 0.1601514910747391
- f1 score pour la category_2 : 0.6212450536216092
- f1 score pour la category_3 : 0.7704773846842813
- f1 score pour la category_4 : 0.0

Là où la regression logistique semble assez bien performée sur la catégorie 3, elle n'est d'aucune utilité pour prédire la catégorie 4. Nous allons donc essayer d'améliorer ces résultats en utilisant d'autres méthodes et prendre le meilleur classificateur pour chaque catégorie

### Extraction des données

In [1]:
import pandas as pd
import numpy as np

DIRECTORY = "challenge_dataset/"

# extract data
X_extract = pd.read_csv(DIRECTORY + 'X_train.csv', sep=';')
y = pd.read_csv(DIRECTORY + 'y_train.csv', sep=';').drop(columns=['Id'])

categories = list(y.columns)
X_extract = X_extract['Caption']

In [12]:
import spacy

def preprocess_data(data_input: pd.Series):
    nlp = spacy.load('fr_core_news_md')

    def tokenize_and_lemmatize(sentence):
        doc = nlp(sentence)
        return [token.lemma_ for token in doc if not (token.is_stop or token.is_punct or token.is_digit or '\r\n' in token.text)]

    lemmas = [tokenize_and_lemmatize(sentence) for sentence in data_input]

    return pd.Series(lemmas).apply(lambda x: ' '.join(x))

X = preprocess_data(X_extract)
X



0                       mourir heure revoir petit enfant
1      maladie conséquence jeune génération voir situ...
2                                       sortir mal loger
3      inquiétude santé proche fragile femme battre e...
4                       bien entendre contracter maladie
                             ...                        
480                                    forme grave civid
481    inquiétude retrouver liberté action total limi...
482    incertitude être voir petit fils jusque ruptur...
483    inquiétude normal face épidémie crainte voir p...
484                                   respecter barrière
Length: 485, dtype: object

In [4]:
from sklearn.metrics import get_scorer_names
get_scorer_names()

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'matthews_corrcoef',
 'max_error',
 'mutual_info_score',
 'neg_brier_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_absolute_percentage_error',
 'neg_mean_gamma_deviance',
 'neg_mean_poisson_deviance',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'neg_negative_likelihood_ratio',
 'neg_root_mean_squared_error',
 'normalized_mutual_info_score',
 'positive_likelihood_ratio',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'rand_score',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',

## Naives Bayes

In [13]:
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

from sklearn.naive_bayes import MultinomialNB

n_splits = 10 # nombre de splits pour le KFold
NB = make_pipeline(
    CountVectorizer(),
    TfidfTransformer(),
    OneVsRestClassifier(MultinomialNB())
)
cv = StratifiedKFold(n_splits, shuffle=True)
scores = []
for category in categories:
    scores.append(np.mean(cross_val_score(NB, X, y[category], cv=cv, scoring='roc_auc')))
    print("score", category, ":", scores[-1])
print("total score", np.mean(scores))

score category_1 : 0.8049136577708007
score category_2 : 0.8465152593295382
score category_3 : 0.8944766002415457
score category_4 : 0.7863381321520856
total score 0.8330609123734926


## Linear SVC

In [65]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import StratifiedKFold

n_splits = 10 # nombre de splits pour le KFold
SVC = make_pipeline(
    CountVectorizer(ngram_range=(1,5)),
    OneVsRestClassifier(LinearSVC(C=0.1, max_iter=10000))
)
cv = StratifiedKFold(n_splits, shuffle=True)
scores = []
for category in categories:
    scores.append(np.mean(cross_val_score(SVC, X_extract, y[category], cv=cv, scoring='roc_auc')))
    print("score", category, ":", scores[-1])
print("total score", np.mean(scores))


score category_1 : 0.8155572998430142
score category_2 : 0.8645536843769767
score category_3 : 0.8626074879227053
score category_4 : 0.7679660391288298
total score 0.8276711278178815


In [44]:
from sklearn.ensemble import BaggingClassifier

n_splits = 10 # nombre de splits pour le KFold
SVC = make_pipeline(
    CountVectorizer(ngram_range=(1,2)),
    BaggingClassifier(LinearSVC(), bootstrap=False)
)
cv = StratifiedKFold(n_splits, shuffle=True)
scores = []
for category in categories:
    scores.append(np.mean(cross_val_score(SVC, X_extract, y[category], cv=cv, scoring='roc_auc')))
    print("score", category, ":", scores[-1])
print("total score", np.mean(scores))

score category_1 : 0.8044740973312402
score category_2 : 0.859310431161712
score category_3 : 0.86568961352657
score category_4 : 0.7705869324473975
total score 0.82501526861673


In [46]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfTransformer

n_splits = 10 # nombre de splits pour le KFold
SVC = make_pipeline(CountVectorizer(ngram_range=(1,2)),
    FunctionTransformer(lambda x: x.toarray()),
    OneVsRestClassifier(LinearDiscriminantAnalysis())
)
cv = StratifiedKFold(n_splits, shuffle=True)
scores = []
for category in categories:
    scores.append(np.mean(cross_val_score(SVC, X_extract, y[category], cv=cv, scoring='roc_auc')))
    print("score", category, ":", scores[-1])
print("total score", np.mean(scores))

score category_1 : 0.6970015698587128
score category_2 : 0.5397421199662661
score category_3 : 0.6981216787439614
score category_4 : 0.7239331856773717
total score 0.664699638561578


In [48]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

SVC = make_pipeline(CountVectorizer(),
    OneVsRestClassifier(LinearSVC())
)

scores = []
y_pred = pd.DataFrame()
X_train
SVC.fit(X_train, y_train['category_1'])


for category in categories:
    SVC.fit(X_train, y_train[category])
    y_pred[category] = SVC.predict(X_test)
    scores.append(roc_auc_score(y_test[category], y_pred[category]))
for index, row in y_pred.iterrows():
    if (row['category_1'] == 0) and (row['category_2'] == 0) and (row['category_3'] == 0):
        row['category_4'] = 1
scores[3] = roc_auc_score(y_test['category_4'], y_pred['category_4'])
for idx, category in enumerate(categories):
    print("score", category, ":", scores[idx])

score category_1 : 0.6891025641025642
score category_2 : 0.7928571428571429
score category_3 : 0.734006734006734
score category_4 : 0.6785714285714286


L'utilisation de stopwords n'augmente pas les résultats, et other_stopwords diminue les résultats

## Logistic Regression

In [51]:
from sklearn.linear_model import LogisticRegression

n_splits = 5 # nombre de splits pour le KFold
LR = make_pipeline(CountVectorizer(),
    OneVsRestClassifier(LogisticRegression(solver='sag', max_iter=10000))
)
cv = StratifiedKFold(n_splits, shuffle=True)
scores = []
for category in categories:
    scores.append(np.mean(cross_val_score(LR, X_extract, y[category], cv=cv, scoring='roc_auc')))
    print("score", category, ":", scores[-1])
print("total score", np.mean(scores))

score category_1 : 0.7655555555555555
score category_2 : 0.8531345441402367
score category_3 : 0.8665935012302792
score category_4 : 0.7476290262405174
total score 0.8082281567916472


Le meilleur classificateur semble être le Linear SVC

## Semi-supervised Learning

In [66]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.semi_supervised import LabelSpreading
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split

# extract labeled data
X_labeled = pd.read_csv(DIRECTORY + 'X_train.csv', sep=';')
X_labeled = X_labeled['Caption']
y_labeled = pd.read_csv(DIRECTORY + 'y_train.csv', sep=';').drop(columns=['Id'])

# extract unlabeled data
X_unlabeled = pd.read_csv(DIRECTORY + 'nonlabeled_data.csv', sep=';')
X_unlabeled = X_unlabeled['Caption']
nb_unlabeled = X_unlabeled.shape[0]
y_unlabeled = pd.DataFrame([[-1, -1, -1, -1] for i in range(nb_unlabeled)], columns=categories)

# création du jeu d'entraînement et de test à partir des données labelisées
X_train, X_test, y_train, y_test = train_test_split(X_labeled, y_labeled, test_size=0.1)

preprocess_pipeline = make_pipeline(
    CountVectorizer(),
    TfidfTransformer(),
    FunctionTransformer(lambda x: x.toarray()),
)
model_pipeline = make_pipeline(
    BaggingClassifier(LinearSVC())
)
X = pd.concat([X_train, X_unlabeled])
X = preprocess_pipeline.fit_transform(X)
X_test = preprocess_pipeline.transform(X_test)
y = pd.concat([y_train, y_unlabeled])
y_label_spread = pd.DataFrame()
y_pred = pd.DataFrame()
for category in categories:
    ls = LabelSpreading()
    ls.fit(X, y[category])
    y_label_spread[category] = ls.transduction_
    model_pipeline.fit(X, y_label_spread[category])
    y_pred[category] = ls.predict(X_test)
    print("score", category, ":", roc_auc_score(y_test[category], y_pred[category]))

score category_1 : 0.5473484848484849
score category_2 : 0.5555555555555556
score category_3 : 0.5391666666666667
score category_4 : 0.5813953488372092


In [19]:
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.model_selection import StratifiedShuffleSplit

from sklearn.metrics import roc_auc_score

# extract labeled data
X_labeled = pd.read_csv(DIRECTORY + 'X_train.csv', sep=';')
#X_labeled = X_labeled['Caption']
X_labeled = preprocess_data(X_labeled['Caption'])

y_labeled = pd.read_csv(DIRECTORY + 'y_train.csv', sep=';').drop(columns=['Id'])

# extract unlabeled data
X_unlabeled = pd.read_csv(DIRECTORY + 'nonlabeled_data.csv', sep=';')
#X_unlabeled = X_unlabeled['Caption']
X_unlabeled = preprocess_data(X_unlabeled['Caption'])
nb_unlabeled = X_unlabeled.size
y_unlabeled = pd.DataFrame([[-1, -1, -1, -1] for i in range(nb_unlabeled)], columns=categories)


SVC_pipeline = make_pipeline(
    CountVectorizer(),
    SelfTrainingClassifier(SVC(kernel='linear', probability=True)),
)
y_pred = pd.DataFrame()
n_splits = 5
for category in categories:
    score = 0
    sss = StratifiedShuffleSplit(n_splits, test_size=0.1)
    for train_idx, test_idx in sss.split(X_labeled, y_labeled[category]):
        X_train = pd.concat([X_labeled[train_idx], X_unlabeled])
        y_train = pd.concat([y_labeled[category][train_idx], y_unlabeled[category]])
        SVC_pipeline.fit(X_train, y_train)
        y_pred[category] = SVC_pipeline.predict(X_labeled[test_idx])
        score += roc_auc_score(y_labeled[category][test_idx], y_pred[category])
    print("score", category, ":", score/n_splits)



score category_1 : 0.7328571428571429
score category_2 : 0.7409926470588235
score category_3 : 0.7738333333333334
score category_4 : 0.5286821705426357


## Prédiction pour Kaggle

In [21]:
from sklearn.svm import LinearSVC

X_test = pd.read_csv(DIRECTORY + 'X_test.csv', sep=';')
index = X_test['Id']
X_test = preprocess_data(X_test['Caption'])
X_test = X_test.apply(lambda x: ' '.join(x))
#X_test = X_test['Caption']

SVC = make_pipeline(
    CountVectorizer(),
    OneVsRestClassifier(LinearSVC())
)
y_pred = pd.DataFrame(index, columns=['Id'])
for category in categories:
    SVC.fit(X, y[category])
    y_pred[category] = SVC.predict(X_test)
y_pred

Unnamed: 0,Id,category_1,category_2,category_3,category_4
0,599,0,0,0,0
1,600,0,0,0,0
2,602,0,0,0,0
3,603,0,0,0,0
4,604,0,0,0,0
...,...,...,...,...,...
152,798,0,0,0,0
153,799,0,0,0,0
154,800,0,0,0,0
155,801,0,0,0,0


In [15]:
X_test = pd.read_csv(DIRECTORY + 'X_test.csv', sep=';')
index = X_test['Id']
X_test = preprocess_data(X_test['Caption'])
#X_test = X_test['Caption']

NB = make_pipeline(
    CountVectorizer(),
    TfidfTransformer(),
    OneVsRestClassifier(MultinomialNB())
)

y_pred = pd.DataFrame(index, columns=['Id'])
for category in categories:
    NB.fit(X, y[category])
    y_pred[category] = NB.predict(X_test)
y_pred



Unnamed: 0,Id,category_1,category_2,category_3,category_4
0,599,0,0,0,0
1,600,0,0,1,0
2,602,0,0,1,0
3,603,0,1,0,0
4,604,0,0,0,0
...,...,...,...,...,...
152,798,0,1,0,0
153,799,0,0,1,0
154,800,0,0,0,0
155,801,0,1,0,0


In [21]:
X_test = pd.read_csv(DIRECTORY + 'X_test.csv', sep=';')
index = X_test['Id']
X_test = preprocess_data(X_test['Caption'])
#X_test = X_test['Caption']

y_pred = pd.DataFrame(index, columns=['Id'])
for category in categories:
    y_pred[category] = SVC_pipeline.predict(X_test)
y_pred



Unnamed: 0,Id,category_1,category_2,category_3,category_4
0,599,0,0,0,0
1,600,0,0,0,0
2,602,0,0,0,0
3,603,0,0,0,0
4,604,0,0,0,0
...,...,...,...,...,...
152,798,1,1,1,1
153,799,0,0,0,0
154,800,0,0,0,0
155,801,0,0,0,0


In [16]:
y_pred.to_csv("results/y_linear_Naives_Bayes_clean.csv", index=False)