# Challenge Data : NLP

Extraction des données à partir des fichiers .CSV

In [2]:
import pandas as pd
import numpy as np

DIRECTORY = "challenge_dataset/"

# extract data
X = pd.read_csv(DIRECTORY + 'X_train.csv', sep=';')
y = pd.read_csv(DIRECTORY + 'y_train.csv', sep=';').drop(columns=['Id'])

categories = list(y.columns)
X = X['Caption']

## Algorithme SVC

Test du modèle avec score roc_auc

In [5]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

n_splits = 10 # nombre de splits pour le KFold
SVC = make_pipeline(
    CountVectorizer(),
    OneVsRestClassifier(LinearSVC())
)
cv = StratifiedKFold(n_splits, shuffle=True)
scores = []
for category in categories:
    scores.append(np.mean(cross_val_score(SVC, X, y[category], cv=cv, scoring='roc_auc')))
    print("score", category, ":", scores[-1])
print("total score", np.mean(scores))

score category_1 : 0.8113500784929355
score category_2 : 0.8338367067257011
score category_3 : 0.8706727053140095
score category_4 : 0.8065540789959394
total score 0.8306033923821464


## Preprocessing des mots

In [3]:
import spacy

def preprocess_data(data_input: pd.Series):
    nlp = spacy.load('fr_core_news_md')

    def tokenize_and_lemmatize(sentence):
        doc = nlp(sentence)
        return [token.lemma_ for token in doc if not (token.is_stop or token.is_punct or token.is_digit or '\r\n' in token.text)]

    lemmas = [tokenize_and_lemmatize(sentence) for sentence in data_input]

    return pd.Series(lemmas).apply(lambda x: ' '.join(x))

X_clean = preprocess_data(X)

Les résultats avec le preprocessing sont généralement moins ou aussi bons que sans, nous ne l'utilisons donc pas pour la suite

## Naives Bayes

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

n_splits = 10 # nombre de splits pour le KFold
NB = make_pipeline(
    CountVectorizer(),
    TfidfTransformer(),
    OneVsRestClassifier(MultinomialNB())
)
cv = StratifiedKFold(n_splits, shuffle=True)
scores = []
for category in categories:
    scores.append(np.mean(cross_val_score(NB, X, y[category], cv=cv, scoring='roc_auc')))
    print("score", category, ":", scores[-1])
print("total score", np.mean(scores))

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

n_splits = 5 # nombre de splits pour le KFold
LR = make_pipeline(CountVectorizer(),
    OneVsRestClassifier(LogisticRegression(solver='sag', max_iter=10000))
)
cv = StratifiedKFold(n_splits, shuffle=True)
scores = []
for category in categories:
    scores.append(np.mean(cross_val_score(LR, X, y[category], cv=cv, scoring='roc_auc')))
    print("score", category, ":", scores[-1])
print("total score", np.mean(scores))

score category_1 : 0.7655555555555555
score category_2 : 0.8531345441402367
score category_3 : 0.8665935012302792
score category_4 : 0.7476290262405174
total score 0.8082281567916472


## Semi-supervised Learning

On utilise le SVC en prenant en compte les données non labelisées pour augmenter le nombre de données d'entraînement

In [6]:
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.model_selection import StratifiedShuffleSplit

from sklearn.metrics import roc_auc_score

# extract labeled data
X_labeled = pd.read_csv(DIRECTORY + 'X_train.csv', sep=';')
#X_labeled = X_labeled['Caption']
X_labeled = preprocess_data(X_labeled['Caption'])

y_labeled = pd.read_csv(DIRECTORY + 'y_train.csv', sep=';').drop(columns=['Id'])

# extract unlabeled data
X_unlabeled = pd.read_csv(DIRECTORY + 'nonlabeled_data.csv', sep=';')
#X_unlabeled = X_unlabeled['Caption']
X_unlabeled = preprocess_data(X_unlabeled['Caption'])
nb_unlabeled = X_unlabeled.size
y_unlabeled = pd.DataFrame([[-1, -1, -1, -1] for i in range(nb_unlabeled)], columns=categories)


SVC_pipeline = make_pipeline(
    CountVectorizer(),
    SelfTrainingClassifier(SVC(kernel='linear', probability=True)),
)
y_pred = pd.DataFrame()
n_splits = 5
for category in categories:
    score = 0
    sss = StratifiedShuffleSplit(n_splits, test_size=0.1)
    for train_idx, test_idx in sss.split(X_labeled, y_labeled[category]):
        X_train = pd.concat([X_labeled[train_idx], X_unlabeled])
        y_train = pd.concat([y_labeled[category][train_idx], y_unlabeled[category]])
        SVC_pipeline.fit(X_train, y_train)
        y_pred[category] = SVC_pipeline.predict(X_labeled[test_idx])
        score += roc_auc_score(y_labeled[category][test_idx], y_pred[category])
    print("score", category, ":", score/n_splits)

score category_1 : 0.6857142857142857
score category_2 : 0.8009191176470589
score category_3 : 0.7943333333333334
score category_4 : 0.5096899224806202


# Résultats sur le meilleur modèle

Nos meilleurs résultats ont été obtenus avec l'aglorithme SVC linéaire

Prediction avec SVC linéaire sur les données de test

In [5]:
X_test = pd.read_csv(DIRECTORY + 'X_test.csv', sep=';')
index = X_test['Id']
X_test = X_test['Caption']

SVC = make_pipeline(
    CountVectorizer(),
    OneVsRestClassifier(LinearSVC())
)
y_pred = pd.DataFrame(index, columns=['Id'])
for category in categories:
    SVC.fit(X, y[category])
    y_pred[category] = SVC.predict(X_test)
y_pred

Unnamed: 0,Id,category_1,category_2,category_3,category_4
0,599,0,0,0,0
1,600,0,0,0,0
2,602,0,0,1,0
3,603,0,1,0,0
4,604,1,0,1,0
...,...,...,...,...,...
152,798,0,0,1,0
153,799,0,1,1,0
154,800,1,0,0,0
155,801,0,1,0,0


Export au format .CSV

In [None]:
y_pred.to_csv("results/y_pred.csv", index=False)