# NLP with Ensemble Method

Les scores obtenus avec la regression logistique pour les différentes catégories étaient les suivants :
- f1 score pour la category_1 : 0.1601514910747391
- f1 score pour la category_2 : 0.6212450536216092
- f1 score pour la category_3 : 0.7704773846842813
- f1 score pour la category_4 : 0.0

Là où la regression logistique semble assez bien performée sur la catégorie 3, elle n'est d'aucune utilité pour prédire la catégorie 4. Nous allons donc essayer d'améliorer ces résultats en utilisant d'autres méthodes et prendre le meilleur classificateur pour chaque catégorie

### Extraction des données

In [47]:
import pandas as pd
import numpy as np

DIRECTORY = "challenge_dataset/"

# extract data
X_extract = pd.read_csv(DIRECTORY + 'X_train.csv', sep=';').drop(columns=['Id'])
y = pd.read_csv(DIRECTORY + 'y_train.csv', sep=';').drop(columns=['Id'])

categories = list(y.columns)
print(categories)

['category_1', 'category_2', 'category_3', 'category_4']


In [81]:
import spacy

def preprocess_data(data_input: pd.Series):
    nlp = spacy.load('fr_core_news_md')

    def tokenize(sentence):
        doc = nlp(sentence)
        return [token.lemma_ for token in doc if not (token.is_stop or token.is_punct or token.is_digit or '\r\n' in token.text)]

    #tokenize data
    tokens = [tokenize(sentence) for sentence in data_input]

    return pd.Series(tokens)

X = preprocess_data(X_extract['Caption'])
X = X.apply(lambda x: ' '.join(x))
X



0                       mourir heure revoir petit enfant
1      maladie conséquence jeune génération voir situ...
2                                       sortir mal loger
3      inquiétude santé proche fragile femme battre e...
4                       bien entendre contracter maladie
                             ...                        
480                                    forme grave civid
481    inquiétude retrouver liberté action total limi...
482    incertitude être voir petit fils jusque ruptur...
483    inquiétude normal face épidémie crainte voir p...
484                                   respecter barrière
Length: 485, dtype: object

## Naives Bayes

In [82]:
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.naive_bayes import MultinomialNB

n_splits = 5 # nombre de splits pour le KFold
NB = make_pipeline(
    CountVectorizer(ngram_range=(1,2)),
    OneVsRestClassifier(MultinomialNB())
)
cv = KFold(n_splits, shuffle=True)
for category in categories:
    y_category = y[category]
    print("f1 score", category, ":", np.mean(cross_val_score(NB, X, y_category, cv=cv, scoring='f1', error_score='raise')))

f1 score category_1 : 0.5661996032760153
f1 score category_2 : 0.7225825317642476
f1 score category_3 : 0.7862591827863508
f1 score category_4 : 0.1042016806722689


## Linear SVC

In [93]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import StratifiedKFold

n_splits = 10 # nombre de splits pour le KFold
SVC = make_pipeline(
    CountVectorizer(ngram_range=(1,2), min_df=3),
    OneVsRestClassifier(LinearSVC())
)
cv = StratifiedKFold(n_splits, shuffle=True)
for category in categories:
    print("f1 score", category, ":", np.mean(cross_val_score(SVC, X, y[category], cv=cv, scoring='f1')))

f1 score category_1 : 0.5419441428137081
f1 score category_2 : 0.7275056248094575
f1 score category_3 : 0.7784471290614864
f1 score category_4 : 0.384949494949495


In [74]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfTransformer

n_splits = 10 # nombre de splits pour le KFold
SVC = make_pipeline(CountVectorizer(ngram_range=(1,2)),
    FunctionTransformer(lambda x: x.toarray()),
    OneVsRestClassifier(LinearDiscriminantAnalysis())
)
cv = StratifiedKFold(n_splits, shuffle=True)
for category in categories:
    print("f1 score", category, ":", np.mean(cross_val_score(SVC, X, y[category], cv=cv, scoring='f1')))

f1 score category_1 : 0.47407763439576805
f1 score category_2 : 0.4282271356094885
f1 score category_3 : 0.640858017304879
f1 score category_4 : 0.2115151515151515


In [39]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

SVC = make_pipeline(CountVectorizer(),
    OneVsRestClassifier(LinearSVC())
)

scores = []
y_pred = pd.DataFrame()
X_train
SVC.fit(X_train, y_train['category_1'])


for category in categories:
    SVC.fit(X_train, y_train[category])
    y_pred[category] = SVC.predict(X_test)
    scores.append(f1_score(y_test[category], y_pred[category]))
for index, row in y_pred.iterrows():
    if (row['category_1'] == 0) and (row['category_2'] == 0) and (row['category_3'] == 0):
        row['category_4'] = 1
scores[3] = f1_score(y_test['category_4'], y_pred['category_4'])
for idx, category in enumerate(categories):
    print("f1 score", category, ":", scores[idx])

f1 score category_1 : 0.6153846153846153
f1 score category_2 : 0.6
f1 score category_3 : 0.7441860465116279
f1 score category_4 : 0.09523809523809522


L'utilisation de stopwords n'augmente pas les résultats, et other_stopwords diminue les résultats

## Logistic Regression

In [41]:
from sklearn.linear_model import LogisticRegression

n_splits = 5 # nombre de splits pour le KFold
LR = make_pipeline(CountVectorizer(),
    OneVsRestClassifier(LogisticRegression(solver='sag', max_iter=10000))
)
cv = KFold(n_splits, shuffle=True)
for category in categories:
    print("f1 score", category, ":", np.mean(cross_val_score(LR, X, y[category], cv=cv, scoring='f1')))

f1 score category_1 : 0.5038190414786159
f1 score category_2 : 0.694308848501942
f1 score category_3 : 0.7739567789021589
f1 score category_4 : 0.17931623931623933


Le meilleur classificateur semble être le Linear SVC

## Semi-supervised Learning

In [49]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.semi_supervised import LabelSpreading
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split

# extract labeled data
X_labeled = pd.read_csv(DIRECTORY + 'X_train.csv', sep=';')
X_labeled = X_labeled['Caption']
y_labeled = pd.read_csv(DIRECTORY + 'y_train.csv', sep=';').drop(columns=['Id'])

# extract unlabeled data
X_unlabeled = pd.read_csv(DIRECTORY + 'nonlabeled_data.csv', sep=';')
X_unlabeled = X_unlabeled['Caption']
nb_unlabeled = X_unlabeled.shape[0]
y_unlabeled = pd.DataFrame([[-1, -1, -1, -1] for i in range(nb_unlabeled)], columns=categories)

# création du jeu d'entraînement et de test à partir des données labelisées
X_train, X_test, y_train, y_test = train_test_split(X_labeled, y_labeled, test_size=0.1)

preprocess_pipeline = make_pipeline(
    CountVectorizer(),
    TfidfTransformer(),
    FunctionTransformer(lambda x: x.toarray()),
)
model_pipeline = make_pipeline(
    OneVsRestClassifier(LinearSVC())
)
X = pd.concat([X_train, X_unlabeled])
X = preprocess_pipeline.fit_transform(X)
X_test = preprocess_pipeline.transform(X_test)
y = pd.concat([y_train, y_unlabeled])
y_label_spread = pd.DataFrame()
y_pred = pd.DataFrame()
for category in categories:
    ls = LabelSpreading()
    ls.fit(X, y[category])
    y_label_spread[category] = ls.transduction_
    model_pipeline.fit(X, y_label_spread[category])
    y_pred[category] = ls.predict(X_test)
    print("f1 score", category, ":", f1_score(y_test[category], y_pred[category]))

f1 score category_1 : 0.4285714285714285
f1 score category_2 : 0.26666666666666666
f1 score category_3 : 0.23076923076923075
f1 score category_4 : 0.23255813953488372


In [97]:
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.model_selection import StratifiedShuffleSplit

# extract labeled data
X_labeled = pd.read_csv(DIRECTORY + 'X_train.csv', sep=';')
X_labeled = preprocess_data(X_labeled['Caption'])
X_labeled = X_labeled.apply(lambda x: ' '.join(x))

y_labeled = pd.read_csv(DIRECTORY + 'y_train.csv', sep=';').drop(columns=['Id'])

# extract unlabeled data
X_unlabeled = pd.read_csv(DIRECTORY + 'nonlabeled_data.csv', sep=';')
X_unlabeled = preprocess_data(X_unlabeled['Caption'])
X_unlabeled = X_unlabeled.apply(lambda x: ' '.join(x))
nb_unlabeled = X_unlabeled.shape[0]
y_unlabeled = pd.DataFrame([[-1, -1, -1, -1] for i in range(nb_unlabeled)], columns=categories)


SVC_pipeline = make_pipeline(
    CountVectorizer(ngram_range=(1,2), min_df=3),
    TfidfTransformer(),
    SelfTrainingClassifier(SVC(kernel='linear', probability=True))
)
y_pred = pd.DataFrame()
n_splits = 5
for category in categories:
    score = 0
    sss = StratifiedShuffleSplit(n_splits, test_size=0.1)
    for train_idx, test_idx in sss.split(X_labeled, y_labeled[category]):
        X_train = pd.concat([X_labeled[train_idx], X_unlabeled])
        y_train = pd.concat([y_labeled[category][train_idx], y_unlabeled[category]])
        SVC_pipeline.fit(X_train, y_train)
        y_pred[category] = SVC_pipeline.predict(X_labeled[test_idx])
        score += f1_score(y_labeled[category][test_idx], y_pred[category])
    print("f1 score", category, ":", score/n_splits)



## Prédiction pour Kaggle

In [95]:
from sklearn.svm import LinearSVC

X_test = pd.read_csv(DIRECTORY + 'X_test.csv', sep=';')
index = X_test['Id']
X_test = preprocess_data(X_test['Caption'])
X_test = X_test.apply(lambda x: ' '.join(x))

SVC = make_pipeline(
    CountVectorizer(ngram_range=(1,2), min_df=3),
    OneVsRestClassifier(LinearSVC())
)
y_pred = pd.DataFrame(index, columns=['Id'])
for category in categories:
    SVC.fit(X, y[category])
    y_pred[category] = SVC.predict(X_test)
y_pred



Unnamed: 0,Id,category_1,category_2,category_3,category_4
0,599,0,0,0,0
1,600,0,0,0,0
2,602,0,0,1,0
3,603,0,1,0,0
4,604,0,0,1,0
...,...,...,...,...,...
152,798,0,1,0,1
153,799,0,0,1,0
154,800,1,0,0,0
155,801,0,1,0,0


In [32]:
X_test = pd.read_csv(DIRECTORY + 'X_test.csv', sep=';')
index = X_test['Id']
X_test = X_test['Caption']

y_pred = pd.DataFrame(index, columns=['Id'])
for category in categories:
    y_pred[category] = SVC_pipeline.predict(X_test)
y_pred['category_3'].value_counts()

0    153
1      4
Name: category_3, dtype: int64

In [96]:
y_pred.to_csv("results/y_linear_SVC_on_labels.csv", index=False)