In [None]:
import re, string
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# Directory holding the CSV files: the `data` folder that sits next to the
# current working directory.
TOXIC_COMMENT_DATA_PATH = Path.cwd().parent / 'data'
print(TOXIC_COMMENT_DATA_PATH)

# Seed handed to the random train/validation split.
SEED = 42

In [None]:
def load_dataset(toxic_comment_data_path):
    """Read the three competition CSV files from a directory.

    Parameters
    ----------
    toxic_comment_data_path : str or pathlib.Path
        Directory containing ``train.csv``, ``test.csv`` and
        ``test_labels.csv``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_df, test_df, test_labels_df)``, in that order.
    """
    # Normalise to Path so that plain strings work with the `/` operator too.
    data_dir = Path(toxic_comment_data_path)
    train_df = pd.read_csv(data_dir / 'train.csv')
    test_df = pd.read_csv(data_dir / 'test.csv')
    test_labels_df = pd.read_csv(data_dir / 'test_labels.csv')
    return train_df, test_df, test_labels_df

In [None]:
# Load the CSV files before anything else: the following cells read
# train_df / test_df, which are otherwise never assigned in this notebook.
train_df, test_df, test_labels_df = load_dataset(TOXIC_COMMENT_DATA_PATH)
train_df.head()

In [None]:
# Show the raw text of the first training comment (row label 0).
train_df.loc[0, 'comment_text']

In [None]:
# Preview the first five rows of the test set.
test_df.head(n=5)

In [None]:
# Character count of every training comment.
lens = train_df['comment_text'].str.len()

# Mean, standard deviation and maximum of the comment lengths.
(lens.mean(), lens.std(), lens.max())

In [None]:
# Histogram of comment lengths; labelled so the figure stands on its own.
lens_ax = lens.hist()
lens_ax.set_title('Comment length distribution')
lens_ax.set_xlabel('Comment length (characters)')
lens_ax.set_ylabel('# of comments')
plt.show()

## Nombre de classes par commentaire

In [None]:
# Number of positive labels per comment.
# NOTE(review): assumes every column from position 2 onward is a label column
# (id and comment_text first) — run this before any cell that appends extra
# columns to train_df (data_preprocessing adds a 'none' column).
rowsums = train_df.iloc[:, 2:].sum(axis=1)
x = rowsums.value_counts().sort_index()

# plot
fig, ax = plt.subplots(figsize=(8, 4))
# x/y must be passed by keyword: recent seaborn releases reject positional data.
sns.barplot(x=x.index, y=x.values, alpha=0.8, ax=ax)
ax.set_title("Multiple tags per comment")
ax.set_ylabel('# of Occurrences', fontsize=12)
ax.set_xlabel('# of tags ', fontsize=12)

# adding the text labels: one count above the centre of each bar
rects = ax.patches
labels = x.values
for rect, label in zip(rects, labels):
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width() / 2, height + 5, str(label),
            ha='center', va='bottom')

plt.show()

In [None]:
# Target columns, one binary classifier is trained per entry.
LABELS = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]
# Name of the column holding the raw comment text.
COMMENT = 'comment_text'

In [None]:
# (rows, columns) of the training frame and the number of test rows.
(train_df.shape, len(test_df))

## Clean data

In [None]:
def data_preprocessing(train_df, test_df):
    """Add the 'none' indicator column and fill missing comments.

    Both frames are modified in place and also returned.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training frame; must contain the ``LABELS`` columns and ``COMMENT``.
    test_df : pandas.DataFrame
        Test frame; must contain the ``COMMENT`` column.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_df, test_df)`` — the same objects that were passed in.
    """
    # 'none' is 1 when no label in LABELS is set for the row, 0 otherwise.
    train_df['none'] = 1 - train_df[LABELS].max(axis=1)
    # Assign the filled column back instead of `df[col].fillna(inplace=True)`:
    # the in-place form works on a temporary selection, which warns on recent
    # pandas and has no effect on the frame under Copy-on-Write.
    train_df[COMMENT] = train_df[COMMENT].fillna("unknown")
    test_df[COMMENT] = test_df[COMMENT].fillna("unknown")
    return train_df, test_df

In [None]:
# Fill missing comments and add the 'none' column on both frames.
train_df, test_df = data_preprocessing(train_df=train_df, test_df=test_df)

## Building the model

In [None]:
# Matches a single punctuation character (ASCII punctuation plus a few typographic
# symbols). string.punctuation is escaped before being placed in the character
# class: unescaped, its `\]` sequence is read as an escaped bracket and the
# backslash itself is never matched.
re_tok = re.compile(f'([{re.escape(string.punctuation)}“”¨«»®´·º½¾¿¡§£₤‘’])')


def tokenize(s):
    """Split text into tokens, treating each punctuation character as a token.

    Every character matched by ``re_tok`` is surrounded with spaces, then the
    string is split on whitespace.

    Parameters
    ----------
    s : str
        Text to tokenize.

    Returns
    -------
    list of str
        Tokens in order of appearance; empty list for empty/blank input.
    """
    return re_tok.sub(r' \1 ', s).split()

In [None]:
def TF_IDF_Vectorizer(train_df, test_df, comment):
    """Fit a TF-IDF vectorizer on the training comments and transform both sets.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training frame; the vectorizer is fitted on ``train_df[comment]``.
    test_df : pandas.DataFrame
        Test frame; ``test_df[comment]`` is only transformed.
    comment : str
        Name of the text column in both frames.

    Returns
    -------
    tuple
        ``(trn_term_doc, test_term_doc, vec)``: the two document-term matrices
        and the fitted ``TfidfVectorizer``.
    """
    vec = TfidfVectorizer(
        ngram_range=(1, 2),      # unigrams and bigrams
        tokenizer=tokenize,
        token_pattern=None,      # unused with a custom tokenizer; None avoids the warning
        min_df=3,
        max_df=0.9,
        strip_accents='unicode',
        # Real booleans: recent scikit-learn validates parameter types and
        # rejects the integer 1 for these flags.
        use_idf=True,
        smooth_idf=True,
        sublinear_tf=True,
    )
    trn_term_doc = vec.fit_transform(train_df[comment])
    test_term_doc = vec.transform(test_df[comment])
    return trn_term_doc, test_term_doc, vec

In [None]:
# Document-term matrices for train and test, plus the fitted vectorizer.
trn_term_doc, test_term_doc, vec = TF_IDF_Vectorizer(
    train_df=train_df,
    test_df=test_df,
    comment=COMMENT,
)

In [None]:
# Naive Bayes
def pr(x, y_i, y):
    """Smoothed per-feature total for the rows whose label equals ``y_i``.

    Sums the rows of ``x`` selected by ``y == y_i`` along axis 0, adds 1, and
    divides by the number of selected rows plus 1.

    Parameters
    ----------
    x : matrix-like
        Feature matrix supporting boolean row selection and ``.sum(0)``.
    y_i : scalar
        Label value to select.
    y : array-like
        One label per row of ``x``; must support ``==`` and ``.sum()``.
    """
    in_class = (y == y_i)
    feature_totals = x[in_class].sum(0)
    n_in_class = in_class.sum()
    return (feature_totals + 1) / (n_in_class + 1)

In [None]:
def get_mdl_cross_val_score(X, y):
    """Mean 3-fold ROC AUC of the NB-weighted logistic regression.

    The Naive Bayes log-ratio ``r`` is derived from the labels, so it is
    recomputed inside every fold from the training part only. Computing it
    once on the full data would leak the validation labels into the features
    and inflate the score.

    Parameters
    ----------
    X : scipy sparse matrix
        Document-term matrix, one row per sample.
    y : array-like
        Binary labels (0/1), one per row of ``X``.

    Returns
    -------
    float
        Mean ROC AUC over the three stratified folds.
    """
    X = X.tocsr()          # row slicing by index array needs CSR
    y = np.asarray(y)

    # Same splitter cross_val_score(cv=3) uses for a classifier: stratified,
    # unshuffled 3-fold.
    splitter = StratifiedKFold(n_splits=3)
    fold_scores = []
    for train_idx, val_idx in splitter.split(X, y):
        X_fit, y_fit = X[train_idx], y[train_idx]
        r = np.log(pr(X_fit, 1, y_fit) / pr(X_fit, 0, y_fit))

        classifier = LogisticRegression()
        classifier.fit(X_fit.multiply(r), y_fit)

        val_proba = classifier.predict_proba(X[val_idx].multiply(r))[:, 1]
        fold_scores.append(roc_auc_score(y[val_idx], val_proba))

    cv_score = np.mean(fold_scores)
    return cv_score

In [None]:
def get_fit_mdl(x, y):
    """Fit a logistic regression on NB-weighted features.

    Parameters
    ----------
    x : scipy sparse matrix
        Document-term matrix, one row per sample.
    y : pandas.Series or array-like
        Binary labels (0/1), one per row of ``x``.

    Returns
    -------
    tuple
        ``(model, r)``: the fitted ``LogisticRegression`` and the log-ratio
        ``r`` the features were multiplied by. The same ``r`` must be applied
        to any matrix passed to the model at prediction time.
    """
    # np.asarray accepts a Series as well as a plain array, matching what
    # get_mdl_cross_val_score is given.
    y = np.asarray(y)
    r = np.log(pr(x, 1, y) / pr(x, 0, y))
    m = LogisticRegression()
    x_nb = x.multiply(r)
    return m.fit(x_nb, y), r

In [None]:
def fit_classifier_for_each_theme_and_get_its_naiveB_coef(df, train_term_doc,labels):
    """Fit one model per label column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding one target column per entry of ``labels``.
    train_term_doc : scipy sparse matrix
        Training document-term matrix, row-aligned with ``df``.
    labels : iterable of str
        Column names of ``df`` to fit, in order.

    Returns
    -------
    list of [classifier, r]
        One two-element list per label, in the order of ``labels``, as
        produced by ``get_fit_mdl``.
    """
    fitted = []
    for theme in labels:
        print('fit', theme)
        model, log_ratio = get_fit_mdl(train_term_doc, df[theme])
        fitted.append([model, log_ratio])
    return fitted

In [None]:
# Short aliases for the train / test document-term matrices.
x, test_x = trn_term_doc, test_term_doc
(x, test_x)

## Cross Val Score

In [None]:
def compute_CV_score_for_each_class(df,labels,term_doc):
    """Cross-validate the model once per label column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding one target column per entry of ``labels``.
    labels : iterable of str
        Column names of ``df`` to evaluate, in order.
    term_doc : scipy sparse matrix
        Document-term matrix, row-aligned with ``df``.

    Returns
    -------
    list
        One score per label (as returned by ``get_mdl_cross_val_score``),
        in the order of ``labels``. Each score is also printed.
    """
    per_class_scores = []
    for label in labels:
        class_score = get_mdl_cross_val_score(term_doc, df[label].values)
        per_class_scores.append(class_score)
        print('CV score for class {} is {}'.format(label, class_score))
    return per_class_scores

In [None]:
# One cross-validated score per label, on the full training matrix.
scores = compute_CV_score_for_each_class(df=train_df, labels=LABELS, term_doc=x)

In [None]:
# Average of the per-label cross-validation scores.
mean_cv_score = np.mean(scores)
print(mean_cv_score)

## Prediction

In [None]:
# Features and the six label columns for the hold-out experiment.
x, y = trn_term_doc, train_df[LABELS]

# 80/20 shuffled split, seeded for reproducibility.
x_train, x_val, y_train, y_val = train_test_split(
    x,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=SEED,
)

In [None]:
# One [classifier, r] pair per label, fitted on the training split only.
all_classifiers_and_NB_coeff = fit_classifier_for_each_theme_and_get_its_naiveB_coef(
    df=y_train,
    train_term_doc=x_train,
    labels=LABELS,
)

In [None]:
def get_prediction_and_feature_importance(x_test, classifiers_and_coef, labels):
    """Predict positive-class probabilities and scaled coefficients per label.

    Parameters
    ----------
    x_test : scipy sparse matrix
        Document-term matrix to score.
    classifiers_and_coef : sequence
        For each label, an indexable whose item 0 is a fitted classifier and
        item 1 the ratio the features are multiplied by before prediction.
    labels : sequence
        Labels, aligned by position with ``classifiers_and_coef``.

    Returns
    -------
    tuple
        ``(preds, feature_importance)``: ``preds`` is an array of shape
        ``(n_rows, len(labels))`` with column 1 of ``predict_proba``;
        ``feature_importance`` is a list with, per label, the classifier's
        first coefficient row rescaled so that its maximum equals 100.
    """
    n_rows = x_test.shape[0]
    n_labels = len(labels)
    preds = np.zeros((n_rows, n_labels))
    feature_importance = []

    for col in range(n_labels):
        entry = classifiers_and_coef[col]
        model = entry[0]
        ratio = entry[1]

        weighted = x_test.multiply(ratio)
        preds[:, col] = model.predict_proba(weighted)[:, 1]

        # Express each coefficient as a percentage of the largest one.
        raw_coef = model.coef_[0]
        feature_importance.append(100.0 * (raw_coef / raw_coef.max()))

    return preds, feature_importance

#### Prédiction sur x_val (jeu de validation labellisé)

In [None]:
# Score the labelled validation split.
# Note: the result is a (preds, feature_importance) tuple, not just predictions.
y_preds = get_prediction_and_feature_importance(
    x_test=x_val,
    classifiers_and_coef=all_classifiers_and_NB_coeff,
    labels=LABELS,
)

#### Prédiction sur test_df (jeu de test non labellisé)

In [None]:
# Score the unlabelled test set.
# Note: the result is a (preds, feature_importance) tuple, not just predictions.
y_preds_unlabelized = get_prediction_and_feature_importance(
    x_test=test_x,
    classifiers_and_coef=all_classifiers_and_NB_coeff,
    labels=LABELS,
)