In [None]:
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
from PIL import Image
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc, roc_auc_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
%matplotlib inline

In [None]:
# The six binary label columns; the model cells below train one classifier per label.
classes = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

# Both CSVs are read the same way: first row is the header, and every missing
# cell is replaced by a single space. In the cells shown, `data1` supplies the
# labels and training text, while `data` only contributes its comment text to
# the TF-IDF vocabulary.
# NOTE(review): the paths are hard-coded and relative to the working directory — confirm.
data1, data = (
    pd.read_csv(csv_path, header=0).fillna(' ')
    for csv_path in ('Users/Dell/Downloads/datac.csv', 'Users/Dell/Downloads/datab.csv')
)

# Preview the first rows of the labelled frame.
data1.head()

In [None]:
# Stack the comment text of both frames into one Series with a fresh 0..n-1 index;
# data1's comments come first, which later cells rely on when slicing `text`.
text = pd.concat(
    [data1['comment_text'], data['comment_text']],
    ignore_index=True,
)

In [None]:
# Word-level TF-IDF restricted to the 300 highest-ranked terms.
# Tokens are runs of three or more letters a-z; English stop words are dropped
# and both single words and word pairs are counted.
# The vocabulary is learned from the combined text of both CSVs.
tfidf = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'[a-z]{3,}',
    strip_accents='unicode',
    stop_words="english",
    ngram_range=(1, 2),
    sublinear_tf=True,
    max_df=50000,  # an integer max_df is an absolute document count, not a fraction
    max_features=300,
).fit(text)  # fit() returns the vectorizer itself, so the chained call is safe

# Only data1's comments are turned into a feature matrix here.
train_features = tfidf.transform(data1['comment_text'])

In [None]:
# Seeded 70/30 split of the labelled data.
# Naming: X_data1 / y_data1 are the training part, X_data / y_data the held-out part.
X_data1, X_data, y_data1, y_data = train_test_split(
    train_features.toarray(),  # dense copy of the sparse TF-IDF matrix
    data1[classes],            # one column per label
    test_size=0.3,
    random_state=0,
)

In [None]:
# Logistic-regression baseline, one binary model per label:
#   1. tune C with 3-fold cross-validation (ROC AUC) on the training split,
#   2. draw the ROC curve on the held-out split,
#   3. binarise the held-out probabilities and compute a Hamming-style score.
d = {k: [] for k in y_data.columns.tolist()}
plt.figure(0, figsize=(8, 8)).clf()
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
avg_auc = 0
for class_ in classes:
    # 'sag' is a stochastic solver; a fixed seed keeps reruns repeatable.
    logR = LogisticRegression(solver='sag', penalty='l2', random_state=0)
    model = GridSearchCV(estimator=logR, cv=3, param_grid={'C': [0.01, 0.1, 1, 10]}, scoring='roc_auc')
    model.fit(X_data1, y_data1[class_])
    print('CV Score for {} = {}'.format(class_, model.best_score_))
    prediction = model.predict_proba(X_data)
    actual = y_data[class_]
    fpr, tpr, threshold = roc_curve(actual, prediction[:, 1])
    # Binarise at the cut-off that maximises TPR - FPR.
    # NOTE(review): the cut-off is chosen on the held-out split itself, so the
    # Hamming score below is optimistic.
    best_threshold = threshold[np.argmax(tpr - fpr)]
    d[class_] = d[class_] + np.where(prediction[:, 1] >= best_threshold, 1, 0).tolist()
    # Keep the unrounded AUC for averaging; round only for display.
    AUC = roc_auc_score(actual, prediction[:, 1])
    avg_auc = avg_auc + AUC
    plt.plot(fpr, tpr, label='{} AUC = {}'.format(class_, np.round(AUC, 2)))

# One legend call after all curves are drawn is enough.
plt.legend(loc="lower right")
avg_auc = avg_auc / len(classes)
plt.title('Logistic Regression with L2 Penalty and 3 Fold CV | Mean AUC = {}'.format(np.round(float(avg_auc), 2)))

# Per-label agreement (1 = prediction equals truth) on the held-out split.
y_pred = pd.DataFrame(d)
y_true = y_data.reset_index(drop=True)  # align with y_pred's 0..n-1 index
match = {label: np.where(y_true[label] == y_pred[label], 1, 0) for label in classes}

df_match = pd.DataFrame(match)

# Fraction of the labels predicted correctly for each comment.
df_match['sum'] = df_match[classes].sum(axis=1)
df_match['hamming_match'] = df_match['sum'].astype(float) / len(classes)

# Report the score here: the next model cell rebuilds df_match, so this model's
# result would otherwise never be shown.
print('hamming score = %s' % str(np.round(df_match['hamming_match'].sum(axis=0) / len(df_match), 2)))

In [None]:
# Random-forest baseline, one binary model per label: fit on the training
# split, draw the ROC curve on the held-out split, then binarise the held-out
# probabilities for the Hamming-style score printed in the following cell.
d = {k: [] for k in y_data.columns.tolist()}
plt.figure(0, figsize=(8, 8)).clf()
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
avg_auc = 0
for class_ in classes:
    # The forest must be bound to `model`: the fit/predict calls below use that
    # name, and it would otherwise still hold the estimator from the previous cell.
    # NOTE(review): the original n_jobs value was lost; -1 (all cores) is assumed.
    # random_state pins the bootstrap sampling so reruns are repeatable.
    model = RandomForestClassifier(n_estimators=1000, n_jobs=-1, oob_score=True, random_state=0)
    model.fit(X_data1, y_data1[class_])
    prediction = model.predict_proba(X_data)
    actual = y_data[class_]
    fpr, tpr, threshold = roc_curve(actual, prediction[:, 1])
    # Binarise at the cut-off that maximises TPR - FPR.
    # NOTE(review): the cut-off is chosen on the held-out split itself, so the
    # Hamming score is optimistic.
    best_threshold = threshold[np.argmax(tpr - fpr)]
    d[class_] = d[class_] + np.where(prediction[:, 1] >= best_threshold, 1, 0).tolist()
    # Keep the unrounded AUC for averaging; round only for display.
    AUC = roc_auc_score(actual, prediction[:, 1])
    avg_auc = avg_auc + AUC
    plt.plot(fpr, tpr, label='{} AUC = {}'.format(class_, np.round(AUC, 2)))

# One legend call after all curves are drawn is enough.
plt.legend(loc="lower right")
avg_auc = avg_auc / len(classes)
plt.title('Random Forest | Mean AUC = {}'.format(np.round(float(avg_auc), 2)))

# Per-label agreement (1 = prediction equals truth) on the held-out split.
y_pred = pd.DataFrame(d)
y_true = y_data.reset_index(drop=True)  # align with y_pred's 0..n-1 index
match = {label: np.where(y_true[label] == y_pred[label], 1, 0) for label in classes}

df_match = pd.DataFrame(match)

# Fraction of the labels predicted correctly for each comment.
df_match['sum'] = df_match[classes].sum(axis=1)
df_match['hamming_match'] = df_match['sum'].astype(float) / len(classes)

In [None]:
# Mean of the per-comment match fractions in the most recently built df_match,
# rounded to two decimals ("%s" applies str() to the rounded value).
print('hamming score = %s' % np.round(df_match['hamming_match'].sum(axis=0) / len(df_match), 2))

In [None]:
# Multinomial naive Bayes baseline, one binary model per label: fit on the
# training split, draw the ROC curve on the held-out split, then binarise the
# held-out probabilities for the Hamming-style score printed in the following cell.
d = {k: [] for k in y_data.columns.tolist()}
plt.figure(0, figsize=(8, 8)).clf()
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
avg_auc = 0
for class_ in classes:
    print(class_)
    model = MultinomialNB()
    model.fit(X_data1, y_data1[class_])
    prediction = model.predict_proba(X_data)
    actual = y_data[class_]
    fpr, tpr, threshold = roc_curve(actual, prediction[:, 1])
    # Binarise at the cut-off that maximises TPR - FPR.
    # NOTE(review): the cut-off is chosen on the held-out split itself, so the
    # Hamming score is optimistic.
    best_threshold = threshold[np.argmax(tpr - fpr)]
    d[class_] = d[class_] + np.where(prediction[:, 1] >= best_threshold, 1, 0).tolist()
    # Keep the unrounded AUC for averaging; round only for display.
    AUC = roc_auc_score(actual, prediction[:, 1])
    avg_auc = avg_auc + AUC
    plt.plot(fpr, tpr, label='{} AUC = {}'.format(class_, np.round(AUC, 2)))

# One legend call after all curves are drawn is enough.
plt.legend(loc="lower right")
avg_auc = avg_auc / len(classes)
plt.title('Multinomial NB | Mean AUC = {}'.format(np.round(float(avg_auc), 2)))

# Per-label agreement (1 = prediction equals truth) on the held-out split.
y_pred = pd.DataFrame(d)
y_true = y_data.reset_index(drop=True)  # align with y_pred's 0..n-1 index
match = {label: np.where(y_true[label] == y_pred[label], 1, 0) for label in classes}

df_match = pd.DataFrame(match)

# Fraction of the labels predicted correctly for each comment.
df_match['sum'] = df_match[classes].sum(axis=1)
df_match['hamming_match'] = df_match['sum'].astype(float) / len(classes)

In [None]:
# Mean of the per-comment match fractions in the most recently built df_match,
# rounded to two decimals ("%s" applies str() to the rounded value).
print('hamming score = %s' % np.round(df_match['hamming_match'].sum(axis=0) / len(df_match), 2))

In [None]:
def top_tfidf_feats(row, features, top_n=25):
    '''Return the ``top_n`` largest values in ``row`` with their feature names.

    Parameters
    ----------
    row : 1-D sequence of scores, indexable by integer position.
    features : sequence of feature names aligned position-by-position with ``row``.
    top_n : maximum number of entries to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``'feature'`` and ``'tfidf'``, ordered from highest to lowest
        score. When nothing is selected (empty ``row`` or ``top_n=0``) the
        frame is empty but still has both columns.
    '''
    # argsort is ascending, so reverse it before taking the first top_n positions.
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    # Naming the columns in the constructor also works for an empty selection;
    # assigning df.columns afterwards raises on a frame that has zero columns.
    return pd.DataFrame(top_feats, columns=['feature', 'tfidf'])

def top_mean_feats(Xtr, features, grp_ids, min_tfidf=0.1, top_n=25):
    '''Rank features by their mean score over a group of rows.

    The rows of ``Xtr`` selected by ``grp_ids`` are densified, every score
    below ``min_tfidf`` is counted as zero, and the column means are passed to
    ``top_tfidf_feats`` to pick the ``top_n`` best features.
    '''
    dense_rows = Xtr[grp_ids].toarray()
    # Scores under the threshold contribute nothing to the mean.
    thresholded = np.where(dense_rows < min_tfidf, 0, dense_rows)
    mean_scores = thresholded.mean(axis=0)
    return top_tfidf_feats(mean_scores, features, top_n)

# Modified for multilabel multiclass data: a row counts towards every label column where it is 1.
def top_feats_by_class(Xtr, features, min_tfidf=0.1, top_n=20):
    '''Return one top-features DataFrame per label column of ``train_tags``.

    Parameters
    ----------
    Xtr : sparse TF-IDF matrix; rows are selected with ``Xtr[ids]``.
    features : sequence of feature names aligned with the columns of ``Xtr``.
    min_tfidf : scores below this value are treated as zero when averaging.
    top_n : number of features kept per label.
        NOTE(review): the defaults were restored from a corrupted signature;
        0.1 matches ``top_mean_feats``, 20 is a best guess — confirm.

    Returns
    -------
    list of pandas.DataFrame
        One frame per column of ``train_tags``, in column order, as produced
        by ``top_mean_feats``. Each frame carries its column name in a
        ``label`` attribute.

    Notes
    -----
    Reads the notebook-level ``train_tags`` frame, which must be defined before
    this function is called. Its index values are used as row positions in
    ``Xtr``.
    NOTE(review): this assumes ``train_tags`` has a default 0..n-1 index — confirm.
    '''
    dfs = []
    for col in train_tags.columns:
        # Rows that carry the current label.
        ids = train_tags.index[train_tags[col] == 1]
        feats_df = top_mean_feats(Xtr, features, ids, min_tfidf=min_tfidf, top_n=top_n)
        # Tag the frame with its own label rather than the whole class list.
        feats_df.label = col
        dfs.append(feats_df)
    return dfs

In [None]:
# Label frame read by top_feats_by_class(). Selecting the columns by name keeps
# them in the same order as `classes`, which the plotting loop relies on when
# it indexes tfidf_top_n_per_class by position.
train_tags = data1[classes]

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# the installed version has it and fall back to the old name otherwise.
feature_names_fn = getattr(tfidf, 'get_feature_names_out', None) or tfidf.get_feature_names
features = np.array(feature_names_fn())

# The first data1.shape[0] rows of `text` are data1's comments. The vectorizer
# is already fitted by this point (its feature names are read just above), so
# it is not refitted here.
train_unigrams = tfidf.transform(text.iloc[:data1.shape[0]])
tfidf_top_n_per_class = top_feats_by_class(train_unigrams, features)

color = sns.color_palette()
sns.set_style("dark")

# One figure with a two-column grid, one panel per class.
n_cols = 2
n_rows = -(-len(classes) // n_cols)  # ceiling division
plt.figure(figsize=(16, 22))
# NOTE(review): the vectorizer is built with ngram_range=(1, 2), so word pairs
# can appear even though the title says "unigrams".
plt.suptitle("Top words per class (unigrams)", fontsize=20)

for i, clas in enumerate(classes):
    ax = plt.subplot2grid((n_rows, n_cols), divmod(i, n_cols))
    # Same nine rows for both axes, so words and scores stay paired.
    top_words = tfidf_top_n_per_class[i].iloc[0:9]
    # Keyword x/y works on old and new seaborn; positional data is rejected by newer releases.
    sns.barplot(x=top_words['feature'], y=top_words['tfidf'], color=color[i], ax=ax)
    ax.set_title("class: {}".format(clas), fontsize=15)
    ax.set_xlabel('word', fontsize=12)
    ax.set_ylabel('TF-IDF score', fontsize=12)

In [None]:
# Side-by-side distributions of two label columns.
# Each column is drawn on the axis that carries its own name; previously the
# 'Toxic' label sat on the insult plot and vice versa.
fig, (ax1, ax2) = plt.subplots(1, 2)
sns.distplot(data1.toxic, ax=ax1)
sns.distplot(data1.insult, ax=ax2)
ax1.set_ylabel('Toxic')
ax2.set_ylabel('Insult')