In [29]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
%matplotlib inline

In [2]:
# Load the cleaned tweet table; the relative path is resolved against the
# notebook's working directory.
# NOTE(review): the 'Unnamed: 0' / 'Unnamed: 0.1' columns shown in the column
# listing look like index columns written out by earlier to_csv calls -- confirm
# and drop them upstream.
clean_data = pd.read_csv(filepath_or_buffer="clean_data.csv")

In [3]:
# List the available columns before picking the text and label columns.
clean_data.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'CAPS1', 'CAPS2', 'CAPS3', 'CAPS4',
       'CAPS5', 'CAPS6', 'CAPS7', 'Obscenity1', 'Obscenity2', 'Obscenity3',
       'Obscenity4', 'Obscenity5', 'Obscenity6', 'Obscenity7', 'Threat1',
       'Threat2', 'Threat3', 'Threat4', 'Threat5', 'Threat6', 'Threat7',
       'Tweet', 'hatespeech1', 'hatespeech2', 'hatespeech3', 'hatespeech4',
       'hatespeech5', 'hatespeech6', 'hatespeech7', 'namecalling1',
       'namecalling2', 'namecalling3', 'namecalling4', 'namecalling5',
       'namecalling6', 'namecalling7', 'negprejudice1', 'negprejudice2',
       'negprejudice3', 'negprejudice4', 'negprejudice5', 'negprejudice6',
       'negprejudice7', 'noneng1', 'noneng2', 'noneng3', 'noneng4', 'noneng5',
       'noneng6', 'noneng7', 'porn1', 'porn2', 'porn3', 'porn4', 'porn5',
       'porn6', 'porn7', 'stereotypes1', 'stereotypes2', 'stereotypes3',
       'stereotypes4', 'stereotypes5', 'stereotypes6', 'stereotypes7',
       'clean_tweet', 'ID'],
      dtype='ob

In [4]:
# Load the training split. In the cells visible here only its `ID` column is
# used, to select the matching rows of `clean_data`.
train = pd.read_csv(filepath_or_buffer="train.csv")

In [5]:
# TODO: check whether train.csv contains duplicate IDs (duplicates would multiply rows in the merge below).

In [5]:
# Keep only the rows of `clean_data` whose ID appears in the training split.
# The ID list is de-duplicated first: a repeated ID on the right-hand side of an
# inner merge emits the matching `clean_data` row once per repeat, which would
# pad the training set with exact copies that can land in different CV folds.
# De-duplicating also makes this cell safe to re-run, since `train` is
# overwritten with the merge result.
train = clean_data.merge(train[['ID']].drop_duplicates(), on="ID", how='inner')

In [6]:
# Spot-check raw tweets against their cleaned text.
# `.loc` slices by index label and includes both ends, so this shows rows 1 through 10.
train[['Tweet', 'clean_tweet']].loc[1:10]

Unnamed: 0,Tweet,clean_tweet
1,RT @Wilderness: .@SecretaryJewell is opening t...,opening outdoors underprivileged kids honor la...
2,"RT @PMOIndia: All our people, Hindus, Muslims,...",people hindus muslims sikhs christians jains b...
3,RT @AVISKINSWEAT: Being non black and having c...,non black close relationships black people mak...
4,Needs and Concerns of Family Caregivers of Ame...,needs concerns family caregivers american indi...
5,.I wonder how many African-American students a...,wonder many africanamerican students classes a...
6,RT @zeynaiman: Laying in my girlfriend's bed w...,laying girlfriends bed watching ted cruz pasto...
7,RT @homeIwt: Michael Jackson // African-Americ...,michael jackson africanamerican best artist ti...
8,@EDinCali @WBVT_98FM @CNN @CBC @CJAD800 @stati...,98fm aroud cuba 18 african latin american nati...
9,All I wanna do is work in impoverished countri...,wan na work impoverished countries fascinate
10,RT @joanwalsh: The misogyny from @realDonaldTr...,misogyny combined w misogyny leftwing haters t...


In [7]:
# TF-IDF featurizer for the cleaned tweets.
tfidf = TfidfVectorizer(
    encoding='latin-1',      # only used to decode bytes/file input
    stop_words='english',    # scikit-learn's built-in English stop-word list
    ngram_range=(1, 3),      # unigrams, bigrams and trigrams
    min_df=5,                # ignore terms that occur in fewer than 5 documents
    sublinear_tf=True,       # use 1 + log(tf) instead of the raw term count
    norm='l2',               # scale each document vector to unit L2 norm
)

In [9]:
# Show the original tweets whose cleaned text is missing.
# The rows displayed appear to contain only links and @-mentions.
train.loc[train['clean_tweet'].isna(), 'Tweet']

29       https://t.co/MjOdG9SB3e https://t.co/ntSpKgYfTC 
526     @BryanFrench69 @Lying2yourselfe @charliesecho ...
580                              https://t.co/vBJe0lecJh 
582              RT @homenkovich: http://t.co/FvhAYrzs2m 
1058    https://t.co/H8TcpYOsVb... https://t.co/gEiolF...
1172                             https://t.co/ALJrEx96Nt 
2552    @Jnoubiii12 @iShabi7a @ShahTalks https://t.co/...
3178            RT @DuffMcKagan: https://t.co/bjXmca2AO3 
Name: Tweet, dtype: object

In [9]:
# Drop rows with no cleaned text; the vectorizer rejects NaN documents.
# `.notnull()` replaces the `== False` comparison, and `.copy()` makes `train` an
# independent frame so that adding columns to it later does not trigger
# SettingWithCopyWarning.
train = train.loc[train['clean_tweet'].notnull(), :].copy()

In [10]:
# Fit the TF-IDF vocabulary on the cleaned tweets and build the document-term
# matrix, converted from scipy sparse to a dense NumPy array.
# NOTE(review): LinearSVC accepts sparse input, so `.toarray()` may be an
# avoidable memory cost -- confirm nothing downstream needs a dense array.
# NOTE(review): the vectorizer is fit on every row before cross-validation, so
# vocabulary and IDF weights are learned from rows that later serve as
# validation folds; a Pipeline fit inside each fold would avoid this.
features = tfidf.fit_transform(train['clean_tweet']).toarray()

### Create Labels

In [11]:
# Label families to model. Each name is the prefix of seven numbered columns
# in the data (e.g. 'CAPS1' .. 'CAPS7').
labels = [
    'CAPS',
    'Obscenity',
    'Threat',
    'hatespeech',
    'namecalling',
    'negprejudice',
    'noneng',
    'porn',
    'stereotypes',
]

In [12]:
# Build one boolean target per label from its seven numbered columns
# (<label>1 .. <label>7; presumably one column per annotator -- TODO confirm).
# The row-wise sum is stored as <label>_num_yes, and the tweet is marked
# positive when that sum is at least 2.
for label in labels:
    vote_cols = ['{}{}'.format(label, rater) for rater in range(1, 8)]
    tally_col = label + '_num_yes'
    train[tally_col] = train[vote_cols].sum(axis=1)
    train[label] = train[tally_col] >= 2

In [13]:
# Fraction of tweets marked positive for each label (class prevalence).
train[labels].sum(axis=0) / len(train)

CAPS            0.015535
Obscenity       0.052368
Threat          0.037835
hatespeech      0.024806
namecalling     0.083939
negprejudice    0.167878
noneng          0.081934
porn            0.007517
stereotypes     0.093961
dtype: float64

### Compare Performance on Different Labels

In [30]:
# Cross-validate a linear SVM separately for each label (one binary problem per
# label), collecting per-fold F1 scores and one confusion matrix per label.
CV = 5              # number of folds, shared by scoring and prediction
RANDOM_STATE = 0    # seeds LinearSVC so reruns give the same scores
entries = []        # (label, fold index, F1) rows for the summary frame
cms = []            # one confusion matrix per label, in `labels` order
for label in labels:
    # Per-fold F1 of the positive class.
    # NOTE(review): the warning fragments in the recorded output look like
    # UndefinedMetricWarning (a fold with no predicted positives, where F1 is
    # set to 0.0) -- expected for the rarest labels, but worth confirming.
    fscores = cross_val_score(LinearSVC(random_state=RANDOM_STATE), features, train[label], scoring='f1', cv=CV)
    # Out-of-fold prediction for every row, using the same fold count as above.
    y_pred = cross_val_predict(LinearSVC(random_state=RANDOM_STATE), features, train[label], cv=CV)
    # Rows are true classes, columns are predicted classes.
    conf_mat = confusion_matrix(train[label], y_pred)
    cms.append(conf_mat)
    for fold_idx, fscore in enumerate(fscores):
        entries.append((label, fold_idx, fscore))
cv_df = pd.DataFrame(entries, columns=['label_name', 'fold_idx', 'fscore'])

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [25]:
# Mean F1 across folds for each label.
# In the recorded output, Obscenity, hatespeech and noneng look promising
# (roughly 0.63-0.67); the other labels are weak, with CAPS and stereotypes
# lowest (about 0.16 each).
cv_df.groupby('label_name')['fscore'].mean()

label_name
CAPS            0.159341
Obscenity       0.625804
Threat          0.265596
hatespeech      0.672549
namecalling     0.345708
negprejudice    0.377458
noneng          0.651508
porn            0.442597
stereotypes     0.163216
Name: fscore, dtype: float64

In [None]:
#followed this tutorial for convenience: https://towardsdatascience.com/multi-class-text-classification-with-scikit-learn-12f1e60e0a9f