In [None]:
%matplotlib inline

import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec 
import matplotlib_venn as venn
import seaborn as sns
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer 
from nltk.tokenize import word_tokenize
from nltk.tokenize import TweetTokenizer 
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer


import warnings
# Default seaborn palette; individual entries are used to colour bar charts later on
color = sns.color_palette()
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices from pandas / seaborn / scikit-learn.
warnings.filterwarnings("ignore")
# English stop words from the NLTK corpus, held as a set
eng_stopwords = set(stopwords.words("english"))
# Shared tokenizer and lemmatizer instances used by the corpus-cleaning step
tokenizer = TweetTokenizer()
lem = WordNetLemmatizer()

In [None]:
# Read the training and test CSV files from the working directory
train, test = (pd.read_csv(csv_name) for csv_name in (r'train.csv', r'test.csv'))
# sub = pd.read_csv(r'sample_submission.csv')

In [None]:
# Preview the first rows of the training frame
train.head()

In [None]:
# Per-column flag: True where the column holds at least one missing value
train.isnull().any()

Comments that do not carry any of the flags are clean. Let's determine how many there are.

In [None]:
# Sum the six label columns by name rather than by position. The positional
# slice `train.iloc[:, 2:]` also picks up the `clean` column created below when
# this cell is executed a second time, which would mark every comment as not clean.
LABEL_COLS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
rowsums = train[LABEL_COLS].sum(axis=1)

# A comment is clean when none of the six labels is set
train['clean'] = (rowsums == 0)

In [None]:
# Report the number of clean comments and their share of all training rows
n_clean = train['clean'].sum()
clean_share = n_clean / train.shape[0]
print('Number of clean comments: {:,} or {:.2%}'.format(n_clean, clean_share))

In [None]:
# train['comment_text'].loc[train['toxic'] == 1][:1].values[0]

Let's take a peek at some of these comments.

In [None]:
# Print the first flagged comment for every column after `id` and `comment_text`
for label in train.columns[2:]:
    flagged = train.loc[train[label] == 1, 'comment_text']
    print('[{}] \t {}'.format(label, flagged.values[0]))

What a bunch of meanies.

It looks like a comment can have multiple classifications (ie being toxic as well as obscene and an insult).

In [None]:
# Label totals without (w) and with (x) the derived `clean` column
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
w = train[label_cols].sum(axis=0)
x = train[label_cols + ['clean']].sum(axis=0)

fig, ax = plt.subplots(1, 2, figsize=(16, 6))
# Keyword arguments: recent seaborn releases no longer accept x/y positionally
a = sns.barplot(x=w.index, y=w.values, ax=ax[0])
b = sns.barplot(x=x.index, y=x.values, ax=ax[1])

# Write each count just above its bar. Every chart is annotated from its own
# totals (the left chart previously borrowed the labels of the right one).
for chart, totals in ((a, w), (b, x)):
    for rect, label in zip(chart.patches, totals.values):
        height = rect.get_height()
        chart.text(rect.get_x() + rect.get_width() / 2, height + 5, label, ha='center', va='bottom')

a.set_ylabel('Count', fontsize=14)
b.set_ylabel('Count', fontsize=14)
a.set_xlabel('Label ', fontsize=14)
b.set_xlabel('Label ', fontsize=14)
plt.show()

As expected, the label counts are not evenly distributed. Toxic comments heavily outweigh any other label, while threats are quite rare.

However, toxic comments may have multiple labels.

In [None]:
# Number of comments per count of simultaneous labels.
# sort_index() matters: seaborn draws numeric categories in ascending order,
# while value_counts() returns them by descending frequency. Without the sort
# the text labels below could be attached to the wrong bars.
x = rowsums.value_counts().sort_index()

fig, ax = plt.subplots(1, 1, figsize=(16, 6))
a = sns.barplot(x=x.index, y=x.values, ax=ax)

# Write each count just above its bar
for rect, label in zip(a.patches, x.values):
    height = rect.get_height()
    a.text(rect.get_x() + rect.get_width() / 2, height + 5, label, ha='center', va='bottom')

a.set_title('Multiple Label Counts')
a.set_ylabel('Count', fontsize=14)
a.set_xlabel('Number of multi-label counts ', fontsize=14)
plt.show()

In [None]:
# Cross-tabulate `toxic` against each of the other label columns and lay the
# resulting 2x2 tables side by side, keyed by the other label's name.
main_col = "toxic"
# Columns from position 2 up to, but excluding, the last one
temp_df = train.iloc[:, 2:-1]
other_cols = temp_df.columns[1:]

corr_mats = [pd.crosstab(temp_df[main_col], temp_df[other_col]) for other_col in other_cols]
out = pd.concat(corr_mats, axis=1, keys=other_cols)

# cell highlighting

def highlight_min(s):
    """Return one CSS string per element of `s`, marking the minimum in yellow.

    Elements equal to the minimum of the Series get
    'background-color: yellow'; all other elements get an empty string.
    """
    lowest = s.min()
    css = []
    for value in s:
        css.append('background-color: yellow' if value == lowest else '')
    return css



# Apply highlight_min column by column (axis=0) and rebind `out` to the
# resulting Styler; the bare expression on the last line renders it.
out = out.style.apply(highlight_min, axis=0)
out

From the above chart which shows the counts of labels, we see that:
    - toxic = 15294
    - severe_toxic = 1595
    - obscene = 8449
    - threat = 478
    - insult = 7877
    - identity_hate = 1405
    
So now from the confusion matrix above, we can see that:
1. A severe_toxic comment is **always** toxic
2. Almost all obscene comments are toxic
3. Almost all threats, insults, and identity-hate comments are toxic

## WordCloud - Most Frequent Words

In [None]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# Initialize stopwords: a set copy of the word list shipped with the wordcloud package
stopword = set(STOPWORDS)

In [None]:
# Word cloud over every comment flagged as clean
dfclean = train.loc[train['clean']]
cleanComments = dfclean['comment_text'].values

cloud = WordCloud(
    stopwords=stopword,
    max_words=2000,
    background_color='black',
).generate(" ".join(cleanComments))
recolored = cloud.recolor(colormap='viridis', random_state=17)

plt.figure(figsize=(10, 10))
plt.imshow(recolored, alpha=0.98)
plt.title("Most Frequent Words - Clean Comments", fontsize=20)
plt.axis("off")
plt.show()

In [None]:
# Word cloud over every comment labelled toxic
dftoxic = train[train['toxic'].eq(1)]
toxicComments = dftoxic['comment_text'].values

cloud = WordCloud(
    stopwords=stopword,
    max_words=2000,
    background_color='black',
).generate(" ".join(toxicComments))
recolored = cloud.recolor(colormap='viridis', random_state=17)

plt.figure(figsize=(10, 10))
plt.imshow(recolored, alpha=0.98)
plt.title("Most Frequent Words - Toxic Comments", fontsize=20)
plt.axis("off")
plt.show()

In [None]:
# Word cloud over every comment labelled severe_toxic
dfSeveretoxic = train[train['severe_toxic'].eq(1)]
SeveretoxicComments = dfSeveretoxic['comment_text'].values

cloud = WordCloud(
    stopwords=stopword,
    max_words=2000,
    background_color='black',
).generate(" ".join(SeveretoxicComments))
recolored = cloud.recolor(colormap='viridis', random_state=17)

plt.figure(figsize=(10, 10))
plt.imshow(recolored, alpha=0.98)
plt.title("Most Frequent Words - Severe Toxic Comments", fontsize=20)
plt.axis("off")
plt.show()

In [None]:
# Word cloud over every comment labelled obscene
dfobscene = train[train['obscene'].eq(1)]
obsceneComments = dfobscene['comment_text'].values

cloud = WordCloud(
    stopwords=stopword,
    max_words=2000,
    background_color='black',
).generate(" ".join(obsceneComments))
recolored = cloud.recolor(colormap='viridis', random_state=17)

plt.figure(figsize=(10, 10))
plt.imshow(recolored, alpha=0.98)
plt.title("Most Frequent Words - Obscene Comments", fontsize=20)
plt.axis("off")
plt.show()

In [None]:
# Word cloud over every comment labelled threat
dfthreat = train.loc[train['threat'] == 1]
# Take the text from the threat subset (this previously read from `dfobscene`,
# so the "Threat" cloud was actually a second obscene-comments cloud).
threatComments = dfthreat['comment_text'].values

cloud = WordCloud(background_color='black', max_words=2000, stopwords=stopword)
cloud.generate(" ".join(threatComments))
plt.figure(figsize=(10,10))
plt.axis("off")
plt.title("Most Frequent Words - Threat Comments", fontsize=20)
plt.imshow(cloud.recolor(colormap= 'viridis' , random_state=17), alpha=0.98)
plt.show()

In [None]:
# Word cloud over every comment labelled insult
dfinsult = train[train['insult'].eq(1)]
insultComments = dfinsult['comment_text'].values

cloud = WordCloud(
    stopwords=stopword,
    max_words=2000,
    background_color='black',
).generate(" ".join(insultComments))
recolored = cloud.recolor(colormap='viridis', random_state=17)

plt.figure(figsize=(10, 10))
plt.imshow(recolored, alpha=0.98)
plt.title("Most Frequent Words - Insult Comments", fontsize=20)
plt.axis("off")
plt.show()

In [None]:
# Word cloud over every comment labelled identity_hate
dfhate = train[train['identity_hate'].eq(1)]
hateComments = dfhate['comment_text'].values

cloud = WordCloud(
    stopwords=stopword,
    max_words=2000,
    background_color='black',
).generate(" ".join(hateComments))
recolored = cloud.recolor(colormap='viridis', random_state=17)

plt.figure(figsize=(10, 10))
plt.imshow(recolored, alpha=0.98)
plt.title("Most Frequent Words - Hate Comments", fontsize=20)
plt.axis("off")
plt.show()

## Data Cleanup & Feature Engineering

In [None]:
# Stack the first two columns of train and test (train rows first), then
# build `df` as the same data with a fresh 0..n-1 index.
first_two = slice(0, 2)
merged = pd.concat([frame.iloc[:, first_two] for frame in (train, test)])
df = merged.reset_index(drop=True)

In [None]:
# Show the text of the row labelled 0 in the merged frame
df['comment_text'][0]

Note that if we remove punctuation, we will end up removing complete sentences. Instead we will count various metrics in order to further understand the data.

In [None]:
# Number of newline-separated lines: newline characters in the text, plus one
df['sentenceCount'] = df['comment_text'].apply(lambda text: str(text).count("\n") + 1)

In [None]:
# Shared intermediates: each comment as a string, and its whitespace-split tokens
texts = df['comment_text'].map(str)
tokens = texts.map(str.split)

# Number of words
df['wordCount'] = tokens.map(len)

# Unique number of words
df['uniqueWordCount'] = tokens.map(lambda ws: len(set(ws)))

# Number of characters in the comment
df['letterCount'] = texts.map(len)

# Punctuation count
df['puncCount'] = texts.map(lambda t: sum(ch in string.punctuation for ch in t))

# Number of uppercase words
df["uppercaseCount"] = tokens.map(lambda ws: sum(w.isupper() for w in ws))

# Number of titled words (words starting with capital letter)
df["titleWordCount"] = tokens.map(lambda ws: sum(w.istitle() for w in ws))

# Number of stopwords (tokens of the lower-cased text found in `stopword`)
df["stopwordCount"] = texts.map(lambda t: sum(w in stopword for w in t.lower().split()))

# Average word length
df["meanWordLength"] = tokens.map(lambda ws: np.mean([len(w) for w in ws]))

# Unique word count percentage
unique_share = df['uniqueWordCount'] / df['wordCount']
df['wordCountPercent'] = np.round(unique_share * 100, 2)

# Punctuation percentage per comment (punctuation characters per word)
punc_share = df['puncCount'] / df['wordCount']
df['puncPercent'] = np.round(punc_share * 100, 2)

In [None]:
# Separate train/test rows: the first len(train) rows of `df` came from train
n_train = len(train)
trainFeatures, testFeatures = df.iloc[:n_train], df.iloc[n_train:]

# Join tags: every train column from position 2 onward, placed beside the features
trainTags = train.iloc[:, 2:]
trainFeatures = pd.concat([trainFeatures, trainTags], axis=1)

- Are comments with more punctuation more toxic?
- Are longer comments more toxic?

In [None]:
# Longest comments first, next to their toxicity flag.
# The previous expression, trainFeatures.loc[df['wordCount']], used the word
# counts themselves as row labels and rendered one unrelated row per comment.
trainFeatures[['wordCount', 'toxic']].sort_values(by='wordCount', ascending=False).head(10)

In [None]:
# Shape of the (wordCount, toxic) slice after sorting by word count, longest first
trainFeatures[['wordCount', 'toxic']].sort_values(by='wordCount', ascending=False).shape

In [None]:
# Summary statistics of word count for comments whose `toxic` flag is 0
trainFeatures['wordCount'].loc[trainFeatures['toxic'] == 0].describe()

In [None]:
# Side-by-side violin plots: word count and punctuation count, split by `toxic`
fig, ax = plt.subplots(1, 2, figsize=(16, 10))
a, b = [
    sns.violinplot(data=trainFeatures, x='toxic', y=metric, ax=panel)
    for metric, panel in zip(('wordCount', 'puncCount'), ax)
]

a.set_title('Word Count vs Toxicity', fontsize=14)
b.set_title('Punc Count vs Toxicity', fontsize=14)

As per violin plots, neither word count nor punctuation count seems to affect toxicity. There are huge outliers which may indicate toxic comments are more 'spammy'.

We can determine 'spam' by looking at word count versus unique word count. When the ratio of unique word count to total word count is low, it would indicate spam.

In [None]:
# Spam ratio: one minus the share of unique words (share rounded to 2 decimals)
unique_word_share = trainFeatures['uniqueWordCount'].div(trainFeatures['wordCount'])
trainFeatures['spamRatio'] = 1 - unique_word_share.round(2)

In [None]:
# Index labels of the first five rows whose spam ratio is exactly 1
trainFeatures.index[trainFeatures['spamRatio'] == 1][:5]

In [None]:
# Let's see what this comment looks like (first 250 characters).
# Select the text column by name; indexing a row Series with the integer 1
# relies on positional fallback, which pandas has deprecated.
trainFeatures['comment_text'].iloc[2420][:250]

In [None]:
# First 250 characters of the comment at row position 8705 (column selected by
# name instead of the deprecated positional `[1]` lookup on the row Series)
trainFeatures['comment_text'].iloc[8705][:250]

Sure does look like spam - and toxic as well.

In [None]:
# Violin plots restricted to comments whose spam ratio exceeds 0.75
fig, ax = plt.subplots(1, 2, figsize=(16, 10))
spammy = trainFeatures.loc[trainFeatures['spamRatio'] > 0.75]

c = sns.violinplot(data=spammy, x='toxic', y='spamRatio', ax=ax[0])
d = sns.violinplot(data=spammy, x='toxic', y='wordCount', ax=ax[1])

c.set_title('Spam Ratio vs Toxicity', fontsize=14)
d.set_title('Word Count vs Toxicity', fontsize=14)

As predicted, spam coincides with toxicity. Spammers are more toxic!

However, it's important to note that it is possible to spam and not be toxic. Let's see what that looks like:

In [None]:
# Index labels of the first five high-spam-ratio comments that are not flagged toxic
is_spammy = trainFeatures['spamRatio'].gt(0.75)
not_toxic = trainFeatures['toxic'].ne(1)
trainFeatures.index[is_spammy & not_toxic][:5]

In [None]:
# First 250 characters of the comment at row position 2567 (column selected by
# name instead of the deprecated positional `[1]` lookup on the row Series)
trainFeatures['comment_text'].iloc[2567][:250]

In this scenario, spam becomes toxic to our model too.

Let's create our own Count Vectorizer to pick up specific items.

In [None]:
# Leaky features. All patterns are raw strings: sequences such as "\d" and "\["
# inside a normal string literal are invalid escapes that Python warns about.

# IPv4-looking tokens found in each comment
df['IP'] = df["comment_text"].apply(lambda x: re.findall(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", str(x)))

# Count of IP addresses
df['countIP'] = df["IP"].apply(len)

# URLs
# NOTE(review): the pattern is greedy and only covers http://...com, so several
# links on one line collapse into one match and https links are missed.
df['URL'] = df["comment_text"].apply(lambda x: re.findall(r"http://.*com", str(x)))

# Count of URLs
df['countURL'] = df["URL"].apply(len)

# Article IDs: a d:dd pattern at the very end of the comment
df['articleID'] = df["comment_text"].apply(lambda x: re.findall(r"\d:\d\d\s{0,5}$", str(x)))
df['articleIDFlag'] = df['articleID'].apply(len)

# Username: text captured between "[[User" and "|"
df['username'] = df["comment_text"].apply(lambda x: re.findall(r"\[\[User(.*)\|", str(x)))

# Count of username mentions
df['countUsernames'] = df["username"].apply(len)

# Leaky IP: token counts over the stringified IP lists
cv = CountVectorizer()
count_feats_ip = cv.fit_transform(df["IP"].apply(str))


# Leaky usernames: token counts over the stringified username lists.
# `cv` is rebound here, so afterwards it refers to the username vectorizer.
cv = CountVectorizer()
count_feats_user = cv.fit_transform(df["username"].apply(str))

In [None]:
# Let's check some feature names.
# get_feature_names() no longer exists in recent scikit-learn releases; use
# get_feature_names_out() when available and fall back to the old name otherwise.
_feature_names = getattr(cv, "get_feature_names_out", None) or cv.get_feature_names
_feature_names()[100:115]

It may be useful to determine whether we have a lot of recurring features, such as IPs and URLs.

In [None]:
# List the columns now present on the merged feature frame
df.columns

In [None]:
# Gather the leaky columns, then split them back into train and test rows
# (the first train.shape[0] rows of `df` are the training rows).
leaky_cols = [
    "IP", "URL", "articleID", "username",
    "countIP", "countURL", "countUsernames", "articleIDFlag",
]
n_train_rows = train.shape[0]

leaky_feats = df[leaky_cols]
leaky_feats_train, leaky_feats_test = leaky_feats.iloc[:n_train_rows], leaky_feats.iloc[n_train_rows:]

In [None]:
import matplotlib_venn as venn

# Filter out items that do not contain IPs
train_IPs = leaky_feats_train.loc[(leaky_feats_train['IP'].str.len() != 0)]
test_IPs = leaky_feats_test.loc[(leaky_feats_test['IP'].str.len() != 0)]

# Obtain list of unique IPs
train_IP_list = list(set([a for b in train_IPs['IP'].tolist() for a in b]))
test_IP_list = list(set([a for b in test_IPs['IP'].tolist() for a in b]))

# Obtain common elements
common_IP_list = list(set(train_IP_list).intersection(test_IP_list))

plt.figure(figsize=(8, 8))
plt.title("Common IP addresses")
# Hand venn2 the two sets and let it derive the regions. A 3-tuple is read as
# (train-only, test-only, shared) sizes, so passing the full set sizes there
# overstated both outer regions by the size of the overlap.
venn.venn2([set(train_IP_list), set(test_IP_list)],
           set_labels=("# of unique IPs (Train)", "# of unique IPs (Test)"))
plt.show()

In [None]:
# Filter out items without URLs
train_URLs = leaky_feats_train.loc[(leaky_feats_train['URL'].str.len() != 0)]
test_URLs = leaky_feats_test.loc[(leaky_feats_test['URL'].str.len() != 0)]

# Obtain list of unique URLs. Read from the URL-filtered frames: the previous
# version used train_IPs / test_IPs, so only URLs in comments that also
# contained an IP address were counted.
train_URLs_list = list(set([a for b in train_URLs['URL'].tolist() for a in b]))
test_URLs_list = list(set([a for b in test_URLs['URL'].tolist() for a in b]))

# Obtain common elements
common_URLs_list=list(set(train_URLs_list).intersection(test_URLs_list))

plt.figure(figsize=(8, 8))
plt.title("Common URLs")
# Hand venn2 the two sets; a 3-tuple would be read as
# (train-only, test-only, shared) region sizes rather than total set sizes.
venn.venn2([set(train_URLs_list), set(test_URLs_list)],
           set_labels=("# of unique URLs (Train)","# of unique URLs (Test)"))
plt.show()

In [None]:
# Filter out items without usernames
train_users = leaky_feats_train['username'].loc[(leaky_feats_train['countUsernames'] != 0)]
test_users = leaky_feats_test['username'].loc[(leaky_feats_test['countUsernames'] != 0)]

# Obtain list of unique usernames
train_users_list = list(set([a for b in train_users.tolist() for a in b]))
test_users_list = list(set([a for b in test_users.tolist() for a in b]))

# Obtain common elements
common_users_list = list(set(train_users_list).intersection(test_users_list))

# Same figure size as the IP and URL diagrams
plt.figure(figsize=(8, 8))
plt.title("Common usernames")
# Hand venn2 the two sets; a 3-tuple would be read as
# (train-only, test-only, shared) region sizes rather than total set sizes.
venn.venn2([set(train_users_list), set(test_users_list)],
           set_labels=("# of unique usernames (Train)","# of unique usernames (Test)"))
plt.show()

The feature stability (or recurrence) of train dataset usernames in the test dataset seems to be minimal.

Therefore we can just use the IPs/URLs that train and test have in common (the intersection) in our feature engineering.

Note that it may be useful to look more into these IPs - for example, there may be invalid or blocked IPs present that we would not want to interfere with our model (https://en.wikipedia.org/wiki/Wikipedia:Database_reports/Indefinitely_blocked_IPs)

### Corpus Cleaning

In [None]:
# Apostrophe lookup dict: lower-case contraction -> expanded form.
# Each key appears once. "i'd" was previously listed twice (the later "I had"
# silently won); it now follows the "would" reading used for every other
# pronoun. "we'll" used to expand to " will", dropping the pronoun.
APPO = {
"aren't" : "are not",
"can't" : "cannot",
"couldn't" : "could not",
"didn't" : "did not",
"doesn't" : "does not",
"don't" : "do not",
"hadn't" : "had not",
"hasn't" : "has not",
"haven't" : "have not",
"he'd" : "he would",
"he'll" : "he will",
"he's" : "he is",
"i'd" : "I would",
"i'll" : "I will",
"i'm" : "I am",
"isn't" : "is not",
"it's" : "it is",
"it'll":"it will",
"i've" : "I have",
"let's" : "let us",
"mightn't" : "might not",
"mustn't" : "must not",
"shan't" : "shall not",
"she'd" : "she would",
"she'll" : "she will",
"she's" : "she is",
"shouldn't" : "should not",
"that's" : "that is",
"there's" : "there is",
"they'd" : "they would",
"they'll" : "they will",
"they're" : "they are",
"they've" : "they have",
"we'd" : "we would",
"we're" : "we are",
"weren't" : "were not",
"we've" : "we have",
"what'll" : "what will",
"what're" : "what are",
"what's" : "what is",
"what've" : "what have",
"where's" : "where is",
"who'd" : "who would",
"who'll" : "who will",
"who're" : "who are",
"who's" : "who is",
"who've" : "who have",
"won't" : "will not",
"wouldn't" : "would not",
"you'd" : "you would",
"you'll" : "you will",
"you're" : "you are",
"you've" : "you have",
"'re": " are",
"wasn't": "was not",
"we'll":"we will",
"tryin'":"trying"
}

In [None]:
# Text column of the stacked train+test frame. `merged` was built without
# resetting its index, so positional access (.iloc) is used on this Series below.
corpus = merged['comment_text']

In [None]:
def cleanComment(comment):
    """Return a cleaned, space-joined copy of `comment`.

    Steps, in order: lower-case the text, replace newlines with spaces, strip
    IPv4-looking tokens and "[[...]" wiki markup, tokenize with the
    module-level `tokenizer`, expand contractions via `APPO`, lemmatize each
    token as a verb with `lem`, and drop tokens found in `eng_stopwords`.

    Parameters
    ----------
    comment : str
        Raw comment text. Non-string values are converted with str() first,
        matching how the feature cells treat `comment_text`.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    # Lower case (str() keeps NaN / non-string cells from raising)
    comment = str(comment).lower()

    # Replace newlines with a space. Deleting them outright fused the last word
    # of one line with the first word of the next ("end\nstart" -> "endstart").
    comment = re.sub(r'\n', ' ', comment)

    # Remove IPs
    comment = re.sub(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', '', comment)

    # Remove username markup (greedy: from the first "[[" to the last "]")
    comment = re.sub(r'\[\[.*\]', '', comment)

    # Split comment (sentences) into words (tokens)
    words = tokenizer.tokenize(comment)

    # Replace apostrophes; you're --> you are
    # using basic dictionary lookup
    words = [APPO.get(word, word) for word in words]
    words = [lem.lemmatize(word, 'v') for word in words]
    words = [w for w in words if w not in eng_stopwords]

    return ' '.join(words)

In [None]:
# Raw text of the comment at position 12235
corpus.iloc[12235]

In [None]:
# The same comment after cleaning, for comparison with the raw text
cleanComment(corpus.iloc[12235])

In [None]:
# Clean entire corpus

%time clean_corpus = corpus.apply(lambda x: cleanComment(x))

### Direct Features

- Count Based Features (unigrams)

Let's create some features based on the frequency distribution of the words. We can start by taking words one at a time (unigrams).

- CountVectorizer 
    - Creates a matrix with frequency counts of each word in the text corpus
- TF IDF Vectorizer
    - Term Frequency: Count of the words (terms) in the corpus (same as CountVectorizer)
    - Inverse Document Frequency: Penalizes words that are too frequent (can be thought of as regularization)
- HashingVectorizer
    - Creates a hashmap (word to number mapping based on hashing technique) instead of a dictionary for words
 

In [None]:
# https://buhrmann.github.io/tfidf-analysis.html

def top_tfidf_feats(row, features, top_n=25):
    """Get top n tfidf values in row and return them with their corresponding feature names.

    Parameters
    ----------
    row : 1-D array of scores, positionally aligned with `features`.
    features : indexable sequence of feature names.
    top_n : maximum number of entries to return.

    Returns
    -------
    pandas.DataFrame with columns ['feature', 'tfidf'], highest score first.
    An empty `row` (or top_n=0) yields an empty frame with those two columns
    instead of raising when the column names are assigned.
    """
    # Positions of the largest scores, in descending score order
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    # Name the columns at construction so an empty result is still well-formed;
    # the local name also avoids shadowing the notebook-level `df`.
    return pd.DataFrame(top_feats, columns=['feature', 'tfidf'])

def top_feats_in_doc(Xtr, features, row_id, top_n=25):
    """Top tfidf features of a single document.

    Densifies row `row_id` of the sparse matrix `Xtr`, squeezes it to one
    dimension and ranks it with top_tfidf_feats.
    """
    dense_row = Xtr[row_id].toarray()
    scores = np.squeeze(dense_row)
    return top_tfidf_feats(scores, features, top_n)

def top_mean_feats(Xtr, features, grp_ids, min_tfidf=0.1, top_n=25):
    """Return the top n features that on average are most important amongst
    the documents in the rows identified by indices in grp_ids.

    Parameters
    ----------
    Xtr : scipy sparse matrix of tf-idf scores (documents x features).
    features : sequence of feature names aligned with the columns of Xtr.
    grp_ids : row indices selecting the documents of the group.
    min_tfidf : scores below this value are treated as 0 before averaging.
    top_n : number of features to return.

    Returns
    -------
    pandas.DataFrame as produced by top_tfidf_feats.
    """
    # Stay sparse: densifying a large group (rows x vocabulary) can need many
    # gigabytes. Only stored entries can be below the threshold in a way that
    # matters, so zeroing them in `.data` matches thresholding the dense array.
    D = Xtr[grp_ids].tocsr().copy()
    D.data[D.data < min_tfidf] = 0

    # Column sums divided by the number of rows. The division is done in numpy
    # so an empty group yields NaN means, as the dense np.mean did.
    tfidf_means = np.asarray(D.sum(axis=0)).ravel() / D.shape[0]
    return top_tfidf_feats(tfidf_means, features, top_n)

# modified for multilabel multiclass
def top_feats_by_class(Xtr, features, min_tfidf=0.1, top_n=20):
    """Return a list of dfs, where each df holds top_n features and their mean
    tfidf value calculated across documents with the same class label.

    One frame is produced per column of the notebook-level `trainTags` frame,
    in column order. `Xtr` must have one row per row of `trainTags`.
    """
    dfs = []
    # `trainTags` is the frame defined earlier in the notebook; the previous
    # `train_tags` spelling does not exist and raised NameError.
    for col in trainTags.columns:
        # Row positions (not index labels) of the documents carrying this label
        ids = np.flatnonzero(trainTags[col].values == 1)
        feats_df = top_mean_feats(Xtr, features, ids, min_tfidf=min_tfidf, top_n=top_n)
        # Record which label the frame belongs to (was the undefined name `label`)
        feats_df.label = col
        dfs.append(feats_df)
    return dfs

In [None]:
# Parameter Descriptions

# min_df=10; ignores terms that appear in fewer than 10 documents
# max_features=10000; keeps only the 10k most frequent terms (memory capacity)
# analyzer='word'; creates features from words
# ngram_range=(1,1); use only one word at a time (ie unigram)
# strip_accents='unicode'; removes accents
# use_idf=True, smooth_idf=True; enables (smoothed) IDF weighting
# sublinear_tf=True; apply sublinear scaling - ie replace tf with 1 + log(tf)


startUnigrams = time.time()

# The class is TfidfVectorizer; the previous `TfidVectorizer` spelling raised NameError
tfv = TfidfVectorizer(min_df=10, max_features=10000, strip_accents='unicode', analyzer='word', ngram_range=(1,1),
                      use_idf=True, smooth_idf=True, sublinear_tf=True, stop_words='english')
tfv.fit(clean_corpus)
# get_feature_names() no longer exists in recent scikit-learn releases; use
# get_feature_names_out() when available and fall back to the old name otherwise.
_feature_names = getattr(tfv, "get_feature_names_out", None) or tfv.get_feature_names
features = np.array(_feature_names())

# Rows are split positionally: the first train.shape[0] entries are training comments
train_unigrams = tfv.transform(clean_corpus.iloc[:train.shape[0]])
test_unigrams = tfv.transform(clean_corpus.iloc[train.shape[0]:])

In [None]:
# Fetch top 'n' for Unigrams: one frame of top features per label column

tfidf_top_n_per_class = top_feats_by_class(train_unigrams, features)

# Stop the timer only after the ranking; startUnigrams was set in the previous
# cell, so the figure below also includes the vectorizer fit and transforms.
endUnigrams = time.time()

print("Time to compute unigrams: {:.2f}".format(endUnigrams - startUnigrams))

### TF-IDF Output

In [None]:
# One panel per label in a 4x2 grid, with the clean class spanning the last row.
# The copy-pasted panels referenced `tfidf_top_n_per_lass`, which does not
# exist; the loop below reads from `tfidf_top_n_per_class` throughout.
unigram_panels = [
    ("Toxic", (0, 0)),
    ("Severe Toxic", (0, 1)),
    ("Obscene", (1, 0)),
    ("Threat", (1, 1)),
    ("Insult", (2, 0)),
    ("Identity Hate", (2, 1)),
]

plt.figure(figsize=(16, 22))
plt.suptitle("TF-IDF Top Words Per Class (Unigrams)", fontsize=20)

for class_idx, (class_title, grid_pos) in enumerate(unigram_panels):
    panel = plt.subplot2grid((4, 2), grid_pos)
    top_words = tfidf_top_n_per_class[class_idx].iloc[0:9]
    # Keyword x/y: recent seaborn releases no longer accept them positionally
    sns.barplot(x=top_words['feature'], y=top_words['tfidf'], color=color[class_idx], ax=panel)
    panel.set_title("Class: {}".format(class_title), fontsize=15)
    panel.set_xlabel("Word", fontsize=12)
    panel.set_ylabel("TF-IDF Score", fontsize=12)

# Clean class: full-width panel showing more words, default palette
panel = plt.subplot2grid((4, 2), (3, 0), colspan=2)
top_words = tfidf_top_n_per_class[6].iloc[0:19]
sns.barplot(x=top_words['feature'], y=top_words['tfidf'], ax=panel)
panel.set_title("Class: Clean", fontsize=15)
panel.set_xlabel("Word", fontsize=12)
panel.set_ylabel("TF-IDF Score", fontsize=12)

plt.show()

Let's do the same analysis - except for Bigrams this time.

In [None]:
startBigrams = time.time()

# min_df=10 gives the better results; raise it (e.g. to 150) for a quick run.
# The class is TfidfVectorizer; the previous `TfidVectorizer` spelling raised NameError.
tfv = TfidfVectorizer(min_df=10, max_features=30000, strip_accents='unicode', analyzer='word', ngram_range=(2,2),
                      use_idf=True, smooth_idf=True, sublinear_tf=True, stop_words='english')

tfv.fit(clean_corpus)
# get_feature_names() no longer exists in recent scikit-learn releases; use
# get_feature_names_out() when available and fall back to the old name otherwise.
_feature_names = getattr(tfv, "get_feature_names_out", None) or tfv.get_feature_names
features = np.array(_feature_names())

train_bigrams = tfv.transform(clean_corpus.iloc[:train.shape[0]])
test_bigrams = tfv.transform(clean_corpus.iloc[train.shape[0]:])



# Fetch top 'n' Bigrams. Rank the bigram matrix: the previous call passed
# `train_unigrams`, whose columns do not correspond to the bigram vocabulary.
tfidf_top_n_per_class = top_feats_by_class(train_bigrams, features)

endBigrams = time.time()

print("Time to compute bigrams: {:.2f}".format(endBigrams - startBigrams))

In [None]:
# One panel per label in a 4x2 grid, with the clean class spanning the last row.
# The copy-pasted panels referenced `tfidf_top_n_per_lass`, which does not
# exist; the loop below reads from `tfidf_top_n_per_class` throughout.
bigram_panels = [
    ("Toxic", (0, 0)),
    ("Severe Toxic", (0, 1)),
    ("Obscene", (1, 0)),
    ("Threat", (1, 1)),
    ("Insult", (2, 0)),
    ("Identity Hate", (2, 1)),
]

plt.figure(figsize=(16, 22))
plt.suptitle("TF-IDF Top Words Per Class (Bigrams)", fontsize=20)

for class_idx, (class_title, grid_pos) in enumerate(bigram_panels):
    panel = plt.subplot2grid((4, 2), grid_pos)
    top_words = tfidf_top_n_per_class[class_idx].iloc[0:5]
    # Keyword x/y: recent seaborn releases no longer accept them positionally
    sns.barplot(x=top_words['feature'], y=top_words['tfidf'], color=color[class_idx], ax=panel)
    panel.set_title("Class: {}".format(class_title), fontsize=15)
    panel.set_xlabel('Word', fontsize=12)
    panel.set_ylabel('TF-IDF Score', fontsize=12)

# Clean class: full-width panel showing more entries, default palette
panel = plt.subplot2grid((4, 2), (3, 0), colspan=2)
top_words = tfidf_top_n_per_class[6].iloc[0:9]
sns.barplot(x=top_words['feature'], y=top_words['tfidf'], ax=panel)
panel.set_title("Class: Clean", fontsize=15)
panel.set_xlabel('Word', fontsize=12)
panel.set_ylabel('TF-IDF Score', fontsize=12)

plt.show()

Now let's do it again for character n-grams (specifically, n-grams of 1 to 4 characters).

In [None]:
startChargrams = time.time()

# Character n-grams of length 1 to 4. `stop_words` is not passed because it
# only takes effect with analyzer='word'. The class is TfidfVectorizer; the
# previous `TfidVectorizer` spelling raised NameError.
tfv = TfidfVectorizer(min_df=10, max_features=30000, strip_accents='unicode', analyzer='char', ngram_range=(1,4),
                      use_idf=True, smooth_idf=True, sublinear_tf=True)

tfv.fit(clean_corpus)
# get_feature_names() no longer exists in recent scikit-learn releases; use
# get_feature_names_out() when available and fall back to the old name otherwise.
_feature_names = getattr(tfv, "get_feature_names_out", None) or tfv.get_feature_names
features = np.array(_feature_names())

train_chargrams = tfv.transform(clean_corpus.iloc[:train.shape[0]])
test_chargrams = tfv.transform(clean_corpus.iloc[train.shape[0]:])



# Fetch top 'n' character n-grams. Rank the chargram matrix: the previous call
# passed `train_unigrams`, whose columns do not correspond to this vocabulary.
tfidf_top_n_per_class = top_feats_by_class(train_chargrams, features)

endChargrams = time.time()

print("Time to compute chargrams: {:.2f}".format(endChargrams - startChargrams))

### Model Building

In [None]:
# List the columns available on the training feature frame
trainFeatures.columns

In [None]:
# Hand-crafted count / ratio features used as model inputs
SELECTED_COLS = [
    'sentenceCount', 'wordCount', 'uniqueWordCount', 'letterCount',
    'puncCount', 'uppercaseCount', 'titleWordCount', 'stopwordCount',
    'meanWordLength', 'wordCountPercent', 'puncPercent',
]
# The six label columns to predict
TARGET_COLS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

target_x = trainFeatures.loc[:, SELECTED_COLS]
target_y = trainTags.loc[:, TARGET_COLS]