# Importing the necessary libraries and loading data

In [None]:
import pandas as pd
# For visualizations
import matplotlib.pyplot as plt
import seaborn as sns

from wordcloud import WordCloud, STOPWORDS
from nltk.corpus import stopwords

from matplotlib_venn import venn3

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [None]:
# Training data is read from a CSV expected next to the notebook.
PATH = "train.csv"
data = pd.read_csv(PATH)
print("Shape of data=>", data.shape)
# Category list: every column after the first two.
cat = data.columns[2:].tolist()

## First draft visualisations

In [None]:
# Frequency of each category.
# Reshape to one row per (comment id, category) pair and keep only the pairs
# whose flag is positive, so the count plot tallies flagged comments.
df = (
    pd.melt(
        data,
        id_vars=['id'],
        value_vars=cat,
        var_name='Category',
        value_name='Count',
    )
    .loc[lambda melted: melted['Count'] > 0]
)
ax = sns.countplot(x='Category', data=df)
# Write each bar's height just above the bar.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate('{:.0f}'.format(bar_height), (bar.get_x() + 0.15, bar_height + 1))
plt.xticks(rotation=45, fontsize=12, ha="right")
plt.show()

In [None]:
# Plotting the counts of multi-labelled comments.
# `label_count` is the element-wise sum of the six label columns, i.e. the
# number of labels attached to each comment.
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
label_total = data[label_columns[0]]
for column in label_columns[1:]:
    label_total = label_total + data[column]
data['label_count'] = label_total

# Only comments with at least one label are plotted.
labelled_comments = data[data['label_count'] > 0]
ax = sns.countplot(x='label_count', data=labelled_comments)
# Write each bar's height just above the bar.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate('{:.0f}'.format(bar_height), (bar.get_x() + 0.15, bar_height + 0.01))
plt.show()

# Creating word clouds for each category
Source for the word cloud code below: https://www.geeksforgeeks.org/generating-word-cloud-python/

In [None]:
# Plotting word clouds for each category.
# NOTE(review): this rebinds `stopwords`, shadowing the `nltk.corpus.stopwords`
# import at the top of the notebook. The name is kept so any cell that relies
# on the set keeps working; consider renaming one of the two.
stopwords = set(STOPWORDS)

# Three clouds per row; the row count follows the number of categories so the
# grid no longer overflows when there are more than six of them.
n_cols = 3
n_rows = max(1, -(-len(cat) // n_cols))  # ceiling division without `math`

axes = []
fig = plt.figure(figsize=(25, 15))
for position, category in enumerate(cat, start=1):
    # Comments flagged with the current category.
    subdata = data[data[category] == 1]
    # Cast every comment to str, lowercase its whitespace-separated tokens and
    # join everything into the single text blob WordCloud expects.
    comment_words = "".join(
        " ".join(token.lower() for token in str(text).split()) + " "
        for text in subdata.comment_text
    )
    # A fixed seed keeps the layout identical across Restart & Run All.
    wordcloud = WordCloud(width=800, height=800,
                          background_color='black',
                          stopwords=stopwords,
                          min_font_size=10,
                          random_state=42).generate(comment_words)
    # Draw on the explicit axes instead of relying on pyplot's current axes.
    cloud_ax = fig.add_subplot(n_rows, n_cols, position)
    axes.append(cloud_ax)
    cloud_ax.set_title(category, fontsize=40)
    cloud_ax.imshow(wordcloud)
    cloud_ax.axis("off")
fig.tight_layout()
plt.show()


# Average length of comments by category

In [None]:
# get average number of words per comment

def get_avg_word_num(data):
    """Return the mean number of whitespace-separated words per comment.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``comment_text`` column.

    Returns
    -------
    float
        Average word count over all rows. Missing comments count as zero
        words, and an empty frame yields ``0.0`` instead of raising
        ``ZeroDivisionError``.
    """
    # Nothing to average; avoids dividing by zero (and a NaN mean).
    if len(data) == 0:
        return 0.0
    # Vectorised: treat missing text as empty, split on whitespace and count
    # the tokens of each comment.
    word_counts = (
        data['comment_text']
        .fillna('')
        .astype(str)
        .str.split()
        .str.len()
    )
    return float(word_counts.mean())

In [None]:
# One subset per label: the comments flagged with that label.
toxic_data = data.loc[data['toxic'].eq(1)]
sev_tox_data = data.loc[data['severe_toxic'].eq(1)]
obs_data = data.loc[data['obscene'].eq(1)]
threat_data = data.loc[data['threat'].eq(1)]
insult_data = data.loc[data['insult'].eq(1)]
identity_data = data.loc[data['identity_hate'].eq(1)]

# Comments for which all six label columns are 0.
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
non_toxic_data = data.loc[data[label_columns].eq(0).all(axis=1)]

all_dfs = [non_toxic_data, toxic_data, sev_tox_data, obs_data, threat_data, insult_data, identity_data]

# Tag every subset with a display name, stored as a plain `name` attribute on
# the frame. The names are listed in the same order as `all_dfs`.
subset_names = ['non_toxic', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
for frame, label in zip(all_dfs, subset_names):
    frame.name = label

In [None]:
# Average number of words per comment for each subset, in `all_dfs` order.
avg_word_count = []
df_names = []
# The loop variable is `subset`, not `df`, so the melted frame built for the
# category frequency plot is not overwritten by this cell.
for subset in all_dfs:
    avg_word_count.append(get_avg_word_num(subset))
    df_names.append(subset.name)

# x and y are passed by keyword: seaborn 0.12+ no longer accepts them
# positionally, and earlier releases only emitted a deprecation warning.
avg_ax = sns.barplot(x=df_names, y=avg_word_count)
avg_ax.set(xlabel='Category', ylabel='Average words per comment')
plt.xticks(rotation=45, fontsize=12, ha="right")
plt.show()

# Creating Venn-Diagrams

In [None]:
def _venn_region_size(toxic, insult, obscene):
    """Count the comments whose three flags equal the given 0/1 values."""
    in_region = (
        (data['toxic'] == toxic)
        & (data['insult'] == insult)
        & (data['obscene'] == obscene)
    )
    return data[in_region].shape[0]


# Exactly one of the three labels.
t = _venn_region_size(1, 0, 0)
i = _venn_region_size(0, 1, 0)
o = _venn_region_size(0, 0, 1)

# Exactly two of the three labels.
t_i = _venn_region_size(1, 1, 0)
t_o = _venn_region_size(1, 0, 1)
i_o = _venn_region_size(0, 1, 1)

# All three labels.
t_i_o = _venn_region_size(1, 1, 1)

# Make the diagram
plt.figure(figsize=(8, 8))
plt.title("Venn diagram for 'toxic', 'insult' and 'obscene'")
# venn3 expects region sizes in the order (Abc, aBc, ABc, abC, AbC, aBC, ABC),
# here with A = toxic, B = insult, C = obscene.
region_sizes = (t, i, t_i, o, t_o, i_o, t_i_o)
venn3(subsets=region_sizes,
      set_labels=('toxic', 'insult', 'obscene'))
plt.show()