In [1]:
import pandas as pd

In [2]:
# Show every column whenever a frame is rendered.
pd.set_option('display.max_columns', None)

# Load the labelled training comments from the zipped CSV.
df = pd.read_csv('../data/train.csv.zip')

Now that we have the data, let's take a closer look at it. We will look at the distribution of the data as well as the number of missing values and the data types.

# Evaluating data distribution and any bad values

In [3]:
# Preview the first rows to check the column layout and how the labels are encoded.
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [4]:
# Summary statistics of the numeric columns; for a 0/1 label column the mean is its positive rate.
df.describe()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
count,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0
mean,0.095844,0.009996,0.052948,0.002996,0.049364,0.008805
std,0.294379,0.099477,0.223931,0.05465,0.216627,0.09342
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0


In [5]:
toxic_types = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# For every label column, print the absolute counts followed by the proportions.
for label in toxic_types:
    label_column = df[label]
    for as_proportion in (False, True):
        print(label_column.value_counts(normalize=as_proportion))

toxic
0    144277
1     15294
Name: count, dtype: int64
toxic
0    0.904156
1    0.095844
Name: proportion, dtype: float64
severe_toxic
0    157976
1      1595
Name: count, dtype: int64
severe_toxic
0    0.990004
1    0.009996
Name: proportion, dtype: float64
obscene
0    151122
1      8449
Name: count, dtype: int64
obscene
0    0.947052
1    0.052948
Name: proportion, dtype: float64
threat
0    159093
1       478
Name: count, dtype: int64
threat
0    0.997004
1    0.002996
Name: proportion, dtype: float64
insult
0    151694
1      7877
Name: count, dtype: int64
insult
0    0.950636
1    0.049364
Name: proportion, dtype: float64
identity_hate
0    158166
1      1405
Name: count, dtype: int64
identity_hate
0    0.991195
1    0.008805
Name: proportion, dtype: float64


In [6]:
# Number of labels set on each row; 0 means the comment carries no toxic label at all.
labels_per_row = df[toxic_types].sum(axis=1)

# Absolute counts are printed, the proportions are shown as the cell result.
print(labels_per_row.value_counts())
labels_per_row.value_counts(normalize=True)

0    143346
1      6360
3      4209
2      3480
4      1760
5       385
6        31
Name: count, dtype: int64


0    0.898321
1    0.039857
3    0.026377
2    0.021808
4    0.011030
5    0.002413
6    0.000194
Name: proportion, dtype: float64

As we can see above, the large majority of the data (about 90%) has been labelled as non-toxic. With this much class imbalance, a model can look accurate while simply predicting non-toxic for everything, so we will need to be careful about that. We will also need to look at the other labels to see if they are skewed as well.

In [7]:
# Rows flagged as toxic and severe_toxic at the same time.
is_toxic = df['toxic'] == 1
is_severe = df['severe_toxic'] == 1
double_toxic = df[is_toxic & is_severe]
double_toxic.shape

(1595, 8)

In [8]:
# Every row flagged severe_toxic, whatever its other labels are.
severe_mask = df['severe_toxic'] == 1
severely_toxic = df[severe_mask]
severely_toxic.shape

(1595, 8)

In [9]:
# True means the two selections are identical, i.e. every severe_toxic row is also flagged toxic.
double_toxic.equals(severely_toxic)

True

In [10]:
# List the column names available for the filters below.
df.columns

Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate'],
      dtype='object')

In [11]:
# Comments labelled both obscene and insult.
obscene_insult = df[(df.obscene==1) & (df.insult==1)]
print(obscene_insult.shape)

# All comments labelled insult. The shape is printed explicitly because a bare
# expression in the middle of a cell is never displayed.
insult = df[df.insult==1]
print(insult.shape)

# All comments labelled obscene; the shape is shown as the cell result.
obscene = df[df.obscene==1]
obscene.shape

(6155, 8)
(7877, 8)


(8449, 8)

Interestingly, all the comments tagged as severe_toxic are also labelled as toxic. The obscene and insult labels also frequently appear together. Perhaps it will be useful to look at some of the examples that are labelled as only one or the other, to get a better idea of what defines each category.

# Looking at samples within the categories to get a better idea of the data

In [12]:
# Comments labelled obscene but not insult.
obscene_not_insult = df[(df.obscene==1) & (df.insult!=1)]
# Printed explicitly: only the last expression of a cell is displayed automatically.
print(obscene_not_insult.shape)

# Comments labelled insult but not obscene; the shape is shown as the cell result.
insult_not_obscene = df[(df.obscene!=1) & (df.insult==1)]
insult_not_obscene.shape

# NOTE: the subsets where a comment carries exactly one label are built in a later cell.


(1722, 8)

In [None]:
# Print the first ten obscene-but-not-insult comments, each followed by a divider line.
divider = '------------------'
for comment in obscene_not_insult['comment_text'].head(10):
    print(comment, divider, sep='\n')

In [20]:
toxic_types = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Per-row number of labels set. Computed once and shared by every filter below
# instead of re-summing the label columns for each subset.
label_count = df[toxic_types].sum(axis=1)
single_label = label_count == 1

# Each *_only frame holds the rows whose single label is the named one.
toxic_only = df[(df.toxic==1) & single_label]
print('toxic_only shape: ', toxic_only.shape)

severe_toxic_only = df[(df.severe_toxic==1) & single_label]
print('severe_toxic_only shape: ', severe_toxic_only.shape)

obscene_only = df[(df.obscene==1) & single_label]
print('obscene_only shape: ', obscene_only.shape)

threat_only = df[(df.threat==1) & single_label]
print('threat_only shape: ', threat_only.shape)

insult_only = df[(df.insult==1) & single_label]
print('insult_only shape: ', insult_only.shape)

identity_hate_only = df[(df.identity_hate==1) & single_label]
print('identity_hate_only shape: ', identity_hate_only.shape)

# Rows with no label at all.
okay_only = df[label_count == 0]
print('okay_only shape: ', okay_only.shape)

toxic_only shape:  (5666, 8)
severe_toxic_only shape:  (0, 8)
obscene_only shape:  (317, 8)
threat_only shape:  (22, 8)
insult_only shape:  (301, 8)
identity_hate_only shape:  (54, 8)
okay_only shape:  (143346, 8)


In [None]:
# Print the first ten toxic-only comments, each followed by a divider line.
divider = '------------------'
for comment in toxic_only['comment_text'].head(10):
    print(comment, divider, sep='\n')

Nothing stands out clearly enough to distinguish this data.

In [None]:
# Print the first ten obscene-only comments, each followed by a divider line.
divider = '------------------'
for comment in obscene_only['comment_text'].head(10):
    print(comment, divider, sep='\n')

Obscene comments seem to include heavy gaslighting, complaining, and blaming others for something the commenters are probably responsible for themselves.  People who exhibit these behaviors tend to be narcissistic.

More insights could be gained, but spending more time evaluating the data closely before getting some sort of benchmark does not seem helpful.

# Moving onto model training  

In [26]:
# Re-display the first rows as a reminder of the frame layout before splitting.
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


## a lesson learned - multilabel data split

A classic stratified split does not work with multilabelled data when there are rare label combinations that occur only once in the dataset.  As a result, I found that rare data and removed it before doing the train/validation split, then added it back to the training set afterwards.


In [48]:
# First attempt at a stratified train/validation split, kept to illustrate the lesson above.
from sklearn.model_selection import train_test_split

try:
    # Stratifying on the raw label columns treats every distinct label combination
    # as its own class; scikit-learn rejects the split when a class has one member.
    train, valid = train_test_split(df, test_size=0.2, random_state=42, stratify=df[toxic_types])
except ValueError as split_error:
    # Report the failure instead of aborting so the notebook still runs top to bottom;
    # the cells below set the singleton combinations aside and split again.
    print('Stratified split failed:', split_error)
else:
    print(train.shape, valid.shape)

(127655, 9) (31914, 9)


In [31]:
# Build a multi_label column holding, for each row, the list of label names whose value is 1.
df['multi_label'] = df[toxic_types].apply(
    lambda row: [label for label in toxic_types if row[label] == 1],
    axis=1,
)


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,multi_label
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,[]
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,[]
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,[]
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,[]
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,[]


In [33]:
# Rows without any label get the single placeholder label 'non-toxic' instead of an empty list.
df['multi_label'] = df['multi_label'].apply(
    lambda labels: labels if labels else ['non-toxic']
)

In [36]:
# Frequency of every distinct label combination, most common first.
df.multi_label.value_counts()

multi_label
[non-toxic]                                                      143346
[toxic]                                                            5666
[toxic, obscene, insult]                                           3800
[toxic, obscene]                                                   1758
[toxic, insult]                                                    1215
[toxic, severe_toxic, obscene, insult]                              989
[toxic, obscene, insult, identity_hate]                             618
[obscene]                                                           317
[insult]                                                            301
[toxic, severe_toxic, obscene, insult, identity_hate]               265
[obscene, insult]                                                   181
[toxic, severe_toxic, obscene]                                      158
[toxic, identity_hate]                                              136
[toxic, insult, identity_hate]                      

In [42]:
# Row masks for two specific label combinations: toxic + severe_toxic + threat
# together with either insult (condition1) or identity_hate (condition2).
condition1 = df[['toxic', 'severe_toxic', 'threat', 'insult']].eq(1).all(axis=1)
condition2 = df[['toxic', 'severe_toxic', 'threat', 'identity_hate']].eq(1).all(axis=1)

# Requiring exactly four labels in total excludes rows that carry any additional label.
exactly_four_labels = df[toxic_types].sum(axis=1) == 4
df[(condition1 | condition2) & exactly_four_labels]

(2, 9)

In [44]:
# Select the rows that carry one of the two rare four-label combinations.
rare_rows = df[(condition1 | condition2) & (df[toxic_types].sum(axis=1)==4)]

if len(rare_rows) > 0:
    # Set the rare rows aside and drop them from df so the stratified split can run.
    rare_data = rare_rows
    df = df.drop(rare_data.index)
elif 'rare_data' not in globals():
    # Nothing matched and nothing was set aside earlier: keep the empty selection
    # so later cells can still reference rare_data.
    rare_data = rare_rows
# Otherwise this cell was re-run after the rows were already dropped. Keep the
# rare_data from the first run instead of replacing it with an empty selection,
# which would silently lose those rows when they are added back to train.

In [47]:
# Stratified 80/20 split on the label-combination column.
common_train, valid = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['multi_label'],
)

# The rare rows were held out of the split; append them to the training portion only.
train = pd.concat([common_train, rare_data])


# Training a model

For base evaluation, I will start off using a simpler model

Afterwards, I will move on to training a larger model, and maybe even a transformer model.

NOTE: since the competition is over, I could use the labelled test set, but I will continue using the validation set to simulate a realistic competition.