# Imports

In [26]:
import gc

import numpy as np
import pandas as pd

from tqdm.auto import tqdm
from tqdm._tqdm_notebook import tqdm_notebook

tqdm_notebook.pandas()

# Setup

In [27]:
# For parallel data processing
from pandarallel import pandarallel
pandarallel.initialize()

INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


# Config

In [88]:
class Config:
    TEXT_COLUMN_NAME = "comment_text"
    data_columns = ['id', 'comment_text']
    label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

    train_1 = '../data/raw/train.csv.zip'
    test_1_data = '../data/raw/test.csv.zip'
    test_1_label = '../data/raw/test_labels.csv.zip'

    all_data_2 = '../data/raw/all_data.csv.zip'

    intermediate_1 = '../data/intermediate/train_1.csv.zip'

    final_csv = '../data/processed/train.csv.zip'



# Read Data

In [29]:
# Read validation and ruddit df

### 1st Jigsaw - Extract all data

In [30]:
train_1_df = pd.read_csv(Config.train_1)
test_1_data_df = pd.read_csv(Config.test_1_data)
test_1_label_df = pd.read_csv(Config.test_1_label)

In [31]:
# Display the read dfs
print(f'train_1_df - {len(train_1_df)} records long')
print(train_1_df.head())

print(f'test_1_df - {len(test_1_data_df)} records long')
print(test_1_data_df.head())

print(f'test_labels_1_df - {len(test_1_label_df)} records long')
print(test_1_label_df.head())

train_1_df - 159571 records long
                 id                                       comment_text  toxic  \
0  0000997932d777bf  Explanation\nWhy the edits made under my usern...      0   
1  000103f0d9cfb60f  D'aww! He matches this background colour I'm s...      0   
2  000113f07ec002fd  Hey man, I'm really not trying to edit war. It...      0   
3  0001b41b1c6bb37e  "\nMore\nI can't make any real suggestions on ...      0   
4  0001d958c54c6e35  You, sir, are my hero. Any chance you remember...      0   

   severe_toxic  obscene  threat  insult  identity_hate  
0             0        0       0       0              0  
1             0        0       0       0              0  
2             0        0       0       0              0  
3             0        0       0       0              0  
4             0        0       0       0              0  
test_1_df - 153164 records long
                 id                                       comment_text
0  00001cee341fdb12  Yo bitch

In [32]:
test_1_label_df = test_1_label_df[
    (test_1_label_df.toxic != -1) & (test_1_label_df.severe_toxic != -1) & (test_1_label_df.obscene != -1) & 
    (test_1_label_df.threat != -1) & (test_1_label_df.insult != -1) & (test_1_label_df.identity_hate != -1) 
    ]

print(f'Updated test_labels_1_df - {len(test_1_label_df)} records long')
print(test_1_label_df.head())

Updated test_labels_1_df - 63978 records long
                  id  toxic  severe_toxic  obscene  threat  insult  \
5   0001ea8717f6de06      0             0        0       0       0   
7   000247e83dcc1211      0             0        0       0       0   
11  0002f87b16116a7f      0             0        0       0       0   
13  0003e1cccfd5a40a      0             0        0       0       0   
14  00059ace3e3e9a53      0             0        0       0       0   

    identity_hate  
5               0  
7               0  
11              0  
13              0  
14              0  


#### Add labels to test data

In [33]:
def add_labels_to_df(df):
    for col in Config.label_columns:
        test_1_data_df[col] = -1

add_labels_to_df(test_1_data_df)
print(test_1_data_df.head())

                 id                                       comment_text  toxic  \
0  00001cee341fdb12  Yo bitch Ja Rule is more succesful then you'll...     -1   
1  0000247867823ef7  == From RfC == \n\n The title is fine as it is...     -1   
2  00013b17ad220c46  " \n\n == Sources == \n\n * Zawe Ashton on Lap...     -1   
3  00017563c3f7919a  :If you have a look back at the source, the in...     -1   
4  00017695ad8997eb          I don't anonymously edit articles at all.     -1   

   severe_toxic  obscene  threat  insult  identity_hate  
0            -1       -1      -1      -1             -1  
1            -1       -1      -1      -1             -1  
2            -1       -1      -1      -1             -1  
3            -1       -1      -1      -1             -1  
4            -1       -1      -1      -1             -1  


#### Add labels

In [34]:
test_labels_ids = list(set(test_1_label_df.id))
test_labels_ids[:5]

['228826e565a5a148',
 '1305d5734ae394a5',
 '9ead117632fbb72f',
 'e5deceb0375848ab',
 '8f30ec7c6cf3315b']

In [35]:
def fill_labels(row):
    if row.id in test_labels_ids:
        for col in Config.label_columns:
            row[col] = test_1_label_df[test_1_label_df.id == row.id][col].item()
    return row

test_1_data_df = test_1_data_df.parallel_apply(fill_labels, axis=1)

In [36]:
print(f'Length of non "-1" labels = {len(test_1_data_df[test_1_data_df.toxic != -1])}')

Length of non "-1" labels = 63978


In [37]:
test_1_data_df = test_1_data_df[(test_1_data_df.toxic != -1) & (test_1_data_df.severe_toxic != -1) & (test_1_data_df.obscene != -1) & 
(test_1_data_df.threat != -1) & (test_1_data_df.insult != -1) & (test_1_data_df.identity_hate != -1)]

display(test_1_data_df)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
5,0001ea8717f6de06,Thank you for understanding. I think very high...,0,0,0,0,0,0
7,000247e83dcc1211,:Dear god this site is horrible.,0,0,0,0,0,0
11,0002f87b16116a7f,"""::: Somebody will invariably try to add Relig...",0,0,0,0,0,0
13,0003e1cccfd5a40a,""" \n\n It says it right there that it IS a typ...",0,0,0,0,0,0
14,00059ace3e3e9a53,""" \n\n == Before adding a new product to the l...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
153150,fff8f64043129fa2,":Jerome, I see you never got around to this…! ...",0,0,0,0,0,0
153151,fff9d70fe0722906,==Lucky bastard== \n http://wikimediafoundatio...,0,0,0,0,0,0
153154,fffa8a11c4378854,==shame on you all!!!== \n\n You want to speak...,0,0,0,0,0,0
153155,fffac2a094c8e0e2,MEL GIBSON IS A NAZI BITCH WHO MAKES SHITTY MO...,1,0,1,0,1,0


### Merge all Jigsaw 1 competition data

In [38]:
train_1_final_df = pd.concat([train_1_df, test_1_data_df])
train_1_final_df

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
153150,fff8f64043129fa2,":Jerome, I see you never got around to this…! ...",0,0,0,0,0,0
153151,fff9d70fe0722906,==Lucky bastard== \n http://wikimediafoundatio...,0,0,0,0,0,0
153154,fffa8a11c4378854,==shame on you all!!!== \n\n You want to speak...,0,0,0,0,0,0
153155,fffac2a094c8e0e2,MEL GIBSON IS A NAZI BITCH WHO MAKES SHITTY MO...,1,0,1,0,1,0


**Save to disk**

In [42]:
train_1_final_df.to_csv(Config.intermediate_1, index=False, compression='zip')

**CleanUp**

In [44]:
del train_1_final_df, test_1_data_df, test_labels_ids, train_1_df, test_1_label_df
gc.collect()

0

# 2nd Jigsaw - Extract all data

In [45]:
train_2_df = pd.read_csv(Config.all_data_2)
train_2_df

Unnamed: 0,id,comment_text,split,created_date,publication_id,parent_id,article_id,rating,funny,wow,...,white,asian,latino,other_race_or_ethnicity,physical_disability,intellectual_or_learning_disability,psychiatric_or_mental_illness,other_disability,identity_annotator_count,toxicity_annotator_count
0,1083994,He got his money... now he lies in wait till a...,train,2017-03-06 15:21:53.675241+00,21,,317120,approved,0,0,...,,,,,,,,,0,67
1,650904,Mad dog will surely put the liberals in mental...,train,2016-12-02 16:44:21.329535+00,21,,154086,approved,0,0,...,,,,,,,,,0,76
2,5902188,And Trump continues his lifelong cowardice by ...,train,2017-09-05 19:05:32.341360+00,55,,374342,approved,1,0,...,,,,,,,,,0,63
3,7084460,"""while arresting a man for resisting arrest"".\...",test,2016-11-01 16:53:33.561631+00,13,,149218,approved,0,0,...,,,,,,,,,0,76
4,5410943,Tucker and Paul are both total bad ass mofo's.,train,2017-06-14 05:08:21.997315+00,21,,344096,approved,0,0,...,,,,,,,,,0,80
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1999511,1018736,Another man shamming article. If white men did...,train,2017-02-20 07:20:49.964620+00,54,,169202,approved,0,0,...,0.8,0.0,0.0,0.0,0.000000,0.0,0.0,0.00000,10,10
1999512,340016,"""no matter what is put in front of you regardi...",train,2016-06-06 06:43:04.780968+00,21,339965.0,137961,approved,0,0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.00000,10,10
1999513,919629,The Democrat party aided and abetted by it's M...,train,2017-01-30 02:44:29.168863+00,54,,164845,rejected,0,1,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.00000,11,10
1999514,5165492,I just don't find her a very good representati...,train,2017-04-22 18:42:02.442987+00,54,,328877,approved,1,0,...,0.0,0.0,0.0,0.0,0.003717,0.0,0.0,0.00000,269,10


In [67]:
train_2_df = train_2_df.rename(columns={"toxicity": "toxic", "severe_toxicity": "severe_toxic", "identity_attack": "identity_hate"})
train_2_df.columns

Index(['id', 'comment_text', 'split', 'created_date', 'publication_id',
       'parent_id', 'article_id', 'rating', 'funny', 'wow', 'sad', 'likes',
       'disagree', 'toxic', 'severe_toxic', 'obscene', 'sexual_explicit',
       'identity_hate', 'insult', 'threat', 'male', 'female', 'transgender',
       'other_gender', 'heterosexual', 'homosexual_gay_or_lesbian', 'bisexual',
       'other_sexual_orientation', 'christian', 'jewish', 'muslim', 'hindu',
       'buddhist', 'atheist', 'other_religion', 'black', 'white', 'asian',
       'latino', 'other_race_or_ethnicity', 'physical_disability',
       'intellectual_or_learning_disability', 'psychiatric_or_mental_illness',
       'other_disability', 'identity_annotator_count',
       'toxicity_annotator_count'],
      dtype='object')

In [69]:
train_2_df = train_2_df[Config.data_columns + Config.label_columns]
train_2_df

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,1083994,He got his money... now he lies in wait till a...,0.373134,0.044776,0.089552,0.014925,0.343284,0.000000
1,650904,Mad dog will surely put the liberals in mental...,0.605263,0.013158,0.065789,0.065789,0.565789,0.092105
2,5902188,And Trump continues his lifelong cowardice by ...,0.666667,0.015873,0.031746,0.000000,0.666667,0.047619
3,7084460,"""while arresting a man for resisting arrest"".\...",0.815789,0.065789,0.552632,0.105263,0.684211,0.000000
4,5410943,Tucker and Paul are both total bad ass mofo's.,0.550000,0.037500,0.337500,0.000000,0.487500,0.037500
...,...,...,...,...,...,...,...,...
1999511,1018736,Another man shamming article. If white men did...,0.400000,0.000000,0.000000,0.000000,0.300000,0.200000
1999512,340016,"""no matter what is put in front of you regardi...",0.400000,0.000000,0.000000,0.000000,0.000000,0.400000
1999513,919629,The Democrat party aided and abetted by it's M...,0.400000,0.000000,0.200000,0.000000,0.300000,0.300000
1999514,5165492,I just don't find her a very good representati...,0.400000,0.000000,0.000000,0.000000,0.100000,0.200000


# Merge All data

In [85]:
train_1_df = pd.read_csv(Config.intermediate_1, compression='zip')
train_1_df

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
223544,fff8f64043129fa2,":Jerome, I see you never got around to this…! ...",0,0,0,0,0,0
223545,fff9d70fe0722906,==Lucky bastard== \n http://wikimediafoundatio...,0,0,0,0,0,0
223546,fffa8a11c4378854,==shame on you all!!!== \n\n You want to speak...,0,0,0,0,0,0
223547,fffac2a094c8e0e2,MEL GIBSON IS A NAZI BITCH WHO MAKES SHITTY MO...,1,0,1,0,1,0


In [87]:
train_df = pd.concat([train_1_df, train_2_df])
train_df

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0.0,0.0,0.0,0.0,0.0,0.0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0.0,0.0,0.0,0.0,0.0,0.0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0.0,0.0,0.0,0.0,0.0,0.0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0.0,0.0,0.0,0.0,0.0,0.0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
1999511,1018736,Another man shamming article. If white men did...,0.4,0.0,0.0,0.0,0.3,0.2
1999512,340016,"""no matter what is put in front of you regardi...",0.4,0.0,0.0,0.0,0.0,0.4
1999513,919629,The Democrat party aided and abetted by it's M...,0.4,0.0,0.2,0.0,0.3,0.3
1999514,5165492,I just don't find her a very good representati...,0.4,0.0,0.0,0.0,0.1,0.2


In [89]:
train_df.to_csv(Config.final_csv, index=False, compression='zip')

# Create Train and Test set

# Clean everything

In [None]:
del train_1_df, train_2_df, train_df
gc.collect()

# Last words
Final processed data is available in `data/processed` directory