# Imports

In [110]:
import gc

import numpy as np
import pandas as pd

from tqdm.auto import tqdm
from tqdm._tqdm_notebook import tqdm_notebook

from sklearn.model_selection import train_test_split

tqdm_notebook.pandas()

# Setup

In [27]:
# For parallel data processing
from pandarallel import pandarallel
pandarallel.initialize()

INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


# Config

In [156]:
class Config:
    """Column names and input/output file paths shared by the cells of this notebook.

    All paths are relative to the notebook's working directory (``../data/...``).
    """
    # Name of the comment text column (same value as the second entry of data_columns)
    TEXT_COLUMN_NAME = "comment_text"
    # Non-label columns kept from the Jigsaw frames
    data_columns = ['id', 'comment_text']
    # The six toxicity label columns shared by both Jigsaw datasets
    label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

    # Raw inputs of the first Jigsaw competition: train set, test comments, test labels
    train_1 = '../data/raw/train.csv.zip'
    test_1_data = '../data/raw/test.csv.zip'
    test_1_label = '../data/raw/test_labels.csv.zip'

    # Raw input of the second Jigsaw dataset
    all_data_2 = '../data/raw/all_data.csv.zip'

    # Raw Ruddit input
    ruddit = '../data/raw/ruddit_with_text.csv.zip'

    # Merged Jigsaw 1 train + labelled test data, written and later re-read by this notebook
    intermediate_1 = '../data/intermediate/train_1.csv.zip'

    # Outputs written in the "Save files" section
    final_all_data_csv = '../data/processed/final_all_data.csv.zip'
    final_train_data_csv = '../data/processed/final_train_data.csv.zip'
    final_test_data_csv = '../data/processed/final_test_data.csv.zip'
    jigsaw_data_csv = '../data/processed/jigsaw_toxic.csv.zip'
    ruddit_data_csv = '../data/processed/ruddit.csv.zip'

# Read Data

In [29]:
# Read validation and ruddit df

### 1st Jigsaw - Extract all data

In [30]:
# Load the three raw Jigsaw 1 files: train set, test comments and test labels
train_1_df = pd.read_csv(Config.train_1)
test_1_data_df = pd.read_csv(Config.test_1_data)
test_1_label_df = pd.read_csv(Config.test_1_label)

In [31]:
# Print the row count and the first rows of each frame that was just loaded.
# The labels in the messages (test_1_df, test_labels_1_df) refer to
# test_1_data_df and test_1_label_df respectively.
print(f'train_1_df - {len(train_1_df)} records long')
print(train_1_df.head())

print(f'test_1_df - {len(test_1_data_df)} records long')
print(test_1_data_df.head())

print(f'test_labels_1_df - {len(test_1_label_df)} records long')
print(test_1_label_df.head())

train_1_df - 159571 records long
                 id                                       comment_text  toxic  \
0  0000997932d777bf  Explanation\nWhy the edits made under my usern...      0   
1  000103f0d9cfb60f  D'aww! He matches this background colour I'm s...      0   
2  000113f07ec002fd  Hey man, I'm really not trying to edit war. It...      0   
3  0001b41b1c6bb37e  "\nMore\nI can't make any real suggestions on ...      0   
4  0001d958c54c6e35  You, sir, are my hero. Any chance you remember...      0   

   severe_toxic  obscene  threat  insult  identity_hate  
0             0        0       0       0              0  
1             0        0       0       0              0  
2             0        0       0       0              0  
3             0        0       0       0              0  
4             0        0       0       0              0  
test_1_df - 153164 records long
                 id                                       comment_text
0  00001cee341fdb12  Yo bitch

In [32]:
# Drop every label row that has -1 in any of the six label columns
test_1_label_df = test_1_label_df[(test_1_label_df[Config.label_columns] != -1).all(axis=1)]

print(f'Updated test_labels_1_df - {len(test_1_label_df)} records long')
print(test_1_label_df.head())

Updated test_labels_1_df - 63978 records long
                  id  toxic  severe_toxic  obscene  threat  insult  \
5   0001ea8717f6de06      0             0        0       0       0   
7   000247e83dcc1211      0             0        0       0       0   
11  0002f87b16116a7f      0             0        0       0       0   
13  0003e1cccfd5a40a      0             0        0       0       0   
14  00059ace3e3e9a53      0             0        0       0       0   

    identity_hate  
5               0  
7               0  
11              0  
13              0  
14              0  


#### Add placeholder label columns to test data

In [33]:
def add_labels_to_df(df, label_columns=None, fill_value=-1):
    """Add placeholder label columns to ``df`` in place.

    Args:
        df: DataFrame that receives the label columns; it is modified in place.
        label_columns: Names of the columns to add. Defaults to
            ``Config.label_columns``.
        fill_value: Value written to every row of each added column
            (``-1`` by default).

    Returns:
        None. The result is the mutation of ``df``.
    """
    if label_columns is None:
        label_columns = Config.label_columns
    for col in label_columns:
        # Write to the frame that was passed in rather than to the global test frame
        df[col] = fill_value

# Give the test comments a -1 placeholder in every label column, then preview the result
add_labels_to_df(test_1_data_df)
print(test_1_data_df.head())

                 id                                       comment_text  toxic  \
0  00001cee341fdb12  Yo bitch Ja Rule is more succesful then you'll...     -1   
1  0000247867823ef7  == From RfC == \n\n The title is fine as it is...     -1   
2  00013b17ad220c46  " \n\n == Sources == \n\n * Zawe Ashton on Lap...     -1   
3  00017563c3f7919a  :If you have a look back at the source, the in...     -1   
4  00017695ad8997eb          I don't anonymously edit articles at all.     -1   

   severe_toxic  obscene  threat  insult  identity_hate  
0            -1       -1      -1      -1             -1  
1            -1       -1      -1      -1             -1  
2            -1       -1      -1      -1             -1  
3            -1       -1      -1      -1             -1  
4            -1       -1      -1      -1             -1  


#### Fill in the released test labels

In [34]:
# Distinct ids of the test rows that still have labels after the -1 filter.
# Going through a set removes duplicates; the resulting list order is arbitrary.
test_labels_ids = list(set(test_1_label_df.id))
test_labels_ids[:5]

['228826e565a5a148',
 '1305d5734ae394a5',
 '9ead117632fbb72f',
 'e5deceb0375848ab',
 '8f30ec7c6cf3315b']

In [35]:
# Index the label frame by comment id once, so each row needs a single hashed
# lookup instead of scanning a list of ids and re-filtering the whole label
# frame for every label column. verify_integrity makes duplicate ids fail
# loudly here rather than silently producing ambiguous labels.
_labels_by_id = test_1_label_df.set_index('id', verify_integrity=True)[Config.label_columns]

def fill_labels(row):
    """Copy the known labels onto a test row when its id has labels.

    Args:
        row: One row of ``test_1_data_df`` (a Series holding ``id`` and the
            label columns named in ``Config.label_columns``).

    Returns:
        The same row. Its label columns are overwritten when ``row.id`` is
        present in the label frame; otherwise the row is returned unchanged.
    """
    if row.id in _labels_by_id.index:
        labels = _labels_by_id.loc[row.id]
        for col in Config.label_columns:
            row[col] = labels[col]
    return row

test_1_data_df = test_1_data_df.parallel_apply(fill_labels, axis=1)

In [36]:
# Sanity check: number of test rows whose `toxic` placeholder was replaced by a real label
print(f'Length of non "-1" labels = {len(test_1_data_df[test_1_data_df.toxic != -1])}')

Length of non "-1" labels = 63978


In [37]:
# Keep only the test comments where none of the six label columns is still -1
test_1_data_df = test_1_data_df[(test_1_data_df[Config.label_columns] != -1).all(axis=1)]

display(test_1_data_df)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
5,0001ea8717f6de06,Thank you for understanding. I think very high...,0,0,0,0,0,0
7,000247e83dcc1211,:Dear god this site is horrible.,0,0,0,0,0,0
11,0002f87b16116a7f,"""::: Somebody will invariably try to add Relig...",0,0,0,0,0,0
13,0003e1cccfd5a40a,""" \n\n It says it right there that it IS a typ...",0,0,0,0,0,0
14,00059ace3e3e9a53,""" \n\n == Before adding a new product to the l...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
153150,fff8f64043129fa2,":Jerome, I see you never got around to this…! ...",0,0,0,0,0,0
153151,fff9d70fe0722906,==Lucky bastard== \n http://wikimediafoundatio...,0,0,0,0,0,0
153154,fffa8a11c4378854,==shame on you all!!!== \n\n You want to speak...,0,0,0,0,0,0
153155,fffac2a094c8e0e2,MEL GIBSON IS A NAZI BITCH WHO MAKES SHITTY MO...,1,0,1,0,1,0


### Merge all Jigsaw 1 competition data

In [38]:
# Stack the Jigsaw 1 train set and the labelled test rows.
# Each frame keeps its own row index here (no ignore_index), so index labels can repeat.
train_1_final_df = pd.concat([train_1_df, test_1_data_df])
train_1_final_df

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
153150,fff8f64043129fa2,":Jerome, I see you never got around to this…! ...",0,0,0,0,0,0
153151,fff9d70fe0722906,==Lucky bastard== \n http://wikimediafoundatio...,0,0,0,0,0,0
153154,fffa8a11c4378854,==shame on you all!!!== \n\n You want to speak...,0,0,0,0,0,0
153155,fffac2a094c8e0e2,MEL GIBSON IS A NAZI BITCH WHO MAKES SHITTY MO...,1,0,1,0,1,0


**Save to disk**

In [42]:
# Persist the merged Jigsaw 1 data without the row index, as a zip-compressed CSV
train_1_final_df.to_csv(Config.intermediate_1, index=False, compression='zip')

**CleanUp**

In [44]:
# Drop the Jigsaw 1 working frames (the merged data is already saved to
# Config.intermediate_1) and ask the garbage collector to reclaim the memory
del train_1_final_df, test_1_data_df, test_labels_ids, train_1_df, test_1_label_df
gc.collect()

0

# 2nd Jigsaw - Extract all data

In [45]:
# Load the second Jigsaw dataset from the file configured as Config.all_data_2
train_2_df = pd.read_csv(Config.all_data_2)
train_2_df

Unnamed: 0,id,comment_text,split,created_date,publication_id,parent_id,article_id,rating,funny,wow,...,white,asian,latino,other_race_or_ethnicity,physical_disability,intellectual_or_learning_disability,psychiatric_or_mental_illness,other_disability,identity_annotator_count,toxicity_annotator_count
0,1083994,He got his money... now he lies in wait till a...,train,2017-03-06 15:21:53.675241+00,21,,317120,approved,0,0,...,,,,,,,,,0,67
1,650904,Mad dog will surely put the liberals in mental...,train,2016-12-02 16:44:21.329535+00,21,,154086,approved,0,0,...,,,,,,,,,0,76
2,5902188,And Trump continues his lifelong cowardice by ...,train,2017-09-05 19:05:32.341360+00,55,,374342,approved,1,0,...,,,,,,,,,0,63
3,7084460,"""while arresting a man for resisting arrest"".\...",test,2016-11-01 16:53:33.561631+00,13,,149218,approved,0,0,...,,,,,,,,,0,76
4,5410943,Tucker and Paul are both total bad ass mofo's.,train,2017-06-14 05:08:21.997315+00,21,,344096,approved,0,0,...,,,,,,,,,0,80
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1999511,1018736,Another man shamming article. If white men did...,train,2017-02-20 07:20:49.964620+00,54,,169202,approved,0,0,...,0.8,0.0,0.0,0.0,0.000000,0.0,0.0,0.00000,10,10
1999512,340016,"""no matter what is put in front of you regardi...",train,2016-06-06 06:43:04.780968+00,21,339965.0,137961,approved,0,0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.00000,10,10
1999513,919629,The Democrat party aided and abetted by it's M...,train,2017-01-30 02:44:29.168863+00,54,,164845,rejected,0,1,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.00000,11,10
1999514,5165492,I just don't find her a very good representati...,train,2017-04-22 18:42:02.442987+00,54,,328877,approved,1,0,...,0.0,0.0,0.0,0.0,0.003717,0.0,0.0,0.00000,269,10


In [67]:
# Rename three label columns so they match the names listed in Config.label_columns
train_2_df = train_2_df.rename(columns={"toxicity": "toxic", "severe_toxicity": "severe_toxic", "identity_attack": "identity_hate"})
train_2_df.columns

Index(['id', 'comment_text', 'split', 'created_date', 'publication_id',
       'parent_id', 'article_id', 'rating', 'funny', 'wow', 'sad', 'likes',
       'disagree', 'toxic', 'severe_toxic', 'obscene', 'sexual_explicit',
       'identity_hate', 'insult', 'threat', 'male', 'female', 'transgender',
       'other_gender', 'heterosexual', 'homosexual_gay_or_lesbian', 'bisexual',
       'other_sexual_orientation', 'christian', 'jewish', 'muslim', 'hindu',
       'buddhist', 'atheist', 'other_religion', 'black', 'white', 'asian',
       'latino', 'other_race_or_ethnicity', 'physical_disability',
       'intellectual_or_learning_disability', 'psychiatric_or_mental_illness',
       'other_disability', 'identity_annotator_count',
       'toxicity_annotator_count'],
      dtype='object')

In [69]:
# Keep only id, comment_text and the six label columns; every other column is discarded
train_2_df = train_2_df[Config.data_columns + Config.label_columns]
train_2_df

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,1083994,He got his money... now he lies in wait till a...,0.373134,0.044776,0.089552,0.014925,0.343284,0.000000
1,650904,Mad dog will surely put the liberals in mental...,0.605263,0.013158,0.065789,0.065789,0.565789,0.092105
2,5902188,And Trump continues his lifelong cowardice by ...,0.666667,0.015873,0.031746,0.000000,0.666667,0.047619
3,7084460,"""while arresting a man for resisting arrest"".\...",0.815789,0.065789,0.552632,0.105263,0.684211,0.000000
4,5410943,Tucker and Paul are both total bad ass mofo's.,0.550000,0.037500,0.337500,0.000000,0.487500,0.037500
...,...,...,...,...,...,...,...,...
1999511,1018736,Another man shamming article. If white men did...,0.400000,0.000000,0.000000,0.000000,0.300000,0.200000
1999512,340016,"""no matter what is put in front of you regardi...",0.400000,0.000000,0.000000,0.000000,0.000000,0.400000
1999513,919629,The Democrat party aided and abetted by it's M...,0.400000,0.000000,0.200000,0.000000,0.300000,0.300000
1999514,5165492,I just don't find her a very good representati...,0.400000,0.000000,0.000000,0.000000,0.100000,0.200000


# Merge All data

In [85]:
# Reload the merged Jigsaw 1 data saved earlier (the in-memory frames were deleted in the CleanUp cell)
train_1_df = pd.read_csv(Config.intermediate_1, compression='zip')
train_1_df

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
223544,fff8f64043129fa2,":Jerome, I see you never got around to this…! ...",0,0,0,0,0,0
223545,fff9d70fe0722906,==Lucky bastard== \n http://wikimediafoundatio...,0,0,0,0,0,0
223546,fffa8a11c4378854,==shame on you all!!!== \n\n You want to speak...,0,0,0,0,0,0
223547,fffac2a094c8e0e2,MEL GIBSON IS A NAZI BITCH WHO MAKES SHITTY MO...,1,0,1,0,1,0


In [94]:
# Stack both Jigsaw datasets. No ignore_index is passed, so each frame keeps its
# own row labels and the resulting index contains repeated values.
jigsaw_train_df = pd.concat([train_1_df, train_2_df])
jigsaw_train_df

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0.0,0.0,0.0,0.0,0.0,0.0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0.0,0.0,0.0,0.0,0.0,0.0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0.0,0.0,0.0,0.0,0.0,0.0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0.0,0.0,0.0,0.0,0.0,0.0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
1999511,1018736,Another man shamming article. If white men did...,0.4,0.0,0.0,0.0,0.3,0.2
1999512,340016,"""no matter what is put in front of you regardi...",0.4,0.0,0.0,0.0,0.0,0.4
1999513,919629,The Democrat party aided and abetted by it's M...,0.4,0.0,0.2,0.0,0.3,0.3
1999514,5165492,I just don't find her a very good representati...,0.4,0.0,0.0,0.0,0.1,0.2


# Scoring Scheme for Jigsaw data

In [91]:
# Relative weight of each label in the combined score.
# These raw values are rescaled in place by normalize_scoring_scheme below.
score_scheme = {
    "toxic": 2,
    "severe_toxic": 4,
    "obscene": 0.8,
    "identity_hate": 3,
    "insult": 1.5,
    "threat": 3,
}

In [92]:
def normalize_scoring_scheme(score_scheme):
    """Rescale the weights in ``score_scheme`` so that they sum to 1.

    The mapping is modified in place and the very same object is returned,
    so the caller's original dictionary also holds the normalised weights.

    Args:
        score_scheme: Mutable mapping of label name to numeric weight.

    Returns:
        ``score_scheme`` itself, with every weight divided by the original sum.
    """
    weight_sum = sum(score_scheme.values())
    score_scheme.update(
        {label: weight / weight_sum for label, weight in score_scheme.items()}
    )
    return score_scheme
    
# normalize_scoring_scheme returns its (mutated) argument, so norm_score_scheme
# and score_scheme refer to the same dictionary from here on
norm_score_scheme = normalize_scoring_scheme(score_scheme)
print(norm_score_scheme)

{'toxic': 0.13986013986013984, 'severe_toxic': 0.2797202797202797, 'obscene': 0.055944055944055944, 'identity_hate': 0.2097902097902098, 'insult': 0.1048951048951049, 'threat': 0.2097902097902098}


### Update score

In [93]:
def calc_score(row, weights=None):
    """Add the weighted sum of the label values to ``row['score']``.

    Args:
        row: Row (Series or mapping) holding one value per label named in
            ``weights`` plus an existing numeric ``score`` entry, which is
            used as the starting value of the sum.
        weights: Mapping of label column name to weight. Defaults to the
            notebook-level ``norm_score_scheme``.

    Returns:
        The same row, with ``score`` increased by the sum of
        ``row[label] * weight`` over all labels in ``weights``.
    """
    if weights is None:
        weights = norm_score_scheme
    for toxic_col_name, factor in weights.items():
        # Use the weight being iterated over instead of looking it up in
        # ``score_scheme``; the two only agreed because normalisation
        # happens to mutate ``score_scheme`` in place.
        row['score'] += row[toxic_col_name] * factor
    return row

In [95]:
# calc_score accumulates into `score`, so the column has to exist and start at 0
jigsaw_train_df['score'] = 0

# Compute the weighted score row by row, spread across the pandarallel workers
jigsaw_train_df = jigsaw_train_df.parallel_apply(calc_score, axis=1)
jigsaw_train_df

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,score
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0.0,0.0,0.0,0.0,0.0,0.0,0.000000
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0.0,0.0,0.0,0.0,0.0,0.0,0.000000
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0.0,0.0,0.0,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...
1999511,1018736,Another man shamming article. If white men did...,0.4,0.0,0.0,0.0,0.3,0.2,0.129371
1999512,340016,"""no matter what is put in front of you regardi...",0.4,0.0,0.0,0.0,0.0,0.4,0.139860
1999513,919629,The Democrat party aided and abetted by it's M...,0.4,0.0,0.2,0.0,0.3,0.3,0.161538
1999514,5165492,I just don't find her a very good representati...,0.4,0.0,0.0,0.0,0.1,0.2,0.108392


#### Keep just comment_text and score

In [105]:
# Discard the id and the individual label columns; only the text and its combined score are kept
jigsaw_train_df = jigsaw_train_df[['comment_text', 'score']]
jigsaw_train_df

Unnamed: 0,comment_text,score
0,Explanation\nWhy the edits made under my usern...,0.000000
1,D'aww! He matches this background colour I'm s...,0.000000
2,"Hey man, I'm really not trying to edit war. It...",0.000000
3,"""\nMore\nI can't make any real suggestions on ...",0.000000
4,"You, sir, are my hero. Any chance you remember...",0.000000
...,...,...
1999511,Another man shamming article. If white men did...,0.129371
1999512,"""no matter what is put in front of you regardi...",0.139860
1999513,The Democrat party aided and abetted by it's M...,0.161538
1999514,I just don't find her a very good representati...,0.108392


# Ruddit data

In [101]:
# Load the Ruddit data, rename its columns to the names used for the Jigsaw
# data, and keep only the text and score columns
ruddit_df = pd.read_csv(Config.ruddit)
ruddit_df = ruddit_df.rename(columns={'comment_id': 'id','txt': 'comment_text', 'offensiveness_score': 'score'})
ruddit_df = ruddit_df[['comment_text', 'score']]
ruddit_df

Unnamed: 0,comment_text,score
0,> The difference in average earnings between m...,-0.083
1,"The myth is that the ""gap"" is entirely based o...",-0.022
2,[deleted],0.167
3,The assertion is that women get paid less for ...,-0.146
4,You said in the OP that's not what they're mea...,-0.083
...,...,...
5833,They should only censor things that talk badly...,0.064
5834,> and one of them is a woman. \n\nOH SHIT we b...,0.458
5835,how is this flared as US politics,-0.292
5836,People in Hong Kong must decide if they are go...,0.333


### Score rebalance to [0,1]

In [102]:
# Linear rescale that maps the interval [-1, 1] onto [0, 1]
# (assumes the raw Ruddit scores lie in [-1, 1] — TODO confirm).
# Running this cell a second time rescales the already rescaled values again.
ruddit_df['score'] = (ruddit_df['score'] + 1) / 2
ruddit_df

Unnamed: 0,comment_text,score
0,> The difference in average earnings between m...,0.4585
1,"The myth is that the ""gap"" is entirely based o...",0.4890
2,[deleted],0.5835
3,The assertion is that women get paid less for ...,0.4270
4,You said in the OP that's not what they're mea...,0.4585
...,...,...
5833,They should only censor things that talk badly...,0.5320
5834,> and one of them is a woman. \n\nOH SHIT we b...,0.7290
5835,how is this flared as US politics,0.3540
5836,People in Hong Kong must decide if they are go...,0.6665


In [103]:
# Summary statistics of the rescaled Ruddit score
ruddit_df.describe()

Unnamed: 0,score
count,5838.0
mean,0.48694
std,0.166988
min,0.0555
25%,0.3725
50%,0.469
75%,0.5835
max,0.9895


# Merge Ruddit and Jigsaw data

In [138]:
# Stack the Jigsaw and Ruddit frames, then replace the repeated row labels with a fresh 0..n-1 index
train_df = pd.concat([jigsaw_train_df, ruddit_df])
train_df.reset_index(inplace=True, drop=True)
train_df

Unnamed: 0,comment_text,score
0,Explanation\nWhy the edits made under my usern...,0.0000
1,D'aww! He matches this background colour I'm s...,0.0000
2,"Hey man, I'm really not trying to edit war. It...",0.0000
3,"""\nMore\nI can't make any real suggestions on ...",0.0000
4,"You, sir, are my hero. Any chance you remember...",0.0000
...,...,...
2228898,They should only censor things that talk badly...,0.5320
2228899,> and one of them is a woman. \n\nOH SHIT we b...,0.7290
2228900,how is this flared as US politics,0.3540
2228901,People in Hong Kong must decide if they are go...,0.6665


# Rename `comment_text` to `text`

In [139]:
# Rename the text column from `comment_text` to `text`; the saved files use this name
train_df = train_df.rename(columns={'comment_text': 'text'})
train_df

Unnamed: 0,text,score
0,Explanation\nWhy the edits made under my usern...,0.0000
1,D'aww! He matches this background colour I'm s...,0.0000
2,"Hey man, I'm really not trying to edit war. It...",0.0000
3,"""\nMore\nI can't make any real suggestions on ...",0.0000
4,"You, sir, are my hero. Any chance you remember...",0.0000
...,...,...
2228898,They should only censor things that talk badly...,0.5320
2228899,> and one of them is a woman. \n\nOH SHIT we b...,0.7290
2228900,how is this flared as US politics,0.3540
2228901,People in Hong Kong must decide if they are go...,0.6665


# Create Train and Test set

In [140]:
def define_score_bucket(row):
    """Assign a coarse ``score_bucket`` (0-5) to a row based on its ``score``.

    Buckets:
        0 -> score == 0
        1 -> 0 < score <= 0.25
        2 -> 0.25 < score <= 0.5
        3 -> 0.5 < score <= 0.75
        4 -> 0.75 < score < 1
        5 -> everything else (the original comment names score == 1; any
             other value not matched above also ends up here)

    Args:
        row: Row (Series or mapping) with a numeric ``score`` entry.

    Returns:
        The same row with ``score_bucket`` set.
    """
    score = row['score']
    if score == 0:
        bucket = 0
    elif 0 < score < 1:
        # Quarter-wide bins with inclusive upper edges; anything above 0.75
        # (and below 1) falls through to the last open bin.
        bucket = 4
        for idx, upper_edge in enumerate((0.25, 0.5, 0.75), start=1):
            if score <= upper_edge:
                bucket = idx
                break
    else:
        bucket = 5
    row['score_bucket'] = bucket
    return row

In [141]:
# -1 is only a placeholder; define_score_bucket overwrites it for every row
train_df['score_bucket'] = -1
train_df = train_df.parallel_apply(define_score_bucket, axis=1)
train_df

Unnamed: 0,text,score,score_bucket
0,Explanation\nWhy the edits made under my usern...,0.0000,0
1,D'aww! He matches this background colour I'm s...,0.0000,0
2,"Hey man, I'm really not trying to edit war. It...",0.0000,0
3,"""\nMore\nI can't make any real suggestions on ...",0.0000,0
4,"You, sir, are my hero. Any chance you remember...",0.0000,0
...,...,...,...
2228898,They should only censor things that talk badly...,0.5320,3
2228899,> and one of them is a woman. \n\nOH SHIT we b...,0.7290,3
2228900,how is this flared as US politics,0.3540,2
2228901,People in Hong Kong must decide if they are go...,0.6665,3


In [142]:
# 75/25 split with a fixed seed, stratified on score_bucket so both parts get the same bucket proportions
train_comments, test_comments, train_score, test_score = train_test_split(train_df['text'], train_df['score'], test_size=0.25, random_state=2022, stratify=train_df['score_bucket'])

In [143]:
# Report the size of each split; text and score lengths must match within a split
print(f'train comments length: {len(train_comments)}')
print(f'test comments length: {len(test_comments)}')
print(f'train score length: {len(train_score)}')
print(f'test score length: {len(test_score)}')

train comments length: 1671677
test comments length: 557226
train score length: 1671677
test score length: 557226


# Clean everything

In [None]:
# Release the intermediate Jigsaw frames that are no longer needed.
# `train_df` is deliberately NOT deleted here: the "Save files" cells that
# follow still drop its `score_bucket` column and write it to disk, so
# deleting it would make a top-to-bottom run fail with a NameError.
del train_1_df, train_2_df
gc.collect()

# Save files

### All data

In [144]:
# score_bucket was only needed for stratification; remove it before saving.
# The drop is in place, so re-running this cell raises KeyError once the column is gone.
train_df.drop(columns=['score_bucket'], inplace=True)
train_df

Unnamed: 0,text,score
0,Explanation\nWhy the edits made under my usern...,0.0000
1,D'aww! He matches this background colour I'm s...,0.0000
2,"Hey man, I'm really not trying to edit war. It...",0.0000
3,"""\nMore\nI can't make any real suggestions on ...",0.0000
4,"You, sir, are my hero. Any chance you remember...",0.0000
...,...,...
2228898,They should only censor things that talk badly...,0.5320
2228899,> and one of them is a woman. \n\nOH SHIT we b...,0.7290
2228900,how is this flared as US politics,0.3540
2228901,People in Hong Kong must decide if they are go...,0.6665


In [154]:
# Write the full combined dataset as a zip-compressed CSV.
# NOTE(review): index=False is not passed here (unlike the intermediate_1 save),
# so the row index is written as an extra unnamed first column — confirm readers expect that.
train_df.to_csv(Config.final_all_data_csv, compression='zip')

### Train Data

In [148]:
# Rebuild the train split as a two-column frame; going through lists discards
# the shuffled original index and yields a fresh 0..n-1 index
split_train_df = pd.DataFrame(data={'text': list(train_comments), 'score': list(train_score)})
split_train_df

Unnamed: 0,text,score
0,We will spend ourselves rich and drink ourselv...,0.000000
1,"""Secret Service agents and Reno Police Officer...",0.000000
2,Slavery? ... Nobody forced Lance Wells to cont...,0.000000
3,Her point is the police shouldn't judge! The p...,0.000000
4,I did kinda like the thumbs down though and tr...,0.000000
...,...,...
1671672,I don't think that I would be willing to work ...,0.000000
1671673,"WRONG................AND, WRONG AGAIN.\nPlease...",0.027972
1671674,Right. I bet they are being kept well refriger...,0.000000
1671675,"Well, FPTP is not a failure. It also doesn't e...",0.000000


In [153]:
# Write the train split as a zip-compressed CSV (the row index is written as well, since index=False is not passed)
split_train_df.to_csv(Config.final_train_data_csv, compression='zip')

In [150]:
# Rebuild the test split as a two-column frame with a fresh 0..n-1 index
split_test_df = pd.DataFrame(data={'text': list(test_comments), 'score': list(test_score)})
split_test_df

Unnamed: 0,text,score
0,"Hi, EN: You raise some good points - let's di...",0.000000
1,"""\n\nThe About.com has been removed and cite t...",0.000000
2,Awe come on. It is fun watching the ecotards ...,0.321678
3,REDIRECT Talk:1812 Louisiana hurricane,0.000000
4,"""Trump’s policies may divert migrants to Latin...",0.000000
...,...,...
557221,53 democrats are sucking each other's dicks.,0.346171
557222,"And innocence. (Oops, should have put as resp...",0.000000
557223,What is it that they say? Follow the money. Th...,0.000000
557224,So since you have nothing you decide to use t...,0.093240


In [152]:
# Write the test split as a zip-compressed CSV (the row index is written as well, since index=False is not passed)
split_test_df.to_csv(Config.final_test_data_csv, compression='zip')

In [155]:
# Summary statistics of the score column in the test split
split_test_df.describe()

Unnamed: 0,score
count,557226.0
mean,0.032311
std,0.07023
min,0.0
25%,0.0
50%,0.0
75%,0.040793
max,1.0


### Jigsaw data

In [157]:
# Write the Jigsaw-only text/score data as a zip-compressed CSV.
# NOTE(review): jigsaw_train_df still carries the repeated row labels from the
# concat of the two Jigsaw frames, and that index is written to the file — confirm this is intended.
jigsaw_train_df.to_csv(Config.jigsaw_data_csv, compression='zip')

### Ruddit data

In [158]:
# Write the rescaled Ruddit text/score data as a zip-compressed CSV (row index included)
ruddit_df.to_csv(Config.ruddit_data_csv, compression='zip')

# Last words
The final processed data is available in the `data/processed` directory.