# Imports

In [1]:
import numpy as np
import pandas as pd

from tqdm.auto import tqdm
# `tqdm._tqdm_notebook` is a deprecated private alias; `tqdm.notebook` is the
# public module that exposes the same `tqdm_notebook` class.
from tqdm.notebook import tqdm_notebook

# Enable tqdm's pandas integration (progress-bar variants of `apply` etc.).
tqdm_notebook.pandas()

Please use `tqdm.notebook.*` instead of `tqdm._tqdm_notebook.*`
  from tqdm._tqdm_notebook import tqdm_notebook


# Setup

In [2]:
# For parallel data processing. `initialize()` sets pandarallel up for the
# `parallel_apply` call used further down in this notebook.
from pandarallel import pandarallel
pandarallel.initialize()

INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


# Config

In [4]:
class Config:
    """Central place for the column names and raw-data paths used in this notebook."""

    # NOTE(review): the raw CSVs previewed in this notebook expose `comment_text`,
    # not `text` — presumably the column is renamed later; confirm.
    TEXT_COLUMN_NAME = "text"

    # Label columns shared by the train file and the test-label file.
    label_columns = [
        "toxic",
        "severe_toxic",
        "obscene",
        "threat",
        "insult",
        "identity_hate",
    ]

    # Relative paths to the zipped raw CSVs.
    train_1 = "../data/raw/train.csv.zip"
    test_1_data = "../data/raw/test.csv.zip"
    test_1_label = "../data/raw/test_labels.csv.zip"

# Read Data

In [5]:
# Read validation and ruddit df

### 1st Jigsaw - Extract all data

In [6]:
# Load the three raw files of the 1st Jigsaw data set — train, test comments,
# test labels — in that order.
train_1_df, test_1_data_df, test_1_label_df = (
    pd.read_csv(csv_path)
    for csv_path in (Config.train_1, Config.test_1_data, Config.test_1_label)
)

In [7]:
# Summarise each loaded frame: its record count first, then its leading rows.
print(f'train_1_df - {len(train_1_df)} records long', train_1_df.head(), sep='\n')
print(f'test_1_df - {len(test_1_data_df)} records long', test_1_data_df.head(), sep='\n')
print(f'test_labels_1_df - {len(test_1_label_df)} records long', test_1_label_df.head(), sep='\n')

train_1_df - 159571 records long


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


test_1_df - 153164 records long


Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


test_labels_1_df - 153164 records long


Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,-1,-1,-1,-1,-1,-1
1,0000247867823ef7,-1,-1,-1,-1,-1,-1
2,00013b17ad220c46,-1,-1,-1,-1,-1,-1
3,00017563c3f7919a,-1,-1,-1,-1,-1,-1
4,00017695ad8997eb,-1,-1,-1,-1,-1,-1


In [8]:
# Drop every row in which any label column equals -1 (apparently the
# placeholder for test rows without real labels). The column list comes from
# Config so it cannot drift from the one used elsewhere in the notebook.
test_1_label_df = test_1_label_df[
    (test_1_label_df[Config.label_columns] != -1).all(axis=1)
]

print(f'Updated test_labels_1_df - {len(test_1_label_df)} records long')
print(test_1_label_df.head())

Updated test_labels_1_df - 63978 records long


Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
5,0001ea8717f6de06,0,0,0,0,0,0
7,000247e83dcc1211,0,0,0,0,0,0
11,0002f87b16116a7f,0,0,0,0,0,0
13,0003e1cccfd5a40a,0,0,0,0,0,0
14,00059ace3e3e9a53,0,0,0,0,0,0


#### Add labels to test data

In [9]:
def add_labels_to_df(df, label_columns=None):
    """Add one empty-string column per label to ``df``, in place.

    Args:
        df: DataFrame to extend with the label columns.
        label_columns: Names of the columns to add. Defaults to
            ``Config.label_columns``.

    Returns:
        None. ``df`` itself is modified.
    """
    if label_columns is None:
        # Resolved at call time so the notebook-wide default is picked up.
        label_columns = Config.label_columns
    for col in label_columns:
        # Write to the frame that was passed in, not to a notebook global.
        df[col] = ''

# Add the (still empty) label columns to the test comments and preview the result.
add_labels_to_df(test_1_data_df)
print(test_1_data_df.head())

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...,,,,,,
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...,,,,,,
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap...",,,,,,
3,00017563c3f7919a,":If you have a look back at the source, the in...",,,,,,
4,00017695ad8997eb,I don't anonymously edit articles at all.,,,,,,


#### Fill in the label values for the labelled test rows

In [10]:
# Collect the distinct ids present in the filtered label frame, then peek at a few.
test_labels_ids = [*set(test_1_label_df['id'])]
test_labels_ids[:5]

['c9c6f1ee70bfdfa8',
 '9ec7dac9a40ac3fb',
 'f57224c61929c254',
 'b37568a105bd76b5',
 '3857295b17568133']

In [None]:
# Map each labelled comment id to its label values once, so every row lookup
# below is a constant-time dict access instead of a scan of the label frame.
# `to_dict('index')` raises ValueError if an id occurs more than once.
_labels_by_id = (
    test_1_label_df.set_index('id')[Config.label_columns].to_dict('index')
)


def fill_labels(row):
    """Copy the known label values for ``row.id`` into ``row``.

    Rows whose id has no entry in the label frame are returned unchanged.

    Args:
        row: One row of the test frame (a Series with an ``id`` entry and one
            entry per name in ``Config.label_columns``).

    Returns:
        The same ``row``; when labels exist for its id, each label entry holds
        the scalar value from the label frame.
    """
    labels = _labels_by_id.get(row.id)
    if labels is not None:
        for col in Config.label_columns:
            # Assign the scalar label, not a one-element Series.
            row[col] = labels[col]
    return row


test_1_data_df = test_1_data_df.parallel_apply(fill_labels, axis=1)

Traceback (most recent call last):
  File "/Users/monideepde/miniforge3/envs/pytorch_m1/lib/python3.9/runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/Users/monideepde/miniforge3/envs/pytorch_m1/lib/python3.9/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/Users/monideepde/miniforge3/envs/pytorch_m1/lib/python3.9/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/Users/monideepde/miniforge3/envs/pytorch_m1/lib/python3.9/site-packages/traitlets/config/application.py", line 846, in launch_instance
    app.start()
  File "/Users/monideepde/miniforge3/envs/pytorch_m1/lib/python3.9/site-packages/ipykernel/kernelapp.py", line 677, in start
    self.io_loop.start()
  File "/Users/monideepde/miniforge3/envs/pytorch_m1/lib/python3.9/site-packages/tornado/platform/asyncio.py", line 199, in start
    self.asyncio_loop.run_forever()
  File "/Users/monideepde/miniforge3/envs/pytor

KeyboardInterrupt: 

Error in atexit._run_exitfuncs:
Traceback (most recent call last):
  File "/Users/monideepde/miniforge3/envs/pytorch_m1/lib/python3.9/multiprocessing/util.py", line 357, in _exit_function
    p.join()
  File "/Users/monideepde/miniforge3/envs/pytorch_m1/lib/python3.9/multiprocessing/process.py", line 147, in join
    assert self._parent_pid == os.getpid(), 'can only join a child process'
AssertionError: can only join a child process
Exception in thread Exception in thread Exception in thread Exception in thread Exception in thread Exception in thread Exception in thread Exception in thread Exception in thread Exception in thread Exception in thread Exception in thread Exception in thread Exception in thread Exception in thread Exception in thread Exception in thread ents.py", line 596, in run_forever
    self._run_once()
  File "/Users/monideepde/miniforge3/envs/pytorch_m1/lib/python3.9/asyncio/base_events.py", line 1854, in _run_once
    event_list = self._selector.select(timeout)
  F

In [42]:
# Display the current state of the test frame, label columns included.
test_1_data_df

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...,,,,,,
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...,,,,,,
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap...",,,,,,
3,00017563c3f7919a,":If you have a look back at the source, the in...",,,,,,
4,00017695ad8997eb,I don't anonymously edit articles at all.,,,,,,
...,...,...,...,...,...,...,...,...
153159,fffcd0960ee309b5,". \n i totally agree, this stuff is nothing bu...",,,,,,
153160,fffd7a9a6eb32c16,== Throw from out field to home plate. == \n\n...,,,,,,
153161,fffda9e8d6fafa9e,""" \n\n == Okinotorishima categories == \n\n I ...",,,,,,
153162,fffe8f1340a79fc2,""" \n\n == """"One of the founding nations of the...",,,,,,
