## <span style="color: yellow;"><b>Data Cleaning</b></span>


### Merge

In [1]:
import pandas as pd

In [2]:
# Read the two raw CSV splits from the local data/ directory.
train, test_pvt = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train.csv', 'data/test_private_expanded.csv')
)

In [3]:
# Stack both splits row-wise; ignore_index=True renumbers the rows 0..n-1.
merged = pd.concat(objs=(train, test_pvt), ignore_index=True)

In [4]:
# Report the size of the combined frame; DataFrame.shape is (rows, columns).
print(f'No of Columns : {merged.shape[1]}')
print(f'No of Rows : {merged.shape[0]}')

No of Columns : 46
No of Rows : 1902194


### Data Cleaning

In [5]:
# Step 1 - Fill NaNs in toxicity with the target values
#
# Rows whose `toxicity` is NaN take the index-aligned value from `target`.
# (Presumably one split stores the label as `target` and the other as
# `toxicity` - confirm against the CSVs.)
#
# The result is assigned back to the column on purpose: calling
# `.fillna(..., inplace=True)` on `merged['toxicity']` is chained assignment,
# which raises a FutureWarning on pandas 2.x and leaves `merged` unchanged
# once Copy-on-Write is active (the default from pandas 3.0).

merged['toxicity'] = merged['toxicity'].fillna(merged['target'])

In [6]:
# Step 2 - Remove the target column
# (its values were used in Step 1 to fill the gaps in toxicity)

merged = merged.drop(columns=['target'])

In [7]:
# Step 3 - Derive the binary label `toxic`:
#          1 when toxicity >= 0.5, otherwise 0

merged['toxic'] = merged['toxicity'].ge(0.5).astype(int)

In [8]:
# Sanity check: list the distinct values present in the new binary label.
pd.unique(merged['toxic'])

array([0, 1])

In [9]:
# Distinct raw toxicity scores, in order of first appearance.
pd.unique(merged['toxicity'])

array([0.        , 0.89361702, 0.66666667, ..., 0.00341006, 0.48780488,
       0.96731572])

In [10]:
# Number of missing values per column (isna is the canonical name of isnull).
merged.isna().sum()

id                                           0
comment_text                                 3
severe_toxicity                              0
obscene                                      0
identity_attack                              0
insult                                       0
threat                                       0
asian                                  1475487
atheist                                1475487
bisexual                               1475487
black                                  1475487
buddhist                               1475487
christian                              1475487
female                                 1475487
heterosexual                           1475487
hindu                                  1475487
homosexual_gay_or_lesbian              1475487
intellectual_or_learning_disability    1475487
jewish                                 1475487
latino                                 1475487
male                                   1475487
muslim       

In [11]:
# Discard the rows that have no value in any of the identity columns below.

columns_to_check = [
    'asian',
    'atheist',
    'bisexual',
    'black',
    'buddhist',
    'christian',
    'female',
    'heterosexual',
    'hindu',
    'homosexual_gay_or_lesbian',
    'intellectual_or_learning_disability',
    'jewish',
    'latino',
    'male',
    'muslim',
    'other_disability',
    'other_gender',
    'other_race_or_ethnicity',
    'other_religion',
    'other_sexual_orientation',
    'physical_disability',
    'psychiatric_or_mental_illness',
    'transgender',
    'white',
]

# how='all': a row is removed only when every listed column is NaN;
# rows with at least one non-NaN identity value are kept.
merged = merged.dropna(subset=columns_to_check, how='all')



In [12]:
# Display the column labels of the frame at this stage of the cleaning.
merged.columns

Index(['id', 'comment_text', 'severe_toxicity', 'obscene', 'identity_attack',
       'insult', 'threat', 'asian', 'atheist', 'bisexual', 'black', 'buddhist',
       'christian', 'female', 'heterosexual', 'hindu',
       'homosexual_gay_or_lesbian', 'intellectual_or_learning_disability',
       'jewish', 'latino', 'male', 'muslim', 'other_disability',
       'other_gender', 'other_race_or_ethnicity', 'other_religion',
       'other_sexual_orientation', 'physical_disability',
       'psychiatric_or_mental_illness', 'transgender', 'white', 'created_date',
       'publication_id', 'parent_id', 'article_id', 'rating', 'funny', 'wow',
       'sad', 'likes', 'disagree', 'sexual_explicit',
       'identity_annotator_count', 'toxicity_annotator_count', 'toxicity',
       'toxic'],
      dtype='object')

In [13]:
# Re-check the per-column missing-value counts after the row filter.
merged.isna().sum()

id                                          0
comment_text                                2
severe_toxicity                             0
obscene                                     0
identity_attack                             0
insult                                      0
threat                                      0
asian                                       0
atheist                                     0
bisexual                                    0
black                                       0
buddhist                                    0
christian                                   0
female                                      0
heterosexual                                0
hindu                                       0
homosexual_gay_or_lesbian                   0
intellectual_or_learning_disability         0
jewish                                      0
latino                                      0
male                                        0
muslim                            

In [14]:
# Step 4 - Remove the columns parent_id, publication_id and article_id

columns_to_drop = ['parent_id', 'publication_id', 'article_id']
merged = merged.drop(columns_to_drop, axis='columns')

In [15]:
# Per-column missing-value counts after removing the three id columns.
merged.isna().sum()

id                                     0
comment_text                           2
severe_toxicity                        0
obscene                                0
identity_attack                        0
insult                                 0
threat                                 0
asian                                  0
atheist                                0
bisexual                               0
black                                  0
buddhist                               0
christian                              0
female                                 0
heterosexual                           0
hindu                                  0
homosexual_gay_or_lesbian              0
intellectual_or_learning_disability    0
jewish                                 0
latino                                 0
male                                   0
muslim                                 0
other_disability                       0
other_gender                           0
other_race_or_et

In [16]:
# Step 5 - Remove the rows whose comment_text is NaN
# (the preceding null count reports 2 such rows)

merged = merged.dropna(axis='index', how='any', subset=['comment_text'])


In [17]:
# Confirm that comment_text no longer contains missing values.
merged.isna().sum()

id                                     0
comment_text                           0
severe_toxicity                        0
obscene                                0
identity_attack                        0
insult                                 0
threat                                 0
asian                                  0
atheist                                0
bisexual                               0
black                                  0
buddhist                               0
christian                              0
female                                 0
heterosexual                           0
hindu                                  0
homosexual_gay_or_lesbian              0
intellectual_or_learning_disability    0
jewish                                 0
latino                                 0
male                                   0
muslim                                 0
other_disability                       0
other_gender                           0
other_race_or_et

In [19]:
# Step 6 - Persist the cleaned frame as a CSV file, leaving out the row index

merged.to_csv(path_or_buf='data/merged_data.csv', index=False)

In [20]:
# Display the column labels that remain after the cleaning steps above.
merged.columns

Index(['id', 'comment_text', 'severe_toxicity', 'obscene', 'identity_attack',
       'insult', 'threat', 'asian', 'atheist', 'bisexual', 'black', 'buddhist',
       'christian', 'female', 'heterosexual', 'hindu',
       'homosexual_gay_or_lesbian', 'intellectual_or_learning_disability',
       'jewish', 'latino', 'male', 'muslim', 'other_disability',
       'other_gender', 'other_race_or_ethnicity', 'other_religion',
       'other_sexual_orientation', 'physical_disability',
       'psychiatric_or_mental_illness', 'transgender', 'white', 'created_date',
       'rating', 'funny', 'wow', 'sad', 'likes', 'disagree', 'sexual_explicit',
       'identity_annotator_count', 'toxicity_annotator_count', 'toxicity',
       'toxic'],
      dtype='object')

In [21]:
# Class balance of the binary label: absolute row counts per class first,
# then each class's share of all rows as a percentage (one decimal place).
toxic_counts = merged['toxic'].value_counts()
print(toxic_counts)

toxic_percentages = toxic_counts.div(len(merged)).mul(100).round(1)
print(toxic_percentages)

toxic
0    378271
1     48434
Name: count, dtype: int64
toxic
0    88.6
1    11.4
Name: count, dtype: float64


<span style="color: yellow;"><b>The data is imbalanced (88.6% non-toxic vs. 11.4% toxic). Address the class imbalance after EDA.</b></span>

In [22]:
# Final size of the cleaned frame; DataFrame.shape is (rows, columns).
print(f'No of Columns : {merged.shape[1]}')
print(f'No of Rows : {merged.shape[0]}')

No of Columns : 43
No of Rows : 426705
