## <span style="color: yellow;"><b>Data Cleaning</b></span>


### Merge

In [None]:
import pandas as pd

In [None]:
# Load the two CSV inputs that are combined into a single frame below.
train = pd.read_csv(filepath_or_buffer='data/train.csv')
test_pvt = pd.read_csv(filepath_or_buffer='data/test_private_expanded.csv')

In [None]:
# Stack the rows of both frames. join='outer' (the default) keeps columns that
# exist in only one input, filling them with NaN for the other input's rows;
# ignore_index=True renumbers the result so row labels do not repeat.
merged = pd.concat(objs=(train, test_pvt), axis=0, join='outer', ignore_index=True)

In [None]:
# Report the size of the merged frame: column count first, then row count.
print(
    f'No of Columns : {len(merged.columns)}',
    f'No of Rows : {len(merged)}',
    sep='\n',
)

### Data Cleaning

In [None]:
# Step 1 - Fill NaNs in `toxicity` with the `target` value from the same row
#
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on merged['toxicity']. That form is chained
# assignment on a column selection: it emits a FutureWarning on pandas 2.x and,
# with Copy-on-Write (the default from pandas 3.0), leaves `merged` unchanged.
merged['toxicity'] = merged['toxicity'].fillna(merged['target'])

In [None]:
# Step 2 - Remove column Target
#
# Its values were already copied into `toxicity` in Step 1. `merged` is rebound
# to the returned frame rather than mutated with inplace=True, matching how the
# later drop steps (Step 4 and Step 5) are written.
merged = merged.drop(columns='target')

In [None]:
# Step 3 - Derive the binary column `toxic`: 1 where toxicity >= 0.5, else 0.
# A NaN toxicity compares as False and therefore maps to 0.
merged['toxic'] = merged['toxicity'].ge(0.5).astype(int)

In [None]:
# Sanity check: distinct values present in the derived `toxic` column.
pd.unique(merged['toxic'])

In [None]:
# Distinct values present in the `toxicity` column, in order of first appearance.
pd.unique(merged['toxicity'])

In [None]:
# Number of missing values in each column.
merged.isna().sum()

In [None]:
# Columns inspected for missing values when deciding which rows to drop.
columns_to_check = [
    'asian', 'atheist', 'bisexual', 'black', 'buddhist', 'christian',
    'female', 'heterosexual', 'hindu', 'homosexual_gay_or_lesbian',
    'intellectual_or_learning_disability', 'jewish', 'latino', 'male',
    'muslim', 'other_disability', 'other_gender', 'other_race_or_ethnicity',
    'other_religion', 'other_sexual_orientation', 'physical_disability',
    'psychiatric_or_mental_illness', 'transgender', 'white',
]

# how='all' drops a row only when EVERY column listed above is NaN; a row with
# at least one non-null value among them is kept. `merged` is rebound to the
# returned frame rather than mutated with inplace=True, matching the other
# drop steps in this notebook.
merged = merged.dropna(subset=columns_to_check, how='all')



In [None]:
# Display the column labels currently present in `merged`.
merged.columns

In [None]:
# Re-count missing values per column after dropping rows.
merged.isna().sum()

In [None]:
# Step 4 - Remove the columns parent_id, publication_id and article_id
columns_to_drop = ['parent_id', 'publication_id', 'article_id']
merged = merged.drop(labels=columns_to_drop, axis='columns')

In [None]:
# Missing values per column once the three columns above are gone.
merged.isna().sum()

In [None]:
# Step 5 - Remove the rows whose comment_text is NaN
#
# The null count recorded when this step was written showed 2 such rows:
# comment_text                                2
merged = merged.dropna(axis='index', how='any', subset=['comment_text'])


In [None]:
# Final per-column count of missing values before the frame is saved.
merged.isna().sum()

In [None]:
# Step 6 - Write the merged dataset to a .csv file, omitting the row index
merged.to_csv(path_or_buf='data/merged_data.csv', index=False)

In [None]:
# Display the column labels of the cleaned frame.
merged.columns

In [None]:
# Class balance of `toxic`: absolute row count per label, followed by each
# label's share of all rows as a percentage rounded to one decimal place.
toxic_counts = merged['toxic'].value_counts()
print(toxic_counts)

toxic_percentages = toxic_counts.div(len(merged)).mul(100).round(1)
print(toxic_percentages)

<span style="color: yellow;"><b>The data is imbalanced. TODO: address the class imbalance after EDA.</b></span>

In [None]:
# Report the size of the frame again now that cleaning is done.
print(
    f'No of Columns : {len(merged.columns)}',
    f'No of Rows : {len(merged)}',
    sep='\n',
)