# Balancing data

## Import

In [1]:
import pandas as pd
import numpy as np

# Under Sampling
from imblearn.under_sampling import RandomUnderSampler

## Load Data

In [2]:
data = pd.read_csv('data/train.csv') #nrows=1000)

# Drop rows with null values in comment_text
df_cleaned = data.dropna(subset=['comment_text'])
df_train = df_cleaned[['comment_text','target']]

# Add new column toxic, toxicity >= 0.5 then toxic = 1 otherwise toxic = 0
df_train = df_train.copy()
df_train['toxic'] = np.where(df_train['target'] >= 0.50, 1, 0)

# Just remains toxic and comment_text
df_train = df_train.drop(['target'], axis=1)

## Showing Unbalanced data

In [3]:
# Get counts and percentages
counts = df_train['toxic'].value_counts()
percentages = df_train['toxic'].value_counts(normalize=True) * 100
# Display counts and percentages
result_df = pd.DataFrame({'Count': counts, 'Percentage': percentages})
result_df['Percentage'] = result_df['Percentage'].map('{:.2f}%'.format)
print(result_df)

         Count Percentage
toxic                    
0      1660537     92.00%
1       144334      8.00%


##  Random Under Sampling

In [4]:
# Define X and y
X = df_train['comment_text']
y = df_train['toxic']

# Proportion for the minority class (toxic)
proportion = 0.4  #40% of the data will be toxic

# Instantiate RandomUnderSampler
under_sampler = RandomUnderSampler(sampling_strategy=proportion, random_state=42)

# Fit and transform the data
X_resampled, y_resampled = under_sampler.fit_resample(X.to_frame(), y)

# Create a new DataFrame with the resampled data
df_resampled = pd.concat([X_resampled, y_resampled], axis=1)

In [5]:
# Get counts and percentages
counts = df_resampled['toxic'].value_counts()
percentages = df_resampled['toxic'].value_counts(normalize=True) * 100
# Display counts and percentages
result_df = pd.DataFrame({'Count': counts, 'Percentage': percentages})
result_df['Percentage'] = result_df['Percentage'].map('{:.2f}%'.format)
print(result_df)

        Count Percentage
toxic                   
0      360835     71.43%
1      144334     28.57%


## Manual Under Sampling

In [6]:
# Define the desired proportions
proportion_toxic = 0.4  # 40% will be toxic
proportion_non_toxic = 0.6  # 60% will be non-toxic

# Separate the data into toxic and non-toxic
toxic_data = df_train[y == 1]
non_toxic_data = df_train[y == 0]

# Calculate the number of samples for each class
num_toxic_samples = int(len(df_train) * proportion_toxic)
num_non_toxic_samples = int(len(df_train) * proportion_non_toxic)

# Randomly sample the data with replacement
sampled_toxic_data = toxic_data.sample(n=num_toxic_samples, replace=True, random_state=42)
sampled_non_toxic_data = non_toxic_data.sample(n=num_non_toxic_samples, replace=True, random_state=42)

# Concatenate the sampled data
df_resampled_manual = pd.concat([sampled_toxic_data, sampled_non_toxic_data])

# Shuffle the rows of the resulting DataFrame
df_resampled_manual = df_resampled_manual.sample(frac=1, random_state=42).reset_index(drop=True)

In [7]:
# Get counts and percentages
counts = df_resampled_manual['toxic'].value_counts()
percentages = df_resampled_manual['toxic'].value_counts(normalize=True) * 100
# Display counts and percentages
result_df = pd.DataFrame({'Count': counts, 'Percentage': percentages})
result_df['Percentage'] = result_df['Percentage'].map('{:.2f}%'.format)
print(result_df)

         Count Percentage
toxic                    
0      1082922     60.00%
1       721948     40.00%


## Manual 2 Under Sampling

In [8]:
# Calculate the undersample size for class 0
undersample_size_0 = int(df_train['toxic'].value_counts()[1] / 0.3 * 0.7)

# Undersample class 0 and keep all samples from class 1
undersampled_data = pd.concat([
    df_train[df_train['toxic'] == 0].sample(n=undersample_size_0, random_state=42),
    df_train[df_train['toxic'] == 1]],
    ignore_index=True)

# Shuffle the undersampled data
undersampled_data = undersampled_data.sample(frac=1, random_state=42)


In [9]:
# Get counts and percentages
counts = undersampled_data['toxic'].value_counts()
percentages = undersampled_data['toxic'].value_counts(normalize=True) * 100
# Display counts and percentages
result_df = pd.DataFrame({'Count': counts, 'Percentage': percentages})
result_df['Percentage'] = result_df['Percentage'].map('{:.2f}%'.format)
print(result_df)

        Count Percentage
toxic                   
0      336779     70.00%
1      144334     30.00%
