<h1>Combining Multiple Datasets </h1>

1. Code-Mixed Hinglish Hate Speech Detection Dataset : https://www.kaggle.com/datasets/sharduldhekane/code-mixed-hinglish-hate-speech-detection-dataset
2. Hinglish Hate Speech with Sentiment and Emotion : https://www.kaggle.com/datasets/shreyat22/hinglish-hate-speech-with-sentiment-and-emotion
3. Davidson et al. (2017): https://www.kaggle.com/datasets/eldrich/hate-speech-offensive-tweets-by-davidson-et-al
4. Thar Dataset: https://www.kaggle.com/datasets/aakash941/thar-dataset

In [11]:
import pandas as pd

# ===== Step 1: Load your datasets =====
# Replace with your actual paths
file1 = './Core/combined_hate_speech_dataset.csv'
file2 = './Core/hate_speech_with_sentiment_emotion_new.csv'
file3 = './Core/offensive-tweets-by-davidson.csv'
file4 = './Core/THAR-Dataset.csv'

df1 = pd.read_csv(file1)
df2 = pd.read_csv(file2)
df3 = pd.read_csv(file3)
df4 = pd.read_csv(file4)

# ===== Step 2: Check columns =====
print("Dataset 1 columns:", df1.columns.tolist())
print("Dataset 2 columns:", df2.columns.tolist())
print("Dataset 3 columns:", df3.columns.tolist())
print("Dataset 4 columns:", df4.columns.tolist())

Dataset 1 columns: ['text', 'hate_label', 'source', 'profanity_score', 'language', 'dataset_version', 'combined_date', 'text_length', 'word_count']
Dataset 2 columns: ['text', 'hate_label', 'source', 'profanity_score', 'language', 'dataset_version', 'combined_date', 'text_length', 'word_count', 'sentiment', 'emotion']
Dataset 3 columns: ['Unnamed: 0', 'count', 'hate_speech', 'offensive_language', 'neither', 'class', 'tweet']
Dataset 4 columns: ['Identifier', 'Comment', 'SubTask1', 'SubTask2']


 1 = hate, 0 = non-hate

In [12]:
# Dataset 1
df1_clean = df1[['text', 'hate_label']].copy()
df1_clean.rename(columns={'hate_label': 'label'}, inplace=True)

# Dataset 2
df2_clean = df2[['text', 'hate_label']].copy()
df2_clean.rename(columns={'hate_label': 'label'}, inplace=True)

# Dataset 3
df3_clean = df3[['tweet', 'class']].copy()
df3_clean.rename(columns={'tweet': 'text', 'class': 'label'}, inplace=True)
df3_clean = df3_clean[df3_clean['label'].isin([0, 2])]
df3_clean['label'] = df3_clean['label'].map({0: 1, 2: 0})  # 1 = hate, 0 = non-hate

# Dataset 4
df4_clean = df4[['Comment', 'SubTask1']].copy()
df4_clean.rename(columns={'Comment': 'text', 'SubTask1': 'label'}, inplace=True)
df4_clean['label'] = df4_clean['label'].apply(lambda x: 1 if str(x).strip().lower() == 'antireligion' else 0)


In [13]:

print("Dataset 1 columns:", df1_clean.columns.tolist())
print("Dataset 2 columns:", df2_clean.columns.tolist())
print("Dataset 3 columns:", df3_clean.columns.tolist())
print("Dataset 4 columns:", df4_clean.columns.tolist())

Dataset 1 columns: ['text', 'label']
Dataset 2 columns: ['text', 'label']
Dataset 3 columns: ['text', 'label']
Dataset 4 columns: ['text', 'label']


In [14]:
print("Dataset 1 unique labels:", df1_clean['label'].unique())
print("Dataset 2 unique labels:", df2_clean['label'].unique())
print("Dataset 3 unique classes:", df3_clean['label'].unique())
print("Dataset 4 unique labels:", df4_clean['label'].unique())

Dataset 1 unique labels: [0 1]
Dataset 2 unique labels: [1 0]
Dataset 3 unique classes: [0 1]
Dataset 4 unique labels: [1 0]


In [15]:
print(df1_clean['label'].value_counts())
print(df2_clean['label'].value_counts())
print(df3_clean['label'].value_counts())
print(df4_clean['label'].value_counts())

label
0    15825
1    13725
Name: count, dtype: int64
label
0    15825
1    13725
Name: count, dtype: int64
label
0    4163
1    1430
Name: count, dtype: int64
label
0    6095
1    5454
Name: count, dtype: int64


In [16]:
# ==== Combine all datasets ====
combined_df = pd.concat([df1_clean, df2_clean, df3_clean, df4_clean], ignore_index=True)

# Clean text & remove duplicates
combined_df.dropna(subset=['text', 'label'], inplace=True)
combined_df.drop_duplicates(subset=['text'], inplace=True)

print("Combined dataset shape:", combined_df.shape)
print(combined_df['label'].value_counts())

Combined dataset shape: (46681, 2)
label
0    26076
1    20605
Name: count, dtype: int64


Oversampling

In [17]:
from sklearn.utils import resample

df_majority = combined_df[combined_df['label'] == 0]
df_minority = combined_df[combined_df['label'] == 1]

df_minority_upsampled = resample(df_minority,
                                 replace=True,
                                 n_samples=len(df_majority),
                                 random_state=42)

df_balanced = pd.concat([df_majority, df_minority_upsampled])
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

print("Balanced dataset shape:", df_balanced.shape)
print(df_balanced['label'].value_counts())


Balanced dataset shape: (52152, 2)
label
1    26076
0    26076
Name: count, dtype: int64


In [18]:
df_balanced.to_csv('./Processed/final_hate_speech_dataset.csv', index=False)


Test-Train Seperation

In [19]:
from sklearn.model_selection import train_test_split
# Split into train (80%) and test (20%)
train_df, test_df = train_test_split(
    df_balanced,
    test_size=0.2,
    stratify=df_balanced['label'],
    random_state=42
)

print("Training set shape:", train_df.shape)
print("Test set shape:", test_df.shape)
print("\nLabel distribution in training set:")
print(train_df['label'].value_counts(normalize=True))
print("\nLabel distribution in test set:")
print(test_df['label'].value_counts(normalize=True))


Training set shape: (41721, 2)
Test set shape: (10431, 2)

Label distribution in training set:
label
1    0.500012
0    0.499988
Name: proportion, dtype: float64

Label distribution in test set:
label
0    0.500048
1    0.499952
Name: proportion, dtype: float64


In [20]:
train_df.to_csv('./Processed/train.csv', index=False)
test_df.to_csv('./Processed/test.csv', index=False)