##### Merge Processed Comment Datasets into Unified Multi-Task Corpus
##### This notebook loads your four processed CSVs, aligns their schemas,
##### applies rule-based label logic, and saves a single merged dataset.

In [3]:
import pandas as pd
import numpy as np

import sys
from pathlib import Path

# Add the project root directory to Python path
project_root = Path().resolve().parent
sys.path.append(str(project_root))

# Now you can import from src
from src.utils.utils import clean_text

### 1. Load Processed Data

In [7]:
jigsaw_df      = pd.read_csv('../datasets/processed/toxic_comments_relevant_data.csv')
hatexplain_df  = pd.read_csv('../datasets/processed/hatexplain_relevant_data.csv')
spam_df        = pd.read_csv('../datasets/processed/youtube_spam_relevant_data.csv')
sentiment_df   = pd.read_csv('../datasets/processed/twitter_relevant_data.csv')

In [8]:
# Preview shapes:
print("Jigsaw:", jigsaw_df.shape)
print("HateXplain:", hatexplain_df.shape)
print("Spam:", spam_df.shape)
print("Sentiment:", sentiment_df.shape)

Jigsaw: (159571, 8)
HateXplain: (15383, 2)
Spam: (1956, 2)
Sentiment: (69491, 2)


In [17]:
jigsaw_df.head(2)

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,none
0,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,1
1,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,1


In [18]:
hatexplain_df.head(2)

Unnamed: 0,majority_label,message
0,offensive,u really think i would not have been raped by ...
1,offensive,the uk has threatened to return radioactive wa...


In [19]:
spam_df.head(2)

Unnamed: 0,CONTENT,CLASS
0,"Huh, anyway check out this you[tube] channel: ...",1
1,Hey guys check out my new channel and our firs...,1


In [20]:
sentiment_df.head(2)

Unnamed: 0,text,sentiment
0,im getting on borderlands and i will murder yo...,Positive
1,I am coming to the borders and I will kill you...,Positive


### 2. Standardize Schemas
 We want each DF to have: `text`, six toxicity sub-labels,
 `hate_speech`, `spam`, `sentiment`.

##### >>>>>> rename text/message columns

In [21]:
jigsaw_df   = jigsaw_df.rename(columns={'comment_text':'text'}).drop(columns=['none'])
hatexplain_df = hatexplain_df.rename(columns={'message':'text'})
spam_df     = spam_df.rename(columns={'CONTENT':'text'})
# sentiment_df already has `text`

In [22]:
tox_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Jigsaw: already has tox_cols; add missing task columns
jigsaw_df['hate_speech'] = np.nan
jigsaw_df['spam']        = np.nan
jigsaw_df['sentiment']   = np.nan

In [29]:
# HateXplain: add tox_cols, spam, sentiment
hatexplain_df = hatexplain_df.rename(columns={'majority_label': 'hate_speech'})
for col in tox_cols:
    hatexplain_df[col] = np.nan
hatexplain_df['spam']      = np.nan
hatexplain_df['sentiment'] = np.nan

In [25]:
# Spam: rename column, add tox_cols, hate_speech, sentiment
spam_df = spam_df.rename(columns={'CLASS': 'spam'})
for col in tox_cols:
    spam_df[col] = np.nan
spam_df['hate_speech'] = np.nan
# Map spam labels to strings
spam_df['spam'] = spam_df['spam'].map({1: 'spam', 0: 'ham'})
spam_df['sentiment'] = np.nan

In [26]:
# Sentiment140: add tox_cols, hate_speech, spam
for col in tox_cols:
    sentiment_df[col] = np.nan
sentiment_df['hate_speech'] = np.nan
sentiment_df['spam']        = np.nan

# Map sentiment labels to strings
sentiment_df['sentiment'] = sentiment_df['sentiment'].map({0: 'negative', 2: 'neutral', 4: 'positive'})

#### 4. (Re-)Clean Text
Ensure consistent text preprocessing across all datasets.

In [27]:

# %%
for df in [jigsaw_df, hatexplain_df, spam_df, sentiment_df]:
    df['text'] = df['text'].astype(str).apply(clean_text)

##### 5. Combine All DataFrames

In [30]:
combined = pd.concat([
    jigsaw_df[['text'] + tox_cols + ['hate_speech', 'spam', 'sentiment']],
    hatexplain_df[['text'] + tox_cols + ['hate_speech', 'spam', 'sentiment']],
    spam_df[['text'] + tox_cols + ['hate_speech', 'spam', 'sentiment']],
    sentiment_df[['text'] + tox_cols + ['hate_speech', 'spam', 'sentiment']]
], ignore_index=True)
print("Combined shape:", combined.shape)

Combined shape: (246401, 10)


##### 6. Apply Rule-Based Logic for Toxicity for hate speech

In [38]:
combined['hate_speech'].unique()

array([nan, 'offensive', 'normal', 'hatespeech'], dtype=object)

In [51]:
import pickle
import sys
from pathlib import Path
import nltk
from nltk.tokenize import word_tokenize

# Download required NLTK data
nltk.download('punkt')
nltk.download('punkt_tab')

# Define the tokenize function
def tokenize(text):
    return word_tokenize(text)

# Add project root to path
project_root = Path().resolve().parent
sys.path.append(str(project_root))

# Load the models
models_path = project_root / 'models'

# Load TF-IDF vectorizer
with open(models_path / 'tfidf_vectorizer.pkl', 'rb') as f:
    tfidf_vectorizer = pickle.load(f)

# Load Logistic Regression model
with open(models_path / 'lr_models.pkl', 'rb') as f:
    lr_models = pickle.load(f)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Q\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Q\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


##### 6.1: Predict toxicity sub-labels for 'hate' or 'offensive' comments

In [48]:
# mask_offender = combined['hate_speech'].isin(['hate','offensive'])
# if mask_offender.any():
#     X_off = tfidf_vectorizer.transform(combined.loc[mask_offender,'text'])
    
#     # Get predictions for all toxicity types
#     predictions = lr_model.predict(X_off)
    
#     # Update all toxicity columns at once
#     combined.loc[mask_offender, tox_cols] = predictions
    
#     # If you want to keep only the highest confidence prediction
#     def top_label(vals):
#         arr = np.array(vals, dtype=float)
#         max_val = arr.max()
#         return [1 if a==max_val and max_val>0 else 0 for a in arr]
    
#     top_df = combined.loc[mask_offender, tox_cols].apply(top_label, axis=1, result_type='expand')
#     top_df.columns = tox_cols
#     combined.loc[mask_offender, tox_cols] = top_df.values

In [52]:
# Get comments flagged as hate speech or offensive
mask_offender = combined['hate_speech'].isin(['hate', 'offensive'])
print(f"Found {mask_offender.sum()} comments flagged as hate speech or offensive")

if mask_offender.any():
    # Transform the text data
    X_off = tfidf_vectorizer.transform(combined.loc[mask_offender, 'text'])
    
    # Predict each toxicity type
    for label in tox_cols:
        print(f"Predicting {label}...")
        predictions = lr_models[label].predict(X_off)
        combined.loc[mask_offender, label] = predictions
    
    # Optional: Keep only the highest confidence prediction for each comment
    print("\nRefining predictions to keep only the strongest toxicity type...")
    def top_label(vals):
        arr = np.array(vals, dtype=float)
        max_val = arr.max()
        return [1 if a == max_val and max_val > 0 else 0 for a in arr]
    
    top_df = combined.loc[mask_offender, tox_cols].apply(top_label, axis=1, result_type='expand')
    top_df.columns = tox_cols
    combined.loc[mask_offender, tox_cols] = top_df.values

Found 4384 comments flagged as hate speech or offensive
Predicting toxic...
Predicting severe_toxic...
Predicting obscene...
Predicting threat...
Predicting insult...
Predicting identity_hate...

Refining predictions to keep only the strongest toxicity type...


In [53]:
# Print summary of predictions
print("\nPrediction Summary:")
for col in tox_cols:
    count = combined[col].sum()
    percent = (count / len(combined)) * 100
    print(f"{col}: {count} comments ({percent:.2f}%)")


Prediction Summary:
toxic: 15358.0 comments (6.23%)
severe_toxic: 1659.0 comments (0.67%)
obscene: 8513.0 comments (3.45%)
threat: 542.0 comments (0.22%)
insult: 7941.0 comments (3.22%)
identity_hate: 1469.0 comments (0.60%)


In [54]:
# Display a sample of toxic comments
print("\nSample of toxic comments with their classifications:")
toxic_mask = combined[tox_cols].any(axis=1)
sample_size = min(5, toxic_mask.sum())
sample_df = combined[toxic_mask].sample(n=sample_size)
for _, row in sample_df.iterrows():
    print("\nText:", row['text'][:100], "...")  # Show first 100 chars
    print("Hate Speech Label:", row['hate_speech'])
    print("Toxicity Types:", [col for col in tox_cols if row[col] == 1])
    print("-" * 80)


Sample of toxic comments with their classifications:

Text: gibraltarian get lost. do not contact me again under any pretext. you behaviour is beneath contempt. ...
Hate Speech Label: nan
Toxicity Types: ['toxic']
--------------------------------------------------------------------------------

Text: hello dickbrain just another wikipedia wanker ...
Hate Speech Label: nan
Toxicity Types: ['toxic', 'obscene', 'insult']
--------------------------------------------------------------------------------

Text: because i did not do anything wrong in the first place..you accused me of doing something i did not  ...
Hate Speech Label: nan
Toxicity Types: ['toxic']
--------------------------------------------------------------------------------

Text: if you see this message wikipedia's talk system is terrible, feel free to direct me to the appropria ...
Hate Speech Label: nan
Toxicity Types: ['toxic', 'obscene', 'insult']
------------------------------------------------------------------------

In [55]:
# Create a mask for any type of toxic or offensive content
toxic_mask = (
    # Check for any toxicity label
    combined[tox_cols].any(axis=1) |
    # Check for hate speech or offensive content
    combined['hate_speech'].isin(['hate', 'offensive'])
)

# Update sentiment to negative where toxic_mask is True
print(f"Found {toxic_mask.sum()} comments with toxic/offensive content")
print("\nBefore update:")
print(combined['sentiment'].value_counts())

# Create backup of original sentiment if needed
combined['original_sentiment'] = combined['sentiment']

# Update sentiment to 'negative' where toxic_mask is True
combined.loc[toxic_mask, 'sentiment'] = 'negative'

print("\nAfter update:")
print(combined['sentiment'].value_counts())

# Show some examples of updated sentiments
print("\nSample of updated sentiments:")
changed_mask = (combined['sentiment'] != combined['original_sentiment'])
print(f"\nTotal sentiments changed: {changed_mask.sum()}")

sample_size = min(5, changed_mask.sum())
if sample_size > 0:
    sample_df = combined[changed_mask].sample(n=sample_size)
    for _, row in sample_df.iterrows():
        print("\nText:", row['text'][:100], "...")  # Show first 100 chars
        print("Original Sentiment:", row['original_sentiment'])
        print("New Sentiment:", row['sentiment'])
        print("Reason:", end=" ")
        if row[tox_cols].any():
            print("Toxic labels:", [col for col in tox_cols if row[col] == 1])
        if row['hate_speech'] in ['hate', 'offensive']:
            print("Hate speech label:", row['hate_speech'])
        print("-" * 80)

# Save the updated DataFrame
output_path = project_root / 'datasets' / 'processed' / 'final_comment_analysis_data.csv'
combined.to_csv(output_path, index=False)
print(f"\nSaved processed dataset to: {output_path}")

Found 20609 comments with toxic/offensive content

Before update:
Series([], Name: count, dtype: int64)

After update:
sentiment
negative    20609
Name: count, dtype: int64

Sample of updated sentiments:

Total sentiments changed: 246401

Text: why the hell...? why the hell do you erase data from nationwide anthropometric studies e.g. iran, cz ...
Original Sentiment: nan
New Sentiment: negative
Reason: Toxic labels: ['toxic']
--------------------------------------------------------------------------------

Text: yea home depot was not like this while i worked there ...
Original Sentiment: nan
New Sentiment: nan
Reason: --------------------------------------------------------------------------------

Text: you are so right. i am just going to go back to my regular of working, which is to almost never comm ...
Original Sentiment: nan
New Sentiment: nan
Reason: --------------------------------------------------------------------------------

Text: as in sony buying another console generat