Toxic Comment Classification - Multi Label - NLP


Import Modules

In [None]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_auc_score, accuracy_score, roc_curve, classification_report

import warnings
warnings.filterwarnings('ignore')

Load the Dataset

In [None]:
import pandas as pd
from pathlib import Path

# Folder holding the raw CSV files, relative to the working directory
DATA_DIR = Path("data/raw")

# Load the train and test splits; each file name is joined onto DATA_DIR
train_df, test_df = (
    pd.read_csv(DATA_DIR / csv_name) for csv_name in ("train.csv", "test.csv")
)

print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)

In [None]:
# Summary statistics of train_df (pandas describes the numeric columns by default)
train_df.describe()

In [None]:
# Column names, non-null counts and dtypes of train_df
train_df.info()

In [None]:
# Number of missing values in each column
train_df.isna().sum()

In [None]:
# Number of rows (comments) in the training split
train_df.shape[0]

Exploratory Data Analysis

In [None]:
# Number of positive examples per label.
# The label columns are selected by name rather than by position: later cells
# append helper columns to train_df (is_clean, total_len, ...), and a positional
# slice would sum those too if this cell were re-run afterwards.
label_col = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
x = train_df[label_col].sum()
x

In [None]:
# Number of labels set on each comment (sum across the label columns of a row).
# Columns are selected by name so that helper columns appended to train_df by
# later cells are never counted if this cell is re-run.
label_col = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
rowsums = train_df[label_col].sum(axis=1)
rowsums

In [None]:
# A comment whose per-row label sum is zero carries no label at all
no_label_count = int((rowsums == 0).sum())

print('Total number of comments:', len(train_df))
print('Total number of comments without labels:', no_label_count)
print('Total labels:', x.sum())

In [None]:

# Bar chart: how many comments carry each label
plt.figure(figsize=(6, 4))
ax = sns.barplot(x=x.index, y=x.values, alpha=0.8, palette=['tab:blue', 'tab:orange', 'tab:green', 'tab:brown', 'tab:red', 'tab:grey'])
plt.title('Label Counts')
plt.ylabel('Count')
plt.xlabel('Label')

# Render explicitly, like the other plotting cells, so the cell shows the figure
# instead of echoing the Text object returned by plt.xlabel
plt.show()


In [None]:
# Distribution of the number of labels attached to a single comment
count_palette = ['tab:blue', 'tab:orange', 'tab:green', 'tab:brown', 'tab:red', 'tab:grey']

plt.figure(figsize=(6, 4))
ax = sns.countplot(x=rowsums.values, alpha=0.8, palette=count_palette)
ax.set_title('Labels per Comment')
ax.set_ylabel('# of Occurences')
ax.set_xlabel('# of Labels')

plt.show()

In [None]:
# Pairwise correlation between the label columns.
# Select the labels by name: on a top-to-bottom run no helper column has been
# appended to train_df yet, so the former positional slice iloc[:, 2:-1]
# silently dropped the last label (identity_hate).
label_col = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
temp_df = train_df[label_col]

corr = temp_df.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(corr,
            xticklabels=corr.columns.values,
            yticklabels=corr.columns.values, annot=True)
plt.show()

In [None]:
# Cross-tabulate `toxic` against every other column of temp_df and place the
# resulting tables side by side, keyed by the other column's name
main_col = "toxic"
other_cols = temp_df.columns[1:]
corr_mats = [pd.crosstab(temp_df[main_col], temp_df[col]) for col in other_cols]
out = pd.concat(corr_mats, axis=1, keys=other_cols)

out

In [None]:
from wordcloud import STOPWORDS

# Stop-word set handed to the WordCloud calls below.
# NOTE(review): this rebinds the name `stopwords`, which the import cell bound to
# nltk.corpus.stopwords; the preprocessing cell re-imports it from nltk before use.
stopwords = set(STOPWORDS)
print(len(stopwords))

In [None]:
# Same assignment as in the previous cell (relies on STOPWORDS imported there)
stopwords = set(STOPWORDS)

In [None]:
from wordcloud import WordCloud

# Word cloud built from every comment labelled `toxic`
subset_toxic = train_df[train_df.toxic == 1]
text = subset_toxic.comment_text.values
wc = WordCloud(background_color="black", max_words=2000, stopwords=stopwords)
wc.generate(" ".join(text))
plt.figure(figsize=(20, 10))
plt.axis("off")
plt.imshow(wc.recolor(colormap='viridis', random_state=17), alpha=0.98)

# savefig does not create missing folders, so make sure the target exists first
wc_path = Path('insights/wordclouds/toxic_comments_wc.png')
wc_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(wc_path)
plt.show()

In [None]:
# Word cloud built from every comment labelled `severe_toxic`
subset_severe_toxic = train_df[train_df.severe_toxic == 1]
text = subset_severe_toxic.comment_text.values
wc = WordCloud(background_color="black", max_words=2000, stopwords=stopwords)
wc.generate(" ".join(text))
plt.figure(figsize=(20, 10))
plt.axis("off")
plt.imshow(wc.recolor(colormap='viridis', random_state=17), alpha=0.98)

# savefig does not create missing folders, so make sure the target exists first
wc_path = Path('insights/wordclouds/severe_toxic_comments_wc.png')
wc_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(wc_path)
plt.show()

In [None]:
# Word cloud built from every comment labelled `threat`
subset_threat = train_df[train_df.threat == 1]
text = subset_threat.comment_text.values
wc = WordCloud(background_color="black", max_words=2000, stopwords=stopwords)
wc.generate(" ".join(text))
plt.figure(figsize=(20, 10))
plt.axis("off")
plt.imshow(wc.recolor(colormap='viridis', random_state=17), alpha=0.98)

# savefig does not create missing folders, so make sure the target exists first
wc_path = Path('insights/wordclouds/threat_comments_wc.png')
wc_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(wc_path)
plt.show()

In [None]:
# Word cloud built from every comment labelled `insult`
subset_insult = train_df[train_df.insult == 1]
text = subset_insult.comment_text.values
wc = WordCloud(background_color="black", max_words=2000, stopwords=stopwords)
wc.generate(" ".join(text))
plt.figure(figsize=(20, 10))
plt.axis("off")
plt.imshow(wc.recolor(colormap='viridis', random_state=17), alpha=0.98)

# savefig does not create missing folders, so make sure the target exists first
wc_path = Path('insights/wordclouds/insult_comments_wc.png')
wc_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(wc_path)
plt.show()

Are longer comments more toxic?

In [None]:
label_col = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
if 'is_clean' not in train_df.columns:
    # 1 = none of the six labels is set, 0 = at least one label is set
    train_df['is_clean'] = (train_df[label_col].sum(axis=1) == 0).astype(int)

# Length features, computed identically for both splits. The text is coerced to
# str once so all three features treat a non-string cell the same way (the
# character count previously called len() on the raw value).
for frame in (train_df, test_df):
    comments = frame['comment_text'].astype(str)
    frame['total_len'] = comments.str.len()                # total characters
    # "Sentences" are approximated by newline-separated lines
    frame['sent_count'] = comments.str.count("\n") + 1
    frame['word_count'] = comments.str.split().str.len()   # whitespace-separated tokens

# One KDE panel per feature: (column, x-axis label)
kde_panels = [
    ('total_len', '# of Chars'),
    ('word_count', '# of Words'),
    ('sent_count', '# of Sentences'),
]
unclean_rows = train_df[train_df.is_clean == 0]
clean_rows = train_df[train_df.is_clean == 1]

plt.figure(figsize=(18, 6))
plt.suptitle("Are longer comments more toxic?", fontsize=18)

for position, (column, axis_label) in enumerate(kde_panels, start=1):
    plt.subplot(1, 3, position)
    # fill= replaces the deprecated shade= keyword of sns.kdeplot
    sns.kdeplot(unclean_rows[column], label="UnClean", fill=True, color='r')
    sns.kdeplot(clean_rows[column], label="Clean")
    plt.legend()
    if position == 1:
        plt.ylabel('Density', fontsize=12)
    plt.xlabel(axis_label, fontsize=12)

plt.tight_layout()
plt.show()

Most comments have fewer than 25 sentences and fewer than 250 words.

Unclean comments tend to have more words spread over fewer sentences.

In all three plots the distributions for clean and unclean comments overlap heavily, which indicates that these features will be of limited use in telling the two apart.

In [None]:
import string

# Ensure is_clean exists (0 = at least one label set, 1 = clean)
label_col = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
if 'is_clean' not in train_df.columns:
    train_df['is_clean'] = (train_df[label_col].sum(axis=1) == 0).astype(int)

smilies = (':-)', ':)', ';-)', ';)')
punct_chars = set(string.punctuation)  # set membership instead of scanning the string

# Character-level features, computed identically for both splits
for frame in (train_df, test_df):
    comments = frame['comment_text'].astype(str)
    # Upper-case characters
    frame['capitals'] = comments.apply(lambda s: sum(1 for c in s if c.isupper()))
    # ASCII punctuation characters
    frame['punct_count'] = comments.apply(lambda s: sum(1 for c in s if c in punct_chars))
    # Occurrences of any of the smiley strings
    frame['smilies_count'] = comments.apply(lambda s: sum(s.count(sm) for sm in smilies))

# One KDE panel per feature: (column, x-axis label)
kde_panels = [
    ('capitals', '# of Capital Letters'),
    ('punct_count', '# of Punctuations'),
    ('smilies_count', '# of Smilies'),
]
toxic_rows = train_df[train_df.is_clean == 0]
clean_rows = train_df[train_df.is_clean == 1]

plt.figure(figsize=(18, 6))
plt.suptitle("Does the Presence of Special Characters Vary with Toxicity?", fontsize=18)

for position, (column, axis_label) in enumerate(kde_panels, start=1):
    plt.subplot(1, 3, position)
    # fill= replaces the deprecated shade= keyword of sns.kdeplot
    sns.kdeplot(toxic_rows[column], label="Toxic", fill=True, color='r')
    sns.kdeplot(clean_rows[column], label="Clean")
    plt.legend()
    if position == 1:
        plt.ylabel('Density', fontsize=12)
    plt.xlabel(axis_label, fontsize=12)

plt.tight_layout()
plt.show()

Capital letters are more frequent in unclean comments, but the distributions overlap, which makes it hard for a model to extract information from this feature.

Most clean comments contain fewer than 100 punctuation characters, whereas the count for unclean comments spreads up to a maximum of about 5,000.

The number of smilies is very similar in unclean and clean comments, although unclean comments more often contain exactly one smiley.

In [None]:
# Unique-word features, computed identically for both splits.
# Relies on the word_count and is_clean columns created by the earlier feature cells.
for frame in (train_df, test_df):
    frame['unique_word_count'] = frame["comment_text"].apply(lambda x: len(set(str(x).split())))
    # Share of distinct words; a comment with zero words gives 0/0 -> NaN
    frame['unique_word_percent'] = (frame['unique_word_count'] / frame['word_count']) * 100

# ---------- Plotting ------------
plt.figure(figsize=(15, 5))
plt.suptitle("Comments with Less-Unique-Words (Spam) vs Toxicity?", fontsize=18)

# KDE plot for unique word percentage
plt.subplot(121)
plt.title("% of Unique Words in Comments")
# fill= replaces the deprecated shade= keyword of sns.kdeplot
sns.kdeplot(train_df[train_df.is_clean == 0]['unique_word_percent'], label="Toxic", fill=True, color='r')
sns.kdeplot(train_df[train_df.is_clean == 1]['unique_word_percent'], label="Clean")
plt.legend()
plt.ylabel('Density', fontsize=12)
plt.xlabel('Percent Unique Words', fontsize=12)

# Violin plot for comments with <25% unique words
plt.subplot(122)
sns.violinplot(
    y='unique_word_count', x='is_clean',
    data=train_df[train_df['unique_word_percent'] < 25],
    split=True, inner="quart"
)
plt.xlabel('is_Clean', fontsize=12)
plt.ylabel('# of Unique Words', fontsize=12)
plt.title("# Unique Words vs Toxicity")

plt.tight_layout()
plt.show()


Unclean comments are spread widely across the 1-10% range of unique-word percentage. Interestingly, there are clean comments with a low number of unique words as well.

This feature seems to carry some significance, especially in the case of comments with few unique words.

Let's take a look at what the text of clean-spam and unclean-spam comments looks like.

Data Preprocessing

In [None]:
import re
import string
import nltk
nltk.download('stopwords')
# Imported here on purpose: the word-cloud section rebinds `stopwords` to a plain set
from nltk.corpus import stopwords

# Read the English stop-word list once and build both sets from it
_english_words = stopwords.words('english')

stop_words = set(_english_words)
print(f"Number of stopwords: {len(stop_words)}")

eng_stopwords = set(_english_words)

# Lower-case contraction -> expanded form
CONTRACTIONS = {
    "can't": "cannot", "won't": "will not", "i'm": "i am", "it's": "it is",
    "don't": "do not", "didn't": "did not", "you're": "you are",
    "they're": "they are", "isn't": "is not", "aren't": "are not",
    # add more as needed
}

# Built once when the cell runs instead of on every call; this function is applied
# to every comment. Re-run this cell after editing CONTRACTIONS so the pattern
# picks up the new keys.
_CONTRACTIONS_RE = re.compile(
    r'\b(' + '|'.join(map(re.escape, CONTRACTIONS.keys())) + r')\b',
    flags=re.IGNORECASE,
)


def expand_contractions(text):
    """Replace every contraction listed in CONTRACTIONS with its expanded form.

    Matching is case-insensitive and anchored on word boundaries. The
    replacement is always the lower-case expansion stored in CONTRACTIONS.

    Parameters
    ----------
    text : str
        Input text.

    Returns
    -------
    str
        ``text`` with the known contractions expanded; everything else is
        left untouched.
    """
    def replace(match):
        # Look up by the lower-cased match; fall back to the matched text itself
        return CONTRACTIONS.get(match.group(0).lower(), match.group(0))
    return _CONTRACTIONS_RE.sub(replace, text)

# Maps every ASCII punctuation character to a space. Deleting punctuation
# outright would fuse words that are separated only by it ("end.start" ->
# "endstart"); the resulting extra spaces are collapsed further down.
_PUNCT_TO_SPACE = str.maketrans(string.punctuation, ' ' * len(string.punctuation))


def clean_text(text):
    """Normalise one comment for vectorisation.

    Steps, in order: lower-case, expand known contractions, strip URLs,
    strip digit runs, replace punctuation with spaces, collapse whitespace,
    and drop tokens found in ``eng_stopwords``.

    Parameters
    ----------
    text : object
        The comment; non-string values are converted with ``str()``.

    Returns
    -------
    str
        The remaining tokens joined by single spaces (may be empty).
    """
    if not isinstance(text, str):
        text = str(text)
    text = text.lower()
    text = expand_contractions(text)
    text = re.sub(r'http\S+|www\.\S+', ' ', text)              # remove urls
    text = re.sub(r'\d+', ' ', text)                           # remove numbers (optional)
    text = text.translate(_PUNCT_TO_SPACE)                     # punctuation -> space
    text = re.sub(r'\s+', ' ', text).strip()                   # normalize spaces
    # remove stopwords
    tokens = [t for t in text.split() if t not in eng_stopwords]
    return ' '.join(tokens)

In [None]:
from tqdm import tqdm
tqdm.pandas()  # registers Series.progress_apply

# Store the cleaned text next to the raw comment in both splits (train first)
for frame in (train_df, test_df):
    frame['comment_text_clean'] = frame['comment_text'].progress_apply(clean_text)
