In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from process_text import clean_text, lemmatizing
from dotenv import load_dotenv
import os

# Load environment variables.
# load_dotenv comes from python-dotenv; presumably it populates os.environ from
# a local .env file -- confirm against the project's setup instructions.
load_dotenv()
train_data_pth = os.getenv("TRAIN_DATA_PATH")
test_data_pth = os.getenv("TEST_DATA_PATH")

# os.getenv returns None for unset variables. Fail fast here with a clear
# message instead of letting pd.read_csv(None) fail later in the notebook.
_missing_env_vars = [
    var_name
    for var_name, var_value in (
        ("TRAIN_DATA_PATH", train_data_pth),
        ("TEST_DATA_PATH", test_data_pth),
    )
    if not var_value
]
if _missing_env_vars:
    raise EnvironmentError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_env_vars)
        + ". Set them in the environment or in a .env file."
    )

# Define constants
DPI = 300  # passed as dpi= to every figure created in this notebook

In [1]:
# Create a Kaggle API client and authenticate it.
# NOTE(review): where authenticate() reads credentials from is not shown in
# this notebook -- confirm before running on a new machine.
# NOTE(review): `api` is not used by any other cell visible in this section;
# check whether this cell is still needed.
from kaggle.api.kaggle_api_extended import KaggleApi

api = KaggleApi()
api.authenticate()
# Only reached if authenticate() returned without raising.
print("Kaggle API authenticated successfully!")


Kaggle API authenticated successfully!


In [None]:
# Read the train and test CSVs from the paths set in the configuration cell
# (train first, then test).
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (train_data_pth, test_data_pth)
)

In [None]:
# Quick structural overview of the training data.
# A notebook cell only renders its LAST bare expression, so the intermediate
# summaries are printed explicitly; as bare expressions they would be computed
# and silently discarded.
print(train_df.head())
train_df.info()  # info() writes its report to stdout itself
print(train_df.describe(include='all'))
print(train_df.isnull().sum())

label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
# Column sums per label, largest first (rendered as the cell's output).
train_df[label_cols].sum().sort_values(ascending=False)


In [None]:
# Define label columns
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']


def _plot_label_counts(counts, title):
    """Draw a bar chart of per-label comment counts on its own figure.

    Args:
        counts: pandas Series indexed by label name holding comment counts.
        title: Title displayed above the chart.
    """
    fig, ax = plt.subplots(figsize=(10, 6), dpi=DPI)
    # rot=45 tilts the x tick labels so the label names stay readable.
    counts.plot(kind='bar', color='skyblue', rot=45, ax=ax)
    ax.set_title(title)
    ax.set_xlabel("Label")
    ax.set_ylabel("Number of Comments")
    ax.grid(True, linestyle='--', alpha=0.7)
    fig.tight_layout()
    plt.show()


# Column sums per label, largest first
label_counts = train_df[label_cols].sum().sort_values(ascending=False)

# Total comments
total_comments = len(train_df)

# Number of non-toxic and toxic comments.
# 'num_labels' is the row-wise sum over the label columns; a comment counts as
# non-toxic when that sum is 0.
train_df['num_labels'] = train_df[label_cols].sum(axis=1)
non_toxic_count = (train_df['num_labels'] == 0).sum()
toxic_count = total_comments - non_toxic_count

# Plot Toxic vs Non-Toxic
fig, ax = plt.subplots(figsize=(6, 4), dpi=DPI)
ax.bar(['Non-Toxic', 'Toxic'], [non_toxic_count, toxic_count], color=['lightgreen', 'salmon'])
ax.set_title("Toxic vs Non-Toxic Comments")
ax.set_ylabel("Number of Comments")
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

# Plot class distribution
_plot_label_counts(label_counts, "Label Distribution in Toxic Comment Dataset")

# Display class imbalance stats as DataFrame.
# Percentages are relative to all comments; rows can carry several labels, so
# the percentages are not expected to sum to 100.
label_stats = pd.DataFrame({
    "Label": label_counts.index,
    "Count": label_counts.values,
    "Percentage": (label_counts.values / total_comments * 100).round(2)
})

print(label_stats)

# Add 'non_toxic' to label counts (on a copy, so label_counts stays unchanged)
label_counts_with_clean = label_counts.copy()
label_counts_with_clean['non_toxic'] = non_toxic_count
label_counts_with_clean = label_counts_with_clean.sort_values(ascending=False)

# Plot updated class distribution
_plot_label_counts(label_counts_with_clean, "Label Distribution Including Non-Toxic Comments")

In [None]:
# Pairwise correlation between the label columns (pandas' default method),
# rendered as an annotated heatmap.
corr = train_df[label_cols].corr()

fig, ax = plt.subplots(figsize=(8, 6), dpi=DPI)
sns.heatmap(corr, annot=True, cmap='coolwarm', ax=ax)
ax.set_title("Label Correlation Matrix")
plt.show()

In [None]:
# Label-by-label matrix product of the label columns with themselves.
# Assuming the labels are 0/1 flags (TODO confirm), entry (i, j) is the number
# of comments carrying both label i and label j.
label_matrix = train_df[label_cols]
co_occurrence = label_matrix.T.dot(label_matrix)

fig, ax = plt.subplots(figsize=(8, 6), dpi=DPI)
# fmt="d" annotates each cell as an integer.
sns.heatmap(co_occurrence, annot=True, fmt="d", cmap="Blues", ax=ax)
ax.set_title("Label Co-occurrence Heatmap")
plt.show()


In [None]:
# Whitespace-separated token count of each raw comment. Values are passed
# through str() first, so non-string entries are counted rather than raising.
train_df['comment_length'] = train_df['comment_text'].map(
    lambda text: len(str(text).split())
)

fig, ax = plt.subplots(figsize=(10, 6), dpi=DPI)
train_df['comment_length'].hist(bins=50, ax=ax)
ax.set_title("Comment Length Distribution")
ax.set_xlabel("Words")
ax.set_ylabel("Frequency")
plt.show()

In [None]:
# Text preprocessing in two passes: clean_text on the raw comment, then
# lemmatizing on the cleaned result. Both helpers come from process_text;
# their exact output format is not visible in this notebook.
cleaned_series = train_df['comment_text'].apply(clean_text)
train_df['cleaned'] = cleaned_series
train_df['lemmatized'] = cleaned_series.apply(lemmatizing)

# Side-by-side preview of the first rows at each preprocessing stage.
preview_columns = ['comment_text', 'cleaned', 'lemmatized']
print(train_df[preview_columns].head())

In [None]:
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
# A comment is flagged toxic when at least one label column is non-zero.
train_df['is_toxic'] = train_df[label_cols].sum(axis=1) > 0

# No earlier visible cell creates 'word_count', so build it here when missing.
# NOTE(review): derived from the 'lemmatized' column because the x-axis label
# says "(Lemmatized)" -- confirm this is the intended source.
# Handles both a token list and a plain string, since the return type of
# lemmatizing() is not visible in this notebook.
if 'word_count' not in train_df.columns:
    train_df['word_count'] = train_df['lemmatized'].apply(
        lambda tokens: len(tokens) if isinstance(tokens, (list, tuple)) else len(str(tokens).split())
    )

# Plot from a small frame with a human-readable hue column. Overriding the
# legend afterwards with plt.legend(labels=[...]) rebuilds it from the raw bar
# artists, so entries can end up paired with the wrong colour; letting seaborn
# generate the legend from the hue values keeps labels and colours in sync.
toxicity_plot_df = pd.DataFrame({
    'word_count': train_df['word_count'],
    'Toxic': train_df['is_toxic'].map({False: 'Non-Toxic', True: 'Toxic'}),
})

# Set figure size
fig, ax = plt.subplots(figsize=(10, 6), dpi=DPI)

# Plot histograms (the hue column name 'Toxic' becomes the legend title)
sns.histplot(
    data=toxicity_plot_df,
    x='word_count',
    hue='Toxic',
    hue_order=['Non-Toxic', 'Toxic'],
    bins=50,
    palette={'Non-Toxic': 'green', 'Toxic': 'red'},
    ax=ax,
)

# Plot settings
ax.set_title('Word Count Distribution by Toxicity')
ax.set_xlabel('Number of Words (Lemmatized)')
ax.set_ylabel('Number of Comments')
ax.grid(True)
fig.tight_layout()
plt.show()


In [None]:
# Define toxicity labels
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# No earlier visible cell creates 'word_count', so build it here when missing.
# NOTE(review): derived from the 'lemmatized' column -- confirm this is the
# intended source. Handles both a token list and a plain string, since the
# return type of lemmatizing() is not visible in this notebook.
if 'word_count' not in train_df.columns:
    train_df['word_count'] = train_df['lemmatized'].apply(
        lambda tokens: len(tokens) if isinstance(tokens, (list, tuple)) else len(str(tokens).split())
    )

# 2 x 3 grid: one panel per label, in label_cols order (row by row)
fig, axes = plt.subplots(2, 3, figsize=(20, 10), dpi=DPI)

for ax, label in zip(axes.flat, label_cols):
    # Keep only the comments where this label is set to 1.
    filtered_df = train_df[train_df[label] == 1]
    sns.histplot(data=filtered_df, x='word_count', bins=50, color='red', alpha=0.7, ax=ax)
    ax.set_title(f'Word Count ({label}=1)')
    ax.set_xlabel('Number of Words')
    ax.set_ylabel('Toxic Comment Count')
    ax.grid(True)

fig.tight_layout()
plt.show()