# Exploratory Data Analysis - YouTube Comment Analysis Project

This notebook explores three different datasets:
1. YouTube Spam Comments
2. Twitter Data Analysis
3. Toxic Comment Analysis

We'll analyze these datasets to understand their structure and characteristics, and use what we learn to inform our YouTube comment analysis project.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Set style for visualizations.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'
# and newer releases no longer accept the bare 'seaborn' name, so pick
# whichever spelling this installation provides (plain default as last resort).
SEABORN_STYLE = next(
    (name for name in ('seaborn-v0_8', 'seaborn') if name in plt.style.available),
    'default',
)
plt.style.use(SEABORN_STYLE)
sns.set_palette('husl')

# Configure pandas display options: show every column, cap rows at 100
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

# Project root directory, resolved relative to the current working directory
# (the notebook is expected to be launched from a sub-folder of the project)
PROJECT_ROOT = Path('..').resolve()
DATASETS_DIR = PROJECT_ROOT / 'datasets' / 'raw'

## 1. YouTube Spam Comments Dataset Analysis

In [None]:
# Read and combine all YouTube spam datasets.
# The paths are sorted because glob yields them in filesystem-dependent order,
# which would otherwise make the row order of the combined frame vary between
# machines and runs.
youtube_files = sorted(DATASETS_DIR.glob('youtube_spam_data/*.csv'))
if not youtube_files:
    # Fail early with a readable message instead of pd.concat's
    # "No objects to concatenate" when the data folder is missing or empty.
    raise FileNotFoundError(
        f"No YouTube spam CSV files found in {DATASETS_DIR / 'youtube_spam_data'}"
    )

# One frame per file, each tagged with the name of the file it came from
youtube_dfs = [
    pd.read_csv(csv_path).assign(source=csv_path.stem)
    for csv_path in youtube_files
]

youtube_df = pd.concat(youtube_dfs, ignore_index=True)

# Display basic information
print("YouTube Spam Dataset Info:")
print("-" * 50)
print(f"Total number of comments: {len(youtube_df)}")
print("\nColumns:")
youtube_df.info()

In [None]:
# How many comments each source file (one per video) contributes
print("Distribution of comments across videos:")
source_counts = youtube_df['source'].value_counts()
print(source_counts)

# Bar chart of the CLASS label counts (spam vs non-spam)
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=youtube_df, x='CLASS', ax=ax)
ax.set_title('Distribution of Spam vs Non-Spam Comments')
plt.show()

## 2. Twitter Data Analysis

In [None]:
# Load the Twitter training and validation splits.
# NOTE(review): both files are read with pandas' default header inference,
# i.e. the first row is taken as column names — confirm the CSVs have a header row.
twitter_train, twitter_val = (
    pd.read_csv(DATASETS_DIR / relative_path)
    for relative_path in (
        'twitter_data/twitter_training.csv',
        'twitter_data/twitter_validation.csv',
    )
)

# Summary banner followed by the training frame's column overview
print(
    "Twitter Dataset Info:",
    "-" * 50,
    f"Training set size: {len(twitter_train)}",
    f"Validation set size: {len(twitter_val)}",
    "\nColumns:",
    sep="\n",
)
twitter_train.info()

## 3. Toxic Comment Analysis

In [None]:
# Load the toxic-comment train/test splits plus the separate test labels file
toxic_train, toxic_test, toxic_test_labels = (
    pd.read_csv(DATASETS_DIR / relative_path)
    for relative_path in (
        'toxic_comment_data/train.csv',
        'toxic_comment_data/test.csv',
        'toxic_comment_data/test_labels.csv',
    )
)

# Summary banner followed by the training frame's column overview
print(
    "Toxic Comment Dataset Info:",
    "-" * 50,
    f"Training set size: {len(toxic_train)}",
    f"Test set size: {len(toxic_test)}",
    "\nColumns:",
    sep="\n",
)
toxic_train.info()

## Text Length Analysis Across Datasets

In [None]:
def add_text_length(df, text_column):
    """Add a ``text_length`` column with the character count of each text.

    The frame is modified in place and also returned, so the call can be used
    either as a statement or in an assignment.

    Args:
        df: DataFrame to annotate.
        text_column: Name of the column in ``df`` that holds the text.

    Returns:
        The same ``df`` object, with ``text_length`` added (or overwritten).
        Rows whose text is missing get a missing length.

    Raises:
        KeyError: If ``text_column`` is not a column of ``df``. The message
            lists the columns that are available.
    """
    if text_column not in df.columns:
        # A bare KeyError('text') gives no hint about what to use instead;
        # list the real column names so the caller can fix the argument.
        raise KeyError(
            f"Column {text_column!r} not found; available columns: {list(df.columns)}"
        )
    df['text_length'] = df[text_column].str.len()
    return df

# Attach a text_length column to each dataset
youtube_df = add_text_length(youtube_df, 'CONTENT')
# NOTE(review): 'text' is an assumed name for the tweet column — adjust if the
# Twitter frame uses a different one.
twitter_train = add_text_length(twitter_train, 'text')
toxic_train = add_text_length(toxic_train, 'comment_text')

# Draw the three length histograms side by side on a single figure
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
length_panels = [
    (youtube_df, 'YouTube Comments Length'),
    (twitter_train, 'Twitter Text Length'),
    (toxic_train, 'Toxic Comments Length'),
]
for ax, (frame, panel_title) in zip(axes, length_panels):
    sns.histplot(data=frame, x='text_length', bins=50, ax=ax)
    ax.set_title(panel_title)
    ax.set_xlabel('Length')

fig.tight_layout()
plt.show()

## Common Words and Patterns

In [None]:
from collections import Counter
import re

def get_common_words(text_series, n=20):
    """Return the ``n`` most frequent words across a collection of texts.

    Words are runs of ``\\w`` characters, counted case-insensitively.

    Args:
        text_series: Series (or other one-dimensional collection) of texts.
            Non-string values are converted with ``str``; missing values are
            skipped.
        n: Number of top words to return.

    Returns:
        A list of ``(word, count)`` tuples, most frequent first.
    """
    # Drop missing entries first: converting them with astype(str) would turn
    # each one into the literal token 'nan' / 'none' and count it as a word.
    texts = pd.Series(text_series).dropna().astype(str)

    # Count row by row instead of joining the whole corpus into one huge string
    word_counts = Counter()
    for text in texts:
        word_counts.update(re.findall(r'\w+', text.lower()))

    return word_counts.most_common(n)

# Print the most frequent words for each dataset in turn.
# Columns are looked up inside the loop so each dataset is only touched when
# its own section is reached.
# NOTE(review): 'text' is an assumed name for the tweet column — adjust if the
# Twitter frame uses a different one.
word_report = [
    ("Most common words in YouTube comments:", youtube_df, 'CONTENT'),
    ("\nMost common words in Twitter data:", twitter_train, 'text'),
    ("\nMost common words in Toxic comments:", toxic_train, 'comment_text'),
]
for heading, frame, text_column in word_report:
    print(heading)
    print(get_common_words(frame[text_column]))