# YouTube Comment Analysis
This notebook includes:
- Sentiment classification (TF-IDF + Logistic Regression)
- Transformer-based sentiment classifier (MiniLM embeddings + Logistic Regression)
- Topic modeling with LDA

In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the comment dataset from a CSV file (relative path, so it is resolved
# against the notebook's working directory). The code below reads the
# `comment` and `sentiment` columns of this frame.
file_path = 'data_with_short_comments_for_further_analysis.csv'
df = pd.read_csv(file_path)

# Clean comments
def clean_text(text):
    """Strip URLs and non-letter characters from a single comment.

    Only ASCII letters, characters in the Devanagari block (U+0900-U+097F)
    and whitespace are kept; digits, punctuation and emoji are removed.
    Runs of whitespace are collapsed to a single space and the result is
    trimmed.

    Args:
        text: Raw comment. Non-string values are converted with ``str``;
            ``None`` and float NaN are treated as an empty comment.

    Returns:
        The cleaned comment as a ``str`` (possibly empty).
    """
    # Missing values would otherwise be stringified to the literal words
    # 'None' / 'nan' and survive cleaning as fake comments.
    # `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    # Remove URLs first, while their punctuation is still present to delimit them.
    text = re.sub(r'http\S+', '', str(text))
    text = re.sub(r'[^A-Za-z\u0900-\u097F\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Normalise every comment; missing values become '' first so clean_text
# always receives a string.
df['comment'] = df['comment'].fillna('').apply(clean_text)
# Because of the fillna('') above, dropna can never catch a missing comment.
# Drop unlabeled rows with dropna and comment-less rows (missing originally or
# emptied by cleaning) with an explicit empty-string check instead.
df = df.dropna(subset=['sentiment'])
df = df[df['comment'].ne('')].copy()

## TF-IDF + Logistic Regression (Baseline)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Map each distinct sentiment value to an integer class id.
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['sentiment'])

X = df['comment']
y = df['label']

# Split the raw text BEFORE fitting TF-IDF so the vocabulary and IDF weights
# are learned from the training fold only (no leakage from the test fold).
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

vectorizer = TfidfVectorizer(stop_words='english', max_features=3000)
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)
# Full-corpus matrix kept under its original name in case other cells use it;
# it is produced with the train-fitted vectorizer.
X_vec = vectorizer.transform(X)

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Pass `labels` explicitly so the report still lines up with the class names
# when some class happens to be absent from the test fold.
print(classification_report(
    y_test,
    y_pred,
    labels=list(range(len(label_encoder.classes_))),
    target_names=[str(name) for name in label_encoder.classes_],
    zero_division=0,
))

## Transformer-Based Sentiment Classifier (MiniLM + Logistic Regression)

In [None]:
!pip install transformers sentence-transformers torch

from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.preprocessing import LabelEncoder

tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

def get_embeddings(texts, tokenizer, model, batch_size=32):
    """Encode texts into fixed-size vectors by masked mean pooling.

    Each batch is tokenized (padded, truncated to 128 tokens), run through
    ``model`` without gradient tracking, and the token vectors of
    ``last_hidden_state`` are averaged over the non-padding positions given
    by the attention mask. Mean pooling is used instead of the first-token
    vector because that is the pooling documented for the
    sentence-transformers MiniLM checkpoint.

    Args:
        texts: Iterable of strings to embed.
        tokenizer: Callable tokenizer returning a mapping that contains an
            ``attention_mask`` tensor (e.g. a Hugging Face tokenizer).
        model: Callable model whose output exposes ``last_hidden_state``.
        batch_size: Number of texts per forward pass; bounds peak memory.

    Returns:
        ``numpy.ndarray`` with one row per input text.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be at least 1')

    texts = list(texts)
    if not texts:
        # Nothing to encode: return an empty matrix instead of failing inside
        # the tokenizer. The width is taken from the model config if present.
        width = getattr(getattr(model, 'config', None), 'hidden_size', 0)
        return np.empty((0, width), dtype=np.float32)

    # Inputs must live on the same device as the model weights.
    device = getattr(model, 'device', None)

    pooled_batches = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors='pt', max_length=128)
        if device is not None:
            inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)

        token_vectors = outputs.last_hidden_state
        # Zero out padding positions so they do not dilute the average.
        mask = inputs['attention_mask'].unsqueeze(-1).to(token_vectors.dtype)
        summed = (token_vectors * mask).sum(dim=1)
        # Clamp guards against division by zero for an all-padding row.
        token_counts = mask.sum(dim=1).clamp(min=1e-9)
        pooled_batches.append((summed / token_counts).cpu().numpy())

    return np.concatenate(pooled_batches, axis=0)

# Embed at most 1000 comments; larger frames are down-sampled reproducibly.
df_sampled = df.sample(n=1000, random_state=42) if len(df) > 1000 else df.copy()
X_embed = get_embeddings(df_sampled['comment'].tolist(), tokenizer, model)

# Keep the fitted encoder so the report can show sentiment names instead of
# bare integer ids, matching the TF-IDF baseline report.
embed_label_encoder = LabelEncoder()
y = embed_label_encoder.fit_transform(df_sampled['sentiment'])

X_train, X_test, y_train, y_test = train_test_split(X_embed, y, test_size=0.2, random_state=42)
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Explicit `labels` keeps names aligned even if a class is missing from the
# test fold of this small sample.
print(classification_report(
    y_test,
    y_pred,
    labels=list(range(len(embed_label_encoder.classes_))),
    target_names=[str(name) for name in embed_label_encoder.classes_],
    zero_division=0,
))

## Topic Modeling with LDA

In [None]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

# LDA is a generative model over word counts, so feed it raw term counts
# rather than TF-IDF weights.
lda_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
X_lda = lda_vectorizer.fit_transform(df['comment'])

lda_model = LatentDirichletAllocation(n_components=5, random_state=42)
lda_model.fit(X_lda)

# Display topics
terms = lda_vectorizer.get_feature_names_out()
for idx, topic in enumerate(lda_model.components_):
    print(f"\nTopic {idx + 1}:")
    # argsort is ascending; reverse it so the strongest term is listed first.
    print([terms[i] for i in topic.argsort()[::-1][:10]])