# YouTube Comment Analysis
This notebook includes:
- Sentiment classification (TF-IDF + Logistic Regression)
- Transformer-based sentiment classifier
- Topic modeling with LDA

In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns

  from pandas.core import (


In [2]:
# Load data
df = pd.read_csv("data_with_short_comments_for_further_analysis.csv", engine='python', escapechar='\\', on_bad_lines='skip')

# Clean comments
def clean_text(text):
    text = re.sub(r'http\S+', '', str(text))
    text = re.sub(r'[^A-Za-z\u0900-\u097F\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df['comment'] = df['comment'].fillna('').apply(clean_text)
df = df.dropna(subset=['comment', 'sentiment'])

## Term Frequency - inverse documentry frequency (TF-IDF)  + Logistic Regression (Baseline)

In [6]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['sentiment'])

X = df['comment']
y = df['label']

vectorizer = TfidfVectorizer(stop_words='english', max_features=3000)
X_vec = vectorizer.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size=0.2, random_state=42)
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

              precision    recall  f1-score   support

      1 star       0.54      0.47      0.51      3865
     2 stars       0.39      0.10      0.16      1104
     3 stars       0.43      0.26      0.32      2860
     4 stars       0.58      0.40      0.47      1747
     5 stars       0.74      0.89      0.81     14452

    accuracy                           0.68     24028
   macro avg       0.53      0.43      0.45     24028
weighted avg       0.64      0.68      0.65     24028



## Transformer-Based Sentiment Classifier (MiniLM + Logistic Regression)

In [7]:
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.preprocessing import LabelEncoder

tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

def get_embeddings(texts, tokenizer, model):
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors='pt', max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :].cpu().numpy()

df_sampled = df.sample(n=1000, random_state=42) if len(df) > 1000 else df.copy()
X_embed = get_embeddings(df_sampled['comment'].tolist(), tokenizer, model)
y = LabelEncoder().fit_transform(df_sampled['sentiment'])
X_train, X_test, y_train, y_test = train_test_split(X_embed, y, test_size=0.2, random_state=42)
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

2025-05-06 20:06:59.862923: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


              precision    recall  f1-score   support

           0       0.47      0.21      0.29        43
           1       0.11      0.11      0.11         9
           2       0.06      0.05      0.06        19
           3       0.29      0.24      0.26        17
           4       0.65      0.82      0.72       112

    accuracy                           0.54       200
   macro avg       0.32      0.29      0.29       200
weighted avg       0.50      0.54      0.50       200



## Topic Modeling with LDA

In [7]:
from sklearn.decomposition import LatentDirichletAllocation

lda_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
X_lda = lda_vectorizer.fit_transform(df['comment'])

lda_model = LatentDirichletAllocation(n_components=5, random_state=42)
lda_model.fit(X_lda)

# Display topics
terms = lda_vectorizer.get_feature_names_out()
for idx, topic in enumerate(lda_model.components_):
    print(f"\nTopic {idx + 1}:")
    print([terms[i] for i in topic.argsort()[-10:]])


Topic 1:
['great', 'songs', 'video', 'really', 'love', 'touching', 'heart', 'wow', 'nice', 'song']

Topic 2:
['wow', 'team', 'prakash', 'geet', 'ramro', 'trending', 'wishes', 'waiting', 'luck', 'best']

Topic 3:
['उन', 'मन', 'हर', 'पर', 'पन', 'सप', 'सम', 'भन', 'रक', 'गर']

Topic 4:
['singer', 'parkash', 'love', 'hats', 'team', 'sir', 'respect', 'congratulations', 'saput', 'prakash']

Topic 5:
['मल', 'पर', 'कत', 'अत', 'बध', 'आह', 'सफलत', 'रक', 'भक', 'मन']


In [11]:
from datasets import Dataset

ModuleNotFoundError: No module named 'datasets'

In [10]:
# 2. Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
import torch
from sklearn.metrics import classification_report

# 3. Load your dataset
df = pd.read_csv("your_dataset.csv")  # Replace with your actual file
df = df[['text', 'label']]            # Keep only necessary columns

# 4. Train/test split
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)

# 5. Convert to Hugging Face Datasets
train_ds = Dataset.from_pandas(train_df)
test_ds = Dataset.from_pandas(test_df)

# 6. Load tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# 7. Tokenize text
def tokenize(batch):
    return tokenizer(batch['text'], padding='max_length', truncation=True, max_length=256)

train_ds = train_ds.map(tokenize, batched=True)
test_ds = test_ds.map(tokenize, batched=True)

# 8. Set format for PyTorch
train_ds.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
test_ds.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

# 9. Load pre-trained BERT with classification head
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5)

# 10. Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    evaluation_strategy="epoch",
    logging_dir='./logs',
    load_best_model_at_end=True,
    save_strategy="epoch",
)

# 11. Define metrics
def compute_metrics(p):
    preds = torch.argmax(torch.tensor(p.predictions), axis=1)
    return classification_report(p.label_ids, preds, output_dict=True)

# 12. Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    compute_metrics=compute_metrics,
)

# 13. Train the model
trainer.train()

# 14. Evaluate
trainer.evaluate()

ModuleNotFoundError: No module named 'datasets'

In [12]:
## 📁 Project Structure

