# **Exploratory Data Analysis for IMDB Movie Reviews**

In [None]:
# Exploratory Data Analysis for IMDB Movie Reviews
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import re
import nltk
from nltk.corpus import stopwords

In [None]:
# 1. Load dataset
# Read the raw reviews CSV (path relative to the working directory) into `df`.
csv_path = "IMDB Dataset.csv"
df = pd.read_csv(csv_path)

In [None]:
# 2. Overview
# Gather the data-quality counts first, then print shape, a row sample,
# per-column missing values and the number of fully duplicated rows.
missing_per_column = df.isna().sum()
duplicate_row_count = df.duplicated().sum()

print("Dataset shape:", df.shape)
print(df.head())
print("\nMissing values per column:\n", missing_per_column)
print("Duplicate rows:", duplicate_row_count)

In [None]:
# 3. Simple cleaning for text analysis
def simple_clean(text):
    """Normalize a raw review into lowercase, space-separated letter tokens.

    The input is coerced with ``str()`` and lowercased; HTML tags, URLs and
    every character that is not ``a-z`` or whitespace are replaced by a space,
    then whitespace runs are collapsed and the ends are trimmed.
    """
    cleaned = str(text).lower()
    # Order matters: tags and URLs must go before punctuation is stripped,
    # otherwise their delimiters would already be gone.
    patterns = (
        r'<.*?>',            # HTML tags
        r'http\S+|www\S+',   # URLs
        r'[^a-z\s]',         # anything but letters / whitespace
        r'\s+',              # whitespace runs
    )
    for pattern in patterns:
        cleaned = re.sub(pattern, ' ', cleaned)
    return cleaned.strip()
# Store the cleaned text of every review in a new `clean_review` column.
df['clean_review'] = df['review'].map(simple_clean)

In [None]:
# 4. Stopwords and tokenization
# Download the NLTK stopword corpus and keep the English list as a set.
nltk.download('stopwords')
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)

# Word count per review = number of whitespace-separated tokens.
df['word_count'] = [len(review.split()) for review in df['clean_review']]
word_count_summary = df['word_count'].describe()
print("\nWord count stats:\n", word_count_summary)

In [None]:
# 5. Class distribution
# Number of reviews per sentiment label.
sentiment_counts = df['sentiment'].value_counts()
print("\nSentiment distribution:\n", sentiment_counts)

In [None]:
# 6. Top tokens overall
# Count the most frequent tokens across all cleaned reviews, skipping
# stopwords and single-character tokens.
TOP_N = 10
all_tokens = " ".join(df['clean_review']).split()
filtered = [t for t in all_tokens if t not in stop_words and len(t) > 1]
top10 = Counter(filtered).most_common(TOP_N)
# Legacy alias: the result used to be stored under the misleading name
# `top20` even though only 10 entries are requested.
top20 = top10
print(f"\nTop {TOP_N} most frequent tokens (excluding stopwords):\n", top10)

In [None]:
# 7. Top tokens by sentiment
def _most_common_tokens(reviews, limit=10):
    """Return the `limit` most frequent tokens in `reviews`.

    Tokens are whitespace-separated; stopwords and single-character tokens
    are excluded before counting.
    """
    tokens = " ".join(reviews).split()
    kept = [tok for tok in tokens if len(tok) > 1 and tok not in stop_words]
    return Counter(kept).most_common(limit)


positive_reviews = df[df['sentiment'] == 'positive']['clean_review']
negative_reviews = df[df['sentiment'] == 'negative']['clean_review']
top_pos = _most_common_tokens(positive_reviews)
top_neg = _most_common_tokens(negative_reviews)
print("\nTop positive tokens:\n", top_pos)
print("\nTop negative tokens:\n", top_neg)

In [None]:
# 8. Visualization
# Bar chart: number of reviews per sentiment label.
counts = df['sentiment'].value_counts()
fig_cls, ax_cls = plt.subplots(figsize=(6, 4))
ax_cls.bar(counts.index, counts.values, color=['skyblue', 'salmon'])
ax_cls.set_title("Class Distribution (Positive vs Negative)")
ax_cls.set_xlabel("Sentiment")
ax_cls.set_ylabel("Number of Reviews")
plt.show()

# Histogram: review length in words; the x-axis is clipped to 0-1000.
fig_len, ax_len = plt.subplots(figsize=(6, 4))
ax_len.hist(df['word_count'], bins=50, color='purple', alpha=0.7)
ax_len.set_title("Distribution of Review Lengths (Word Count)")
ax_len.set_xlabel("Word Count")
ax_len.set_ylabel("Frequency")
ax_len.set_xlim(0, 1000)
plt.show()

In [None]:
from wordcloud import WordCloud

def _show_wordcloud(cloud, title):
    """Render a generated word cloud without axes under the given title."""
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")
    plt.title(title)
    plt.show()


# Positive WordCloud: built from all cleaned positive reviews joined together.
is_positive = df['sentiment'] == 'positive'
pos_text = " ".join(df.loc[is_positive, 'clean_review'])
wc_pos = WordCloud(width=800, height=400, background_color='white').generate(pos_text)
_show_wordcloud(wc_pos, "Positive Reviews WordCloud")

# Negative WordCloud: same procedure for the negative reviews.
is_negative = df['sentiment'] == 'negative'
neg_text = " ".join(df.loc[is_negative, 'clean_review'])
wc_neg = WordCloud(width=800, height=400, background_color='white').generate(neg_text)
_show_wordcloud(wc_neg, "Negative Reviews WordCloud")

In [None]:
# 9. Save preview of cleaned dataset
# Write the first 200 rows (cleaned text, label, word count) to a CSV file
# without the index column.
preview_path = "IMDB_preview_clean.csv"
preview_columns = ['clean_review', 'sentiment', 'word_count']
df[preview_columns].head(200).to_csv(preview_path, index=False)
# The check mark was previously printed as the mojibake "âœ…"
# (UTF-8 bytes decoded with a single-byte codec).
print(f"\n✅ Preview saved as: {preview_path}")

In [None]:
# ============================
# Baseline Models for IMDB
# ============================

import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report

# ------------------------
# 1. Load IMDB dataset
# ------------------------
# Read the raw reviews CSV into `df`, rebinding any existing `df`.
# NOTE(review): duplicate rows are not removed here — confirm whether they
# should be dropped before the train/test split.
imdb_csv = "IMDB Dataset.csv"
df = pd.read_csv(imdb_csv)

# ------------------------
# 2. Cleaning
# ------------------------
def clean_text(text):
    """Return `text` lowercased and reduced to space-separated a-z words.

    HTML tags, URLs and every character other than ``a-z`` or a space are
    replaced by a space; whitespace runs are then collapsed and the result
    is stripped.
    """
    result = str(text).lower()
    # Applied in order: tags, URLs, non-letters, whitespace runs.
    for pattern in (r"<.*?>", r"http\S+|www\S+", r"[^a-z ]", r"\s+"):
        result = re.sub(pattern, " ", result)
    return result.strip()

# Cleaned text of every review, used as the model input.
df["clean_review"] = df["review"].map(clean_text)

# ------------------------
# 3. Train-test split
# ------------------------
# Hold out 20% of the reviews for testing; the seed is fixed for repeatability.
reviews = df["clean_review"]
labels = df["sentiment"]
X_train, X_test, y_train, y_test = train_test_split(
    reviews,
    labels,
    test_size=0.2,
    random_state=42,
)

# ------------------------
# 4. TF-IDF Vectorization
# ------------------------
# Unigrams and bigrams, capped at 20000 features. The vectorizer is fitted on
# the training split only and then reused to transform the test split.
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=20000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

def _print_report(title, y_true, y_pred):
    """Print a banner, accuracy, weighted F1 and the confusion matrix."""
    print("\n====================")
    print(title)
    print("====================")
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("F1-score:", f1_score(y_true, y_pred, average="weighted"))
    print(confusion_matrix(y_true, y_pred))


# ------------------------
# 5. Logistic Regression
# ------------------------
lr = LogisticRegression(max_iter=200)
lr.fit(X_train_tfidf, y_train)
lr_pred = lr.predict(X_test_tfidf)
_print_report("LOGISTIC REGRESSION", y_test, lr_pred)

# ------------------------
# 6. Naive Bayes
# ------------------------
nb = MultinomialNB()
nb.fit(X_train_tfidf, y_train)
nb_pred = nb.predict(X_test_tfidf)
_print_report("NAIVE BAYES", y_test, nb_pred)


In [None]:
!pip install transformers datasets torch --quiet

In [None]:
# ============================
# BERT Fine-Tuning on IMDB
# ============================

from datasets import load_dataset
from transformers import BertTokenizerFast, BertForSequenceClassification
from transformers import TrainingArguments, Trainer
import torch

# ----------------------------
# 1. Load IMDB dataset
# ----------------------------
# Load the dataset registered under the name "imdb" via `datasets`.
dataset_id = "imdb"
dataset = load_dataset(dataset_id)

# ----------------------------
# 2. Tokenization
# ----------------------------
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")


def tokenize(batch):
    """Tokenize the ``text`` field of a batch, padded/truncated to 256 tokens."""
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 256,
    }
    return tokenizer(batch["text"], **encode_options)


# Tokenize every split in batches, then expose only the listed columns as
# torch tensors.
dataset = dataset.map(tokenize, batched=True)
tensor_columns = ["input_ids", "attention_mask", "label"]
dataset.set_format("torch", columns=tensor_columns)

# ----------------------------
# 3. Load BERT model
# ----------------------------
# Pretrained BERT with a sequence-classification head configured for 2 labels.
bert_checkpoint = "bert-base-uncased"
model = BertForSequenceClassification.from_pretrained(bert_checkpoint, num_labels=2)

# ----------------------------
# 4. Training arguments
# ----------------------------
import inspect

# The per-epoch evaluation keyword is `eval_strategy` in newer transformers
# releases and `evaluation_strategy` in older ones. Passing the name the
# installed version does not know raises TypeError, so select it dynamically.
_ta_params = inspect.signature(TrainingArguments.__init__).parameters
_eval_kw = "eval_strategy" if "eval_strategy" in _ta_params else "evaluation_strategy"

training_args = TrainingArguments(
    output_dir="./bert-imdb",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_steps=200,
    **{_eval_kw: "epoch"},  # evaluate once per epoch, matching save_strategy
)

# ----------------------------
# 5. Trainer
# ----------------------------
# Wire the model, the training arguments and the train/test splits together.
trainer_config = {
    "model": model,
    "args": training_args,
    "train_dataset": dataset["train"],
    "eval_dataset": dataset["test"],
}
trainer = Trainer(**trainer_config)

# ----------------------------
# 6. Train model
# ----------------------------
trainer.train()

# ----------------------------
# 7. Evaluate
# ----------------------------
# Run evaluation with the trainer and print the returned metrics under a banner.
results = trainer.evaluate()
print("\n===== BERT RESULTS =====", results, sep="\n")


In [None]:
# Rebuild the trainer from the current model, arguments and dataset splits,
# then start training.
trainer_parts = dict(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
)
trainer = Trainer(**trainer_parts)

trainer.train()

In [None]:
# Reload the "imdb" dataset, replacing the current `dataset` object.
imdb_name = "imdb"
dataset = load_dataset(imdb_name)

In [None]:
# Recreate the tokenizer for the uncased base BERT checkpoint.
tokenizer_checkpoint = "bert-base-uncased"
tokenizer = BertTokenizerFast.from_pretrained(tokenizer_checkpoint)

In [None]:
# Tokenize all splits in batches, then expose only the listed columns as
# torch tensors.
dataset = dataset.map(tokenize, batched=True)
exposed_columns = ["input_ids", "attention_mask", "label"]
dataset.set_format("torch", columns=exposed_columns)

In [None]:
# Load pretrained BERT again with a classification head for 2 labels.
model_checkpoint = "bert-base-uncased"
model = BertForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)

In [None]:
from transformers import TrainingArguments

import inspect

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases; the name the installed version does not know raises TypeError.
# Use whichever keyword the installed TrainingArguments accepts.
_ta_params = inspect.signature(TrainingArguments.__init__).parameters
_eval_kw = "eval_strategy" if "eval_strategy" in _ta_params else "evaluation_strategy"

training_args = TrainingArguments(
    output_dir="./bert-imdb",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_steps=200,
    **{_eval_kw: "epoch"},  # evaluate once per epoch, matching save_strategy
)

In [None]:
from transformers import TrainingArguments

import inspect

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases; the name the installed version does not know raises TypeError.
# Use whichever keyword the installed TrainingArguments accepts.
_ta_params = inspect.signature(TrainingArguments.__init__).parameters
_eval_kw = "eval_strategy" if "eval_strategy" in _ta_params else "evaluation_strategy"

training_args = TrainingArguments(
    output_dir="./bert-imdb",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_steps=200,
    **{_eval_kw: "epoch"},  # evaluate once per epoch, matching save_strategy
)

In [None]:
pip install --upgrade transformers

In [None]:
from transformers import TrainingArguments

import inspect

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases; the name the installed version does not know raises TypeError.
# Use whichever keyword the installed TrainingArguments accepts.
_ta_params = inspect.signature(TrainingArguments.__init__).parameters
_eval_kw = "eval_strategy" if "eval_strategy" in _ta_params else "evaluation_strategy"

training_args = TrainingArguments(
    output_dir="./bert-imdb",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_steps=200,
    **{_eval_kw: "epoch"},  # evaluate once per epoch, matching save_strategy
)

In [None]:
from transformers import TrainingArguments

# Training configuration without explicit evaluation/save strategies; only
# the do_train / do_eval flags are set.
basic_args = {
    "output_dir": "./bert-imdb",
    "do_train": True,
    "do_eval": True,
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 16,
    "learning_rate": 2e-5,
    "num_train_epochs": 2,
    "weight_decay": 0.01,
    "logging_steps": 500,
}
training_args = TrainingArguments(**basic_args)

In [None]:
from transformers import TrainingArguments

# Same configuration as a keyword mapping: batch size 16 for train and eval,
# 2 epochs, learning rate 2e-5, weight decay 0.01, logging every 500 steps.
args_spec = dict(
    output_dir="./bert-imdb",
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_steps=500,
)
training_args = TrainingArguments(**args_spec)


In [None]:
# Rebuild the training configuration, grouped by concern.
run_flags = {"output_dir": "./bert-imdb", "do_train": True, "do_eval": True}
batch_sizes = {"per_device_train_batch_size": 16, "per_device_eval_batch_size": 16}
optimisation = {"learning_rate": 2e-5, "num_train_epochs": 2, "weight_decay": 0.01}
training_args = TrainingArguments(
    **run_flags,
    **batch_sizes,
    **optimisation,
    logging_steps=500,
)


In [None]:
pip install --upgrade transformers accelerate datasets

In [None]:
from transformers import TrainingArguments, Trainer

In [None]:
# Concrete training configuration. The previous `TrainingArguments(...)` call
# passed the literal Ellipsis object as its only argument, i.e. it was an
# unfilled placeholder rather than a usable configuration.
training_args = TrainingArguments(
    output_dir="./bert-imdb",
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_steps=500,
)

In [None]:
# Build the trainer from the objects defined in earlier cells. The previous
# `Trainer(...)` call passed the literal Ellipsis object instead of a model,
# so training could not start.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
)
trainer.train()

In [None]:
from transformers import TrainingArguments

# Training configuration written to a separate output directory; logging
# every 200 steps, 2 epochs.
output_config = dict(
    output_dir="./bert_imdb_output",
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_steps=200,
)
training_args = TrainingArguments(**output_config)

In [None]:
# Assemble the trainer from the current model, arguments and dataset splits,
# then start training.
train_split = dataset["train"]
test_split = dataset["test"]
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=test_split,
)

trainer.train()

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Tokenizer and model are both loaded from the same checkpoint name.
auto_checkpoint = "bert-base-uncased"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(auto_checkpoint)

# Load BERT model for binary classification (positive/negative)
model = AutoModelForSequenceClassification.from_pretrained(auto_checkpoint, num_labels=2)

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from datasets import load_dataset

In [None]:
# Load a fresh copy of the "imdb" dataset (not yet tokenized).
hub_dataset_name = "imdb"
dataset = load_dataset(hub_dataset_name)

In [None]:
# Tokenizer for the uncased base BERT checkpoint.
tokenizer_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

In [None]:
def tokenize(batch):
    """Tokenize the ``text`` field of a batch, padded/truncated to 128 tokens."""
    texts = batch["text"]
    return tokenizer(
        texts,
        max_length=128,
        padding="max_length",
        truncation=True,
    )

In [None]:
# Tokenize all splits in batches.
dataset = dataset.map(tokenize, batched=True)
# Rename the target column from "label" to "labels".
dataset = dataset.rename_column("label", "labels")
# Expose only the listed columns, as torch tensors.
final_columns = ["input_ids", "attention_mask", "labels"]
dataset.set_format("torch", columns=final_columns)

In [None]:
# Pretrained BERT with a classification head configured for 2 labels.
classifier_checkpoint = "bert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(classifier_checkpoint, num_labels=2)

In [None]:
# Final training configuration: a single epoch, batch size 16 for train and
# eval, logging every 200 steps.
final_config = {
    "output_dir": "./bert_imdb_output",
    "do_train": True,
    "do_eval": True,
    "learning_rate": 2e-5,
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 16,
    "num_train_epochs": 1,
    "weight_decay": 0.01,
    "logging_steps": 200,
}
training_args = TrainingArguments(**final_config)

In [None]:
# Trainer wired to the current model, arguments and train/test splits.
final_trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
)
trainer = Trainer(**final_trainer_kwargs)

In [None]:
# Start training. The stray trailing `stop` token made this line a
# SyntaxError, so the cell could never run.
trainer.train()