In [2]:
import pandas as pd
import re
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from collections import Counter

# Step 1: Load datasets
df_train = pd.read_csv("IMDB Dataset.csv")      # must have 'review', 'sentiment'
df_unlabeled = pd.read_csv('tmdb_clean_english_reviews.csv')  # must have 'review'

# Step 2: Clean text
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r"<.*?>", "", text)
    text = re.sub(r"http\S+|www\S+", "", text)
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    return text.strip()

df_train['review'] = df_train['review'].apply(clean_text)
df_unlabeled['review'] = df_unlabeled['review'].apply(clean_text)

# Step 3: Label conversion
df_train['label'] = df_train['sentiment'].map({'positive': 1, 'negative': 0})

# Step 4: Build vocabulary (fixed)
def build_vocab(texts, min_freq=2):
    counter = Counter()
    for text in texts:
        counter.update(text.split())
    vocab = {'<PAD>': 0, '<UNK>': 1}
    index = 2
    for word, freq in counter.items():
        if freq >= min_freq:
            vocab[word] = index
            index += 1
    return vocab

vocab = build_vocab(df_train['review'])

# Step 5: Encode text (fixed)
def encode_text(text, vocab, max_len=100):
    tokens = text.split()
    ids = [vocab.get(token, vocab['<UNK>']) for token in tokens]
    ids = ids[:max_len]
    ids += [vocab['<PAD>']] * (max_len - len(ids))
    return ids

# Step 6: Dataset class
class ReviewDataset(Dataset):
    def __init__(self, texts, labels, vocab):
        self.reviews = [encode_text(r, vocab) for r in texts]
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.reviews[idx], dtype=torch.long),
            'label': torch.tensor(self.labels[idx], dtype=torch.float)
        }

# Step 7: Train/Val split
X_train, X_val, y_train, y_val = train_test_split(df_train['review'], df_train['label'], test_size=0.2, random_state=42)
train_dataset = ReviewDataset(X_train.tolist(), y_train.tolist(), vocab)
val_dataset = ReviewDataset(X_val.tolist(), y_val.tolist(), vocab)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

# Step 8: Define model
class SentimentModel(nn.Module):
    def __init__(self, vocab_size, embed_dim=64):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, 64, batch_first=True)
        self.fc = nn.Linear(64, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids):
        x = self.embedding(input_ids)
        _, (h_n, _) = self.lstm(x)
        out = self.fc(h_n[-1])
        return self.sigmoid(out).squeeze()

# Step 9: Train model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
vocab_size = max(vocab.values()) + 1  # Fix embedding size
model = SentimentModel(vocab_size).to(device)

loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

print("Training started...")
for epoch in range(5):
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        labels = batch['label'].to(device)
        preds = model(input_ids)
        loss = loss_fn(preds, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1} - Loss: {total_loss / len(train_loader):.4f}")

# Step 10: Predict on scraped data
class UnlabeledDataset(Dataset):
    def __init__(self, texts, vocab):
        self.reviews = [encode_text(r, vocab) for r in texts]
    def __len__(self):
        return len(self.reviews)
    def __getitem__(self, idx):
        return torch.tensor(self.reviews[idx], dtype=torch.long)

unlabeled_dataset = UnlabeledDataset(df_unlabeled['review'], vocab)
unlabeled_loader = DataLoader(unlabeled_dataset, batch_size=32)

model.eval()
predictions = []
with torch.no_grad():
    for batch in unlabeled_loader:
        input_ids = batch.to(device)
        preds = model(input_ids)
        predictions += (preds > 0.5).int().cpu().tolist()

# Step 11: Save results
df_unlabeled['predicted_sentiment'] = ['positive' if p == 1 else 'negative' for p in predictions]
df_unlabeled.to_csv("scraped_reviews_with_sentiment.csv", index=False)
print("✅ Predictions saved to scraped_reviews_with_sentiment.csv")


Training started...
Epoch 1 - Loss: 0.6842
Epoch 2 - Loss: 0.4898
Epoch 3 - Loss: 0.3331
Epoch 4 - Loss: 0.2514
Epoch 5 - Loss: 0.1826
✅ Predictions saved to scraped_reviews_with_sentiment.csv


In [3]:
df=pd.read_csv('scraped_reviews_with_sentiment.csv')

In [4]:
df.head()

Unnamed: 0,movie_id,movie_title,author,review,rating,predicted_sentiment
0,950387,A Minecraft Movie,tmdb98094809,alright buckle up because i just saw a movie t...,10.0,positive
1,950387,A Minecraft Movie,CinemaSerf,who doesnt like a white woolly llama well that...,6.0,positive
2,950387,A Minecraft Movie,CinemaSerf,who doesnt like a white woolly llama well that...,6.0,positive
3,950387,A Minecraft Movie,Jm_15,chickey jockey is so fun to watch i love it so...,,positive
4,574475,Final Destination Bloodlines,CinemaSerf,imagine if your roommate kept getting recurrin...,7.0,negative


In [5]:
df.drop(['movie_id','movie_title','author','rating'],axis=1,inplace=True)

In [10]:
df.head()
df.rename({'predicted_sentiment':'sentiment'},axis=1,inplace=True)

In [11]:
df.head()

Unnamed: 0,review,sentiment
0,alright buckle up because i just saw a movie t...,positive
1,who doesnt like a white woolly llama well that...,positive
2,who doesnt like a white woolly llama well that...,positive
3,chickey jockey is so fun to watch i love it so...,positive
4,imagine if your roommate kept getting recurrin...,negative


In [12]:
df1=pd.read_csv('IMDB Dataset.csv')

In [13]:
df2=pd.concat([df1,df])

In [14]:
df2.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [15]:
df2.sentiment.value_counts()

sentiment
positive    35063
negative    30404
Name: count, dtype: int64