# “Cinemania”

In [11]:
!pip install -U transformers -q

In [12]:
!pip install -U datasets evaluate accelerate -q

In [16]:

import pandas as pd
import numpy as np
from sklearn.utils import shuffle
from pathlib import Path

CSV_PATH = "IMDB Dataset.csv"   # 파일 (Kaggle)
assert Path(CSV_PATH).exists(), "IMDB Dataset.csv 못 잦았어요!"

df = pd.read_csv(CSV_PATH)
df = df[['review', 'sentiment']].rename(columns={'review':'text', 'sentiment':'label'})

# positiv and negativ 1000 개
pos = df[df['label']=='positive'].sample(n=1000, random_state=101)
neg = df[df['label']=='negative'].sample(n=1000, random_state=101)

final_df = shuffle(pd.concat([pos, neg], axis=0), random_state=101).reset_index(drop=True)

# Label 0/1 에 변경
label_map = {'negative':0, 'positive':1}
final_df['y'] = final_df['label'].map(label_map)
final_df.head()


Unnamed: 0,text,label,y
0,They give you the set up then bore you to deat...,negative,0
1,I love this film. Shehzad Khan's portrayal as ...,positive,1
2,"Well, I guess I'll have to be the one to say ""...",negative,0
3,After sitting through this god-awful 82-minute...,negative,0
4,"""Back of Beyond"" takes place at a dive diner/g...",negative,0


In [17]:
import re
from collections import Counter

def simple_tokenize(s: str):
    # 소문자 변환 → HTML 태그 제거 → 문장부호를 공백으로 치환 → 토큰 리스트 생성
    s = re.sub(r"<.*?>", " ", s)
    s = re.sub(r"[^a-zA-Z0-9']", " ", s)
    return [t for t in s.lower().split() if t]

# 2.1 어휘 사전 (Top 4000 + 특수 토큰)
all_tokens = []
for txt in final_df['text']:
    all_tokens.extend(simple_tokenize(txt))

freq = Counter(all_tokens)
most_common = [w for w,_ in freq.most_common(4000)]
# 특수 토큰
PAD_TOKEN, UNK_TOKEN = "<pad>", "<unk>"
itos = [PAD_TOKEN, UNK_TOKEN] + most_common           # index->token
stoi = {tok:i for i,tok in enumerate(itos)}          # token->index

# 2.2 텍스트를 인덱스로 변환하기
def encode_review(text, max_len=256):
    toks = simple_tokenize(text)
    idxs = [stoi.get(t, 1) for t in toks]            # 1 -> <unk>
    if len(idxs) >= max_len:
        return idxs[:max_len]
    return idxs + [0]*(max_len - len(idxs))          # 0 -> <pad>

final_df['input_ids'] = final_df['text'].apply(lambda s: encode_review(s, max_len=256))


In [18]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

class ReviewsDataset(Dataset):
    def __init__(self, inputs, labels):
        self.inputs = inputs
        self.labels = labels
    def __len__(self):
        return len(self.inputs)
    def __getitem__(self, i):
        x = torch.tensor(self.inputs[i], dtype=torch.long)
        y = torch.tensor(self.labels[i], dtype=torch.float32)  # BCE를 위한  float
        return x, y

# Train/Test (1600/400)
SEED = 123
np.random.seed(SEED)
torch.manual_seed(SEED)

train_size = 1600
train_df = final_df.iloc[:train_size].copy()
test_df  = final_df.iloc[train_size:train_size+400].copy()

train_ds = ReviewsDataset(train_df['input_ids'].tolist(), train_df['y'].tolist())
test_ds  = ReviewsDataset(test_df['input_ids'].tolist(),  test_df['y'].tolist())

train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
test_loader  = DataLoader(test_ds,  batch_size=64, shuffle=False)

class SentimentRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim=128, hidden_dim=256):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.fc  = nn.Linear(hidden_dim, 1)
        self.sig = nn.Sigmoid()
    def forward(self, x):
        # x: [B, T]
        emb = self.emb(x)                     # [B, T, E]
        out, h_n = self.rnn(emb)              # h_n: [1, B, H] — 마지막 은닉 상태
        logits = self.fc(h_n.squeeze(0))      # [B, 1]
        prob = self.sig(logits).squeeze(1)    # [B]
        return prob

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_rnn = SentimentRNN(vocab_size=len(itos), embedding_dim=128, hidden_dim=256).to(device)

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model_rnn.parameters(), lr=0.001)

# 10 epoch 읽기
EPOCHS = 10
for epoch in range(1, EPOCHS+1):
    model_rnn.train()
    running = 0.0
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        prob = model_rnn(xb)
        loss = criterion(prob, yb)
        loss.backward()
        optimizer.step()
        running += loss.item()*xb.size(0)
    print(f"[RNN] Epoch {epoch:02d} | train_loss: {running/len(train_ds):.4f}")

# Test (loss + accuracy)
model_rnn.eval()
test_loss, correct, n = 0.0, 0, 0
with torch.no_grad():
    for xb, yb in test_loader:
        xb, yb = xb.to(device), yb.to(device)
        prob = model_rnn(xb)
        loss = criterion(prob, yb)
        test_loss += loss.item()*xb.size(0)
        pred = (prob >= 0.5).long()
        correct += (pred.cpu() == yb.long().cpu()).sum().item()
        n += xb.size(0)
print(f"[RNN] test_loss: {test_loss/n:.4f} | test_acc: {correct/n:.4f}")


[RNN] Epoch 01 | train_loss: 0.7078
[RNN] Epoch 02 | train_loss: 0.6615
[RNN] Epoch 03 | train_loss: 0.7092
[RNN] Epoch 04 | train_loss: 0.6981
[RNN] Epoch 05 | train_loss: 0.7004
[RNN] Epoch 06 | train_loss: 0.6892
[RNN] Epoch 07 | train_loss: 0.6892
[RNN] Epoch 08 | train_loss: 0.6796
[RNN] Epoch 09 | train_loss: 0.6745
[RNN] Epoch 10 | train_loss: 0.6694
[RNN] test_loss: 0.6899 | test_acc: 0.5425


In [19]:
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

# 이 버전은 1600/400 설정으로 작동
train_texts, test_texts, train_labels, test_labels = train_test_split(
    final_df['text'], final_df['y'], test_size=0.20, random_state=123, stratify=final_df['y']
)

hf_train = Dataset.from_pandas(pd.DataFrame({'text':train_texts.values, 'label':train_labels.values}))
hf_test  = Dataset.from_pandas(pd.DataFrame({'text':test_texts.values,  'label':test_labels.values}))
dset = DatasetDict({'train': hf_train, 'test': hf_test})

from transformers import AutoTokenizer
tok = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def preprocess_function(batch):
    return tok(batch["text"], truncation=True, padding="max_length", max_length=256)

tokenized = dset.map(preprocess_function, batched=True, remove_columns=['text'])
tokenized = tokenized.rename_column("label", "labels")
tokenized.set_format(type='torch', columns=['input_ids','attention_mask','labels'])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/1600 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

In [20]:
!pip install -U evaluate



In [21]:
!pip install -U transformers -q


In [22]:
# ==== 5-단계: DistilBERT (legacy-safe) ====
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
import torch

# 1) Model: 2개 label (neg/pos)
model_bert = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2
)

# 2) evaluate 라이브러리 없이 간단한 정확도 계산
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    # logits -> numpy
    if isinstance(logits, torch.Tensor):
        logits = logits.detach().cpu().numpy()
    preds = np.argmax(logits, axis=-1)
    acc = (preds == labels).mean()
    return {"accuracy": float(acc)}

# 3) TrainingArguments — 예전 버전에만 존재하는 명확한 파라미터들
args = TrainingArguments(
    output_dir="./cinemania_bert",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    logging_steps=100,
    seed=123,
    report_to="none"  # 마무 것에 log 보내지 않아요
)

# 4) Trainer
trainer = Trainer(
    model=model_bert,
    args=args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    tokenizer=tok,
    compute_metrics=compute_metrics
)

# 5) 학습과 평가
trainer.train()
eval_result = trainer.evaluate()
print("\n📊 평가 결과:", eval_result)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
100,0.4338
200,0.1778
300,0.0678



📊 평가 결과: {'eval_loss': 0.40999385714530945, 'eval_accuracy': 0.9, 'eval_runtime': 2.9852, 'eval_samples_per_second': 133.996, 'eval_steps_per_second': 4.355, 'epoch': 3.0}


In [23]:
# --- RNN 위한 (SentimentRNN) ---
def predict_rnn(text: str, max_length=256):
    device = next(model_rnn.parameters()).device
    model_rnn.eval()
    encoded = encode_review(text, max_len=max_length)
    x = torch.tensor(encoded, dtype=torch.long).unsqueeze(0).to(device)
    with torch.no_grad():
        prob = model_rnn(x).item()
    label = "positive" if prob >= 0.5 else "negative"
    return {"label": label, "prob_positive": round(prob, 3)}


# --- Transformer (BERT) 위한 ---
def predict_bert(text: str, max_length=256):
    device = next(model_bert.parameters()).device
    model_bert.eval()
    inputs = tok(text, return_tensors="pt", truncation=True, padding=True, max_length=max_length)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        logits = model_bert(**inputs).logits
        probs = torch.softmax(logits, dim=-1).squeeze(0)
    p_pos = probs[1].item()
    label = "positive" if torch.argmax(probs).item() == 1 else "negative"
    return {"label": label, "prob_positive": round(p_pos, 3)}



In [26]:
samples = [
  "I had been waiting for this movie for a long time, but I didn’t like it at all.",
  "The cast is amazing, and the storyline is really interesting.",
  "The visual effects of the film are good, but the story is weak."
  ]

for s in samples:
    rnn_pred = predict_rnn(s)
    bert_pred = predict_bert(s)
    print("\n🎬 텍스트:", s)
    print("RNN  →", rnn_pred)
    print("BERT →", bert_pred)



🎬 텍스트: I had been waiting for this movie for a long time, but I didn’t like it at all.
RNN  → {'label': 'negative', 'prob_positive': 0.448}
BERT → {'label': 'negative', 'prob_positive': 0.004}

🎬 텍스트: The cast is amazing, and the storyline is really interesting.
RNN  → {'label': 'negative', 'prob_positive': 0.448}
BERT → {'label': 'positive', 'prob_positive': 0.994}

🎬 텍스트: The visual effects of the film are good, but the story is weak.
RNN  → {'label': 'negative', 'prob_positive': 0.448}
BERT → {'label': 'negative', 'prob_positive': 0.004}


In [27]:
print("\n📊 최종 평가:")
print(f"RNN test_accuracy: {correct/n:.4f}")  # 마지막 부분 – RNN 결과
print(f"BERT eval_accuracy: {eval_result['eval_accuracy']:.4f}")



📊 최종 평가:
RNN test_accuracy: 0.5425
BERT eval_accuracy: 0.9000


## 학습 과정의 손실 지표, 평가 결과, 그리고 새로운 리뷰에 대한 예측을 종합적으로 분석한 결과,DistilBERT(Transformer) 모델이 RNN 모델보다 훨씬 높은 정확도와 문맥 이해 능력을 보였다.RNN 모델은 구조가 단순하고 학습 속도가 빠르다는 장점이 있었지만,텍스트 내의 미묘한 의미나 감정적 뉘앙스를 제대로 구분하지 못했다.따라서 Cinemania 프로젝트에서는 Transformer 기반 모델이 더 신뢰할 수 있고 문맥에 민감한 해결책으로 판단된다.또한, 우즈베크어 리뷰 분석에서는 다소 오차가 있었지만, 영어 리뷰에서는 매우 우수한 결과를 보여주었다.