In [1]:
import pandas as pd
import numpy as np

In [3]:
# Step 1: Load Dataset
file_path = 'moviereviews[1].csv'  # Update with the actual dataset path
df = pd.read_csv(file_path)

In [4]:
df.head()

Unnamed: 0,index,label,review
0,0,neg,how do films like mouse hunt get into theatres...
1,1,neg,some talented actresses are blessed with a dem...
2,2,pos,this has been an extraordinary year for austra...
3,3,pos,according to hollywood movies made in last few...
4,4,neg,my first press screening of 1998 and already i...


In [5]:
# Ensure correct column names
if set(df.columns) != {"index", "label", "review"}:
    raise ValueError("Dataset does not have the required columns: 'index', 'label', and 'review'")
df = df[["review", "label"]]
df.columns = ["text", "label"]

In [6]:
# Debugging: Display dataset information
print("Initial Dataset Info:")
print(df.info())
print("Sample Data:", df.head(), sep="\n")

Initial Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    1965 non-null   object
 1   label   2000 non-null   object
dtypes: object(2)
memory usage: 31.4+ KB
None
Sample Data:
                                                text label
0  how do films like mouse hunt get into theatres...   neg
1  some talented actresses are blessed with a dem...   neg
2  this has been an extraordinary year for austra...   pos
3  according to hollywood movies made in last few...   pos
4  my first press screening of 1998 and already i...   neg


In [13]:
# Step 2: Preprocessing
def preprocess_text(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9.,!?]", " ", text)
    return text

In [14]:
df["text"] = df["text"].apply(preprocess_text)

In [16]:
# Map labels to binary values
label_mapping = {"pos": 1, "neg": 0}  # Adjust as necessary
if not all(label in label_mapping for label in df["label"].unique()):
    raise ValueError("Label values do not match expected mapping.")
df["label"] = df["label"].map(label_mapping)

In [17]:
# Drop invalid rows
df = df.dropna(subset=["text", "label"])
df = df[df['text'].str.strip() != ""]

In [18]:
# Debugging: Check dataset after preprocessing
print("Dataset Info After Preprocessing:")
print(df.info())

Dataset Info After Preprocessing:
<class 'pandas.core.frame.DataFrame'>
Index: 1938 entries, 0 to 1999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    1938 non-null   object
 1   label   1938 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 45.4+ KB
None


In [19]:
# Ensure dataset is not empty
if df.empty:
    raise ValueError("The dataset is empty after preprocessing. Please check your input data.")

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

In [21]:
# Split into training and testing sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["text"], df["label"], test_size=0.2, random_state=42
)


In [22]:
# Step 3: Naive Bayes Implementation
vectorizer_nb = CountVectorizer()
X_train_nb = vectorizer_nb.fit_transform(train_texts)
X_test_nb = vectorizer_nb.transform(test_texts)

In [23]:
nb_model = MultinomialNB()
nb_model.fit(X_train_nb, train_labels)
nb_preds = nb_model.predict(X_test_nb)
nb_accuracy = accuracy_score(test_labels, nb_preds)

In [24]:
import pickle

In [25]:
# Save Naive Bayes model
with open("naive_bayes_model.pkl", "wb") as f:
    pickle.dump(nb_model, f)


In [26]:
# Step 4: SVM Implementation
vectorizer_svm = TfidfVectorizer()
X_train_svm = vectorizer_svm.fit_transform(train_texts)
X_test_svm = vectorizer_svm.transform(test_texts)


In [27]:
svm_model = SVC(kernel="linear")
svm_model.fit(X_train_svm, train_labels)
svm_preds = svm_model.predict(X_test_svm)
svm_accuracy = accuracy_score(test_labels, svm_preds)

In [28]:
# Save SVM model
with open("svm_model.pkl", "wb") as f:
    pickle.dump(svm_model, f)

In [29]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, pipeline
import torch

In [58]:
# Step 5: Transformer-Based Model
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [59]:
def tokenize_function(examples):
    return tokenizer(
        examples["text"], padding="max_length", truncation=True, max_length=512
    )

In [77]:
train_dataset = Dataset.from_dict({"text": train_texts.tolist(), "label": train_labels.tolist()})
test_dataset = Dataset.from_dict({"text": test_texts.tolist(), "label": test_labels.tolist()})

In [78]:
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/1550 [00:00<?, ? examples/s]

Map:   0%|          | 0/388 [00:00<?, ? examples/s]

In [62]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    load_best_model_at_end=True
)




In [63]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [None]:
trainer.train()

In [65]:
# Save Transformer model
model.save_pretrained("transformer_model")
tokenizer.save_pretrained("transformer_model")

('transformer_model/tokenizer_config.json',
 'transformer_model/special_tokens_map.json',
 'transformer_model/vocab.txt',
 'transformer_model/added_tokens.json',
 'transformer_model/tokenizer.json')

In [66]:
# Step 6: Evaluation and Comparison
transformer_pipeline = pipeline(
    "sentiment-analysis", model=model, tokenizer=tokenizer, truncation=True
)

In [68]:
transformer_preds = [
    1 if pred["label"] == "LABEL_1" else 0
    for pred in transformer_pipeline(test_texts.tolist(), truncation=True, max_length=512)
]

In [73]:
transformer_accuracy = accuracy_score(test_labels, transformer_preds)

In [74]:
import os

In [75]:
# Model Sizes
nb_size = os.path.getsize("naive_bayes_model.pkl")
svm_size = os.path.getsize("svm_model.pkl")
transformer_size = sum(
    os.path.getsize(os.path.join("transformer_model", file)) for file in os.listdir("transformer_model")
)

In [76]:
# Print Results
print("Naive Bayes Accuracy:", nb_accuracy)
print("SVM Accuracy:", svm_accuracy)
print("Transformer Accuracy:", transformer_accuracy)
print("Naive Bayes Model Size (bytes):", nb_size)
print("SVM Model Size (bytes):", svm_size)
print("Transformer Model Size (bytes):", transformer_size)

Naive Bayes Accuracy: 0.8324742268041238
SVM Accuracy: 0.8505154639175257
Transformer Accuracy: 0.4742268041237113
Naive Bayes Model Size (bytes): 1137128
SVM Model Size (bytes): 5417353
Transformer Model Size (bytes): 268777617
