In [17]:
import os
import re
import string
import pickle
import pandas as pd
import numpy as np
import fastapi
import uvicorn
import joblib
import logging
import requests
import tarfile

In [18]:
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from starlette.middleware.cors import CORSMiddleware


In [19]:
# Grab a logger named after this module, then configure the root logger so
# that records at INFO level and above are emitted.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

In [20]:
# NOTE(review): `url` and `dataset_path` are not defined in any visible cell;
# they must be assigned before this cell runs, otherwise it raises NameError
# on a fresh kernel.

# Stream the archive to disk in chunks so it is never held in memory at once.
# The context manager releases the connection even if writing fails, and the
# timeout keeps a stalled server from hanging the notebook indefinitely.
with requests.get(url, stream=True, timeout=60) as response:
    # Fail fast on HTTP errors instead of saving an error page as the archive.
    response.raise_for_status()
    with open(dataset_path, "wb") as file:
        for chunk in response.iter_content(chunk_size=64 * 1024):
            if chunk:
                file.write(chunk)

print("Download completed!")

with tarfile.open(dataset_path, "r:gz") as tar:
    # Prefer the "data" extraction filter, which rejects members that would be
    # written outside the target directory. Interpreters without extraction
    # filters do not expose `tarfile.data_filter`, so fall back to the plain call.
    if hasattr(tarfile, "data_filter"):
        tar.extractall(filter="data")
    else:
        tar.extractall()

print("Dataset extracted successfully!")

# The archive is no longer needed once its contents are on disk.
os.remove(dataset_path)

Download completed!
Dataset extracted successfully!


In [21]:
# Root folder of the extracted dataset and its two split sub-folders.
dataset_dir = "aclImdb"
train_dir, test_dir = (
    os.path.join(dataset_dir, split_name) for split_name in ("train", "test")
)

In [23]:
def load_reviews(directory, sentiment):
    """Read every review file stored under ``directory/sentiment``.

    Args:
        directory: Path of the folder that contains the sentiment sub-folders.
        sentiment: Name of the sub-folder to read. ``"pos"`` is labelled 1;
            any other value is labelled 0.

    Returns:
        A ``(reviews, labels)`` tuple of equal-length lists: the stripped
        UTF-8 text of each file and its integer label, in sorted filename
        order.
    """
    folder_path = os.path.join(directory, sentiment)
    # The label depends only on the folder, so compute it once.
    label = 1 if sentiment == "pos" else 0

    reviews = []
    # os.listdir returns entries in arbitrary order; sorting makes the result
    # (and any seeded shuffle applied to it afterwards) reproducible.
    for file_name in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, file_name)
        if not os.path.isfile(file_path):
            # Sub-directories cannot be opened as review files; skip them.
            continue
        with open(file_path, "r", encoding="utf-8") as f:
            reviews.append(f.read().strip())

    labels = [label] * len(reviews)
    return reviews, labels


In [24]:
# Load the positive and negative training reviews together with their labels.
(train_pos, train_pos_labels), (train_neg, train_neg_labels) = (
    load_reviews(train_dir, polarity) for polarity in ("pos", "neg")
)

In [25]:
# Load the positive and negative test reviews together with their labels.
(test_pos, test_pos_labels), (test_neg, test_neg_labels) = (
    load_reviews(test_dir, polarity) for polarity in ("pos", "neg")
)

In [26]:
# Assemble one frame per split: positive rows first, then negative rows,
# with the review text and its 0/1 label side by side.
train_df = pd.DataFrame(
    {
        "review": [*train_pos, *train_neg],
        "sentiment": [*train_pos_labels, *train_neg_labels],
    }
)
test_df = pd.DataFrame(
    {
        "review": [*test_pos, *test_neg],
        "sentiment": [*test_pos_labels, *test_neg_labels],
    }
)


In [27]:
# Shuffle every row of each split with a fixed seed and renumber the index,
# so the positive and negative blocks are interleaved.
train_df, test_df = (
    frame.sample(frac=1, random_state=42).reset_index(drop=True)
    for frame in (train_df, test_df)
)

In [28]:
# Sanity check after shuffling: show the first training rows and the
# (rows, columns) shape of both splits.
print(train_df.head())
print("Training data size:", train_df.shape)
print("Testing data size:", test_df.shape)

                                              review  sentiment
0  In Panic In The Streets Richard Widmark plays ...          1
1  If you ask me the first one was really better ...          0
2  I am a big fan a Faerie Tale Theatre and I've ...          1
3  I just finished reading a book about Dillinger...          0
4  Greg Davis and Bryan Daly take some crazed sta...          0
Training data size: (25000, 2)
Testing data size: (25000, 2)


In [29]:
import re
import string
import nltk
from nltk.corpus import stopwords


In [30]:
# Fetch the NLTK stop-word corpus, then keep the English entries in a set so
# membership checks during preprocessing are constant-time.
nltk.download("stopwords")
stop_words = {*stopwords.words("english")}

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [31]:
# Translation table that deletes every character listed in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def preprocess_text(text):
    """Normalize a review for vectorization.

    The text is lower-cased, every ``string.punctuation`` character is
    removed, and whitespace-separated tokens found in the module-level
    ``stop_words`` collection are dropped.

    Args:
        text: The raw review string.

    Returns:
        The remaining tokens joined by single spaces.
    """
    text = text.lower()
    # Delete punctuation with a translation table. Interpolating the raw
    # punctuation string into a regex character class would read its `\]`
    # pair as an escaped bracket and leave backslashes in the text.
    text = text.translate(_PUNCTUATION_TABLE)
    # NOTE(review): punctuation is stripped before the stop-word check, so a
    # stop word that itself contains punctuation (e.g. an apostrophe) can
    # never match here - confirm whether that is intended.
    return " ".join(word for word in text.split() if word not in stop_words)

In [32]:
# Clean the review text of both splits in place with the same preprocessing
# function, so train and test are normalized identically.
for frame in (train_df, test_df):
    frame["review"] = frame["review"].apply(preprocess_text)


In [33]:
# Targets are the 0/1 sentiment columns of each split.
y_train, y_test = train_df["sentiment"], test_df["sentiment"]

# Fit the TF-IDF vectorizer on the training reviews only, then reuse the
# fitted vectorizer to encode the test reviews.
tfidf = TfidfVectorizer(max_features=5000)
X_train = tfidf.fit_transform(train_df["review"])
X_test = tfidf.transform(test_df["review"])

print("TF-IDF vectorization complete!")

TF-IDF vectorization complete!


In [34]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


In [35]:
# Train the classifier on the TF-IDF training matrix. fit() returns the
# estimator itself, so the trailing expression still displays the fitted
# model as the cell output.
model = LogisticRegression(max_iter=500).fit(X_train, y_train)
model

In [36]:
# Predict a sentiment label for every row of the TF-IDF test matrix.
y_pred = model.predict(X_test)

In [37]:
# Compare the predictions with the true test labels: overall accuracy to four
# decimals, followed by the classification report.
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred))

Model Accuracy: 0.8818
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.88      0.88     12500
           1       0.88      0.88      0.88     12500

    accuracy                           0.88     25000
   macro avg       0.88      0.88      0.88     25000
weighted avg       0.88      0.88      0.88     25000



In [38]:
# Persist the trained classifier to "sentiment_model.pkl"; the path is
# relative, so it is written to the current working directory.
joblib.dump(model, "sentiment_model.pkl")

['sentiment_model.pkl']

In [39]:
# Persist the fitted TF-IDF vectorizer next to the model.
# NOTE(review): anything that loads the model presumably needs this file too,
# to build matching features - confirm against the serving code.
joblib.dump(tfidf, "tfidf_vectorizer.pkl")

['tfidf_vectorizer.pkl']