In [4]:
# =============================================================================
# Project: English Sentiment Analysis (Pro-Scale)
# Description: Sentiment classification using the IMDB Dataset (50k reviews).
#              Leverages TF-IDF Vectorization and Logistic Regression for 
#              high-accuracy production-ready performance. For this demo the
#              model is trained and evaluated on a 10,000-review random sample.
# Author: Maria Alsadiq
# =============================================================================

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline

# -----------------------------------------------------------------------------
# 1. Data Loading (IMDB Dataset)
# -----------------------------------------------------------------------------
print(" Downloading IMDB Dataset...")
dataset_url = "https://raw.githubusercontent.com/Ankit152/IMDB-sentiment-analysis/master/IMDB-Dataset.csv"

try:
    df = pd.read_csv(dataset_url)
except Exception as e:
    # Report the failure, then re-raise. Swallowing it here would leave `df`
    # undefined and surface later as a misleading NameError at the sampling
    # step instead of the real download/parse error.
    print(f" Error: {e}")
    raise
else:
    print(f" Dataset Loaded. Total Rows: {df.shape[0]}")

# Sampling for performance (using up to 10,000 reviews for this demo).
# DataFrame.sample raises ValueError when n exceeds the number of rows
# (without replacement), so cap the request at the size of the loaded frame.
sample_size = min(10000, len(df))
df_sample = df.sample(n=sample_size, random_state=42)

# -----------------------------------------------------------------------------
# 2. Preprocessing & Pipeline Construction
# -----------------------------------------------------------------------------
# Features are the raw review texts; targets are the sentiment labels.
X, y = df_sample['review'], df_sample['sentiment']

# Hold out 20% of the sample for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Two-stage pipeline:
#   'tfidf' -> turns each review into a TF-IDF weighted vector, dropping
#              English stop words and keeping at most 5000 features.
#   'clf'   -> logistic regression classifier fitted on those vectors.
tfidf_step = TfidfVectorizer(stop_words='english', max_features=5000)
classifier_step = LogisticRegression()

model_pipeline = Pipeline(steps=[
    ('tfidf', tfidf_step),
    ('clf', classifier_step),
])

# -----------------------------------------------------------------------------
# 3. Training & Evaluation
# -----------------------------------------------------------------------------
# Fit the vectorizer and the classifier together on the training split.
print("Training model...")
model_pipeline.fit(X_train, y_train)
print("Training Complete.")

# Predict labels for the held-out reviews.
y_pred = model_pipeline.predict(X_test)

# Overall accuracy, followed by the per-class precision/recall/F1 table.
accuracy = accuracy_score(y_test, y_pred)
accuracy_line = f"\nModel Accuracy: {accuracy * 100:.2f}%"
print(accuracy_line)

report_text = classification_report(y_test, y_pred)
print("\n--- Classification Report ---\n")
print(report_text)

# -----------------------------------------------------------------------------
# 4. Live Testing (Unseen Data)
# -----------------------------------------------------------------------------
print("\n--- Live Testing on New Reviews ---")
custom_reviews = [
    "The movie was absolutely wonderful and the acting was great!",
    "I hated every minute of it. Complete waste of time.",
    "It was okay, not the best but not the worst.",
    "The plot was confusing and the characters were shallow.",
    "A masterpiece! I will watch it again."
]

# Score every review in a single batch instead of invoking the full pipeline
# twice per review inside the loop. Each document is vectorized independently,
# so the per-review labels and probabilities are the same as one-at-a-time calls.
predictions = model_pipeline.predict(custom_reviews)
# predict_proba returns one row per review; the row maximum is the probability
# of the predicted class, scaled here to a percentage.
confidences = model_pipeline.predict_proba(custom_reviews).max(axis=1) * 100

for review, prediction, confidence in zip(custom_reviews, predictions, confidences):
    print(f"Review: {review}")
    print(f"Sentiment: {prediction} (Confidence: {confidence:.1f}%)")
    print("-" * 30)

 Downloading IMDB Dataset...
 Dataset Loaded. Total Rows: 50000
Training model...
Training Complete.

Model Accuracy: 87.25%

--- Classification Report ---

              precision    recall  f1-score   support

    negative       0.89      0.84      0.87       999
    positive       0.85      0.90      0.88      1001

    accuracy                           0.87      2000
   macro avg       0.87      0.87      0.87      2000
weighted avg       0.87      0.87      0.87      2000


--- Live Testing on New Reviews ---
Review: The movie was absolutely wonderful and the acting was great!
Sentiment: positive (Confidence: 92.2%)
------------------------------
Review: I hated every minute of it. Complete waste of time.
Sentiment: negative (Confidence: 92.5%)
------------------------------
Review: It was okay, not the best but not the worst.
Sentiment: negative (Confidence: 93.0%)
------------------------------
Review: The plot was confusing and the characters were shallow.
Sentiment: negative 

In [None]:
# ==========================================
# 5. Interactive Mode (Try it yourself!) 
# ==========================================
print("\n--- English Sentiment Analysis (Interactive Demo) ---")
print("Type your review below. Type 'exit' or 'quit' to stop.\n")

# Commands that end the session ('stop' is accepted as an extra alias).
exit_commands = {'exit', 'quit', 'stop'}

while True:
    try:
        user_input = input("Enter text: ")
    except (EOFError, KeyboardInterrupt):
        # Closed stdin or an interrupt ends the demo cleanly instead of
        # leaving a traceback in the cell output.
        print("\nGoodbye!")
        break

    # Normalize once so surrounding whitespace cannot hide an exit command
    # (e.g. "exit " would otherwise be classified as a review).
    command = user_input.strip().lower()

    # Skip blank input rather than scoring an empty document.
    if not command:
        continue

    # Exit condition
    if command in exit_commands:
        print("Goodbye!")
        break

    prediction = model_pipeline.predict([user_input])[0]
    # Highest class probability for this single review, as a percentage.
    prob = model_pipeline.predict_proba([user_input]).max() * 100

    # Written as Unicode escapes (glowing star / thumbs down) so the emoji
    # survive re-encoding of the notebook file; the previous literals were
    # mis-decoded UTF-8 bytes.
    emoji = "\U0001F31F" if prediction == "positive" else "\U0001F44E"
    print(f"  Review: {user_input}")
    print(f"  Sentiment: {prediction.upper()} {emoji}")
    print(f"  Confidence: {prob:.1f}%")
    print("-" * 30)


--- English Sentiment Analysis (Interactive Demo) ---
Type your review below. Type 'exit' or 'quit' to stop.



Enter text:  none of the characters were realistic


  Review: none of the characters were realistic
  Sentiment: POSITIVE 🌟
  Confidence: 74.3%
------------------------------


Enter text:  liked the plot twist


  Review: liked the plot twist
  Sentiment: POSITIVE 🌟
  Confidence: 61.5%
------------------------------


Enter text:  thrilling


  Review: thrilling
  Sentiment: POSITIVE 🌟
  Confidence: 65.0%
------------------------------


Enter text:  good


  Review: good
  Sentiment: POSITIVE 🌟
  Confidence: 88.3%
------------------------------


Enter text:  amazing


  Review: amazing
  Sentiment: POSITIVE 🌟
  Confidence: 97.6%
------------------------------


Enter text:  good character development 


  Review: good character development 
  Sentiment: POSITIVE 🌟
  Confidence: 50.5%
------------------------------


Enter text:  فاشل


  Review: فاشل
  Sentiment: POSITIVE 🌟
  Confidence: 56.8%
------------------------------


Enter text:  ding ding


  Review: ding ding
  Sentiment: POSITIVE 🌟
  Confidence: 56.8%
------------------------------


Enter text:  are you ok


  Review: are you ok
  Sentiment: NEGATIVE 👎
  Confidence: 65.7%
------------------------------


Enter text:  the director is crazy


  Review: the director is crazy
  Sentiment: POSITIVE 🌟
  Confidence: 60.6%
------------------------------
