<a href="https://colab.research.google.com/github/Neallaz/Machine-Learning/blob/main/Movie%20Review%20Sentiment%20Analysis%20(Text%20Classification).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import nltk
nltk.download('movie_reviews')
from nltk.corpus import movie_reviews

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import string
from nltk.corpus import stopwords
nltk.download('stopwords')


[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [2]:
import random

documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

random.shuffle(documents)

# Convert list of words back to raw text
texts = [' '.join(doc) for doc, _ in documents]
labels = [1 if category == 'pos' else 0 for _, category in documents]

df = pd.DataFrame({'review': texts, 'label': labels})
df.head()


Unnamed: 0,review,label
0,"these days , we are witnessing the deluge of f...",0
1,i guess it ' s a credit to jackie chan and the...,1
2,arye cross and courteney cox star as a pair of...,0
3,"in double jeopardy , the stakes are high . thi...",0
4,"8mm , written by seven scribe andrew kevin wal...",0


In [3]:
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove stopwords
    words = text.split()
    words = [word for word in words if word not in stop_words]
    return ' '.join(words)

df['clean_review'] = df['review'].apply(preprocess_text)


In [4]:
X = df['clean_review']
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)


In [5]:
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [6]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)


In [7]:
y_pred = model.predict(X_test_tfidf)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.82

Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.81      0.82       200
           1       0.82      0.82      0.82       200

    accuracy                           0.82       400
   macro avg       0.82      0.82      0.82       400
weighted avg       0.82      0.82      0.82       400



In [8]:
def predict_sentiment(texts):
    texts_clean = [preprocess_text(text) for text in texts]
    features = vectorizer.transform(texts_clean)
    preds = model.predict(features)
    return ['positive' if p == 1 else 'negative' for p in preds]

samples = [
    "This movie was fantastic! I really enjoyed it.",
    "The plot was dull and boring, I did not like it."
]

print(predict_sentiment(samples))


['positive', 'negative']
