In [1]:
!pip install nltk scikit-learn pandas matplotlib




In [2]:
import nltk
from nltk.corpus import movie_reviews

nltk.download('movie_reviews')

# Data load
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

print(f"Total Reviews: {len(documents)}")


[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


Total Reviews: 2000


In [3]:
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

def preprocess_text(words):
    words = [word.lower() for word in words if word.isalpha()]  # Remove punctuation
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]  # Lemmatization & Stopwords removal
    return " ".join(words)

# Apply preprocessing
documents = [(preprocess_text(words), category) for words, category in documents]


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Data split
X = [doc for doc, category in documents]
y = [category for doc, category in documents]

vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = vectorizer.fit_transform(X)


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Train Model
model = LogisticRegression()
model.fit(X_train, y_train)

# Accuracy Check
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")


Model Accuracy: 0.82


In [6]:
def predict_sentiment(review):
    review = preprocess_text(review.split())  # Cleaning
    review_tfidf = vectorizer.transform([review])  # Convert to TF-IDF
    prediction = model.predict(review_tfidf)[0]  # Model prediction
    return "Positive" if prediction == "pos" else "Negative"

# Example Input
user_review = "The movie was absolutely amazing with great storytelling!"
print(f"Predicted Sentiment: {predict_sentiment(user_review)}")


Predicted Sentiment: Positive
