In [1]:
import nltk
from nltk.corpus import movie_reviews
from gensim.models import FastText
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Fetch the corpus if it is not already present locally.
nltk.download('movie_reviews')

# One (token list, category) pair per review file, grouped by category.
documents = [
    (list(movie_reviews.words(fileid)), category)
    for category in movie_reviews.categories()
    for fileid in movie_reviews.fileids(category)
]

# Plain-text form of every review and its binary label (1 = 'pos', 0 = anything else).
texts = [" ".join(doc_words) for doc_words, _ in documents]
labels = [int(doc_label == 'pos') for _, doc_label in documents]

# Lower-cased, whitespace-split tokens used for embedding training and feature extraction.
tokenized_texts = [joined.lower().split() for joined in texts]

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\Bluepal\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


In [3]:
# Train skip-gram FastText embeddings on the tokenised reviews.
#
# NOTE(review): the embeddings are fitted on every review, including those that
# later land in the test split (the split happens after feature extraction).
# This is unsupervised, but confirm that it is acceptable for the evaluation.
ft_model = FastText(
    sentences=tokenized_texts,
    vector_size=100,  # dimensionality of each word vector
    window=5,         # context words considered on each side of the target
    min_count=2,      # drop words seen fewer than 2 times
    sg=1,             # 1 = skip-gram, 0 = CBOW
    epochs=10,
    seed=42,          # explicit seed, matching the random_state used for the split
    workers=1,        # single worker thread: multi-threaded training is not
                      # deterministic; slower, but repeatable across runs
                      # (across interpreter launches PYTHONHASHSEED may also
                      # need to be fixed -- see the gensim docs)
)

In [4]:
def get_sentence_vector(tokens, model):
    """Return the element-wise mean of the word vectors for ``tokens``.

    Parameters
    ----------
    tokens : iterable of str
        Tokens to embed. A token contributes only if ``token in model.wv``.
    model : object
        Must expose ``wv`` (supporting ``in`` and ``[]`` lookups by token)
        and ``vector_size``.

    Returns
    -------
    numpy.ndarray
        Mean of the looked-up vectors along axis 0, or a zero vector of
        length ``model.vector_size`` when no token could be looked up.
    """
    found = []
    for token in tokens:
        if token in model.wv:
            found.append(model.wv[token])

    # Nothing to average: fall back to an all-zero vector.
    if not found:
        return np.zeros(model.vector_size)
    return np.mean(found, axis=0)

# Feature matrix: one averaged FastText vector per review (rows follow tokenized_texts).
X = np.array([
    get_sentence_vector(doc_tokens, ft_model)
    for doc_tokens in tokenized_texts
])
# Target vector built from the binary labels list.
y = np.asarray(labels)

In [5]:
# Hold out 20% of the samples for evaluation; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Fit a logistic-regression classifier on the averaged embeddings
# (fit() returns the estimator itself, so construction and training can be chained).
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Score the held-out samples and report the metrics.
y_pred = clf.predict(X_test)
print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

✅ Accuracy: 0.7125

Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.73      0.72       199
           1       0.72      0.69      0.71       201

    accuracy                           0.71       400
   macro avg       0.71      0.71      0.71       400
weighted avg       0.71      0.71      0.71       400



In [6]:
def predict_sentiment(text):
    """Print the predicted sentiment of a raw text string.

    The text is lower-cased, tokenised, averaged into a single FastText
    vector with ``get_sentence_vector`` and classified with ``clf``.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    None
        The result is printed, not returned.
    """
    # The training tokens come from movie_reviews.words(), where punctuation
    # is split off into separate tokens (NLTK's plaintext corpus readers use
    # WordPunctTokenizer by default). A bare str.split() would leave
    # punctuation glued to words ("movie!", "okay,"), so inference would see
    # tokens the training data never contained. Use the same tokenizer here.
    tokens = nltk.tokenize.wordpunct_tokenize(text.lower())
    # The classifier expects a 2-D array of shape (n_samples, n_features).
    vec = get_sentence_vector(tokens, ft_model).reshape(1, -1)
    pred = clf.predict(vec)[0]
    sentiment = "Positive 😀" if pred == 1 else "Negative 😞"
    print(f"Text: {text}\nPredicted Sentiment: {sentiment}\n")

# Try the classifier on a few hand-written examples.
for sample_text in (
    "I loved the movie! It was brilliant and emotional.",
    "This was the worst film ever made.",
    "It was okay, not too bad.",
):
    predict_sentiment(sample_text)

Text: I loved the movie! It was brilliant and emotional.
Predicted Sentiment: Positive 😀

Text: This was the worst film ever made.
Predicted Sentiment: Negative 😞

Text: It was okay, not too bad.
Predicted Sentiment: Negative 😞

