In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


In [4]:
from google.colab import files
uploaded = files.upload()  # Choose both train.ft.txt.bz2 and test.ft.txt.bz2



Saving train.ft.txt.bz2 to train.ft.txt.bz2
Saving test.ft.txt.bz2 to test.ft.txt.bz2


In [5]:
import bz2

def read_fasttext_file(filepath, max_lines=None):
    reviews = []
    labels = []

    with bz2.open(filepath, 'rt', encoding='utf-8') as f:
        for i, line in enumerate(f):
            parts = line.strip().split(' ', 1)
            if len(parts) != 2:
                continue
            label, text = parts
            label = 1 if label == '__label__2' else 0  # 1 = Positive, 0 = Negative
            reviews.append(text)
            labels.append(label)

            if max_lines and i >= max_lines:
                break

    return reviews, labels

# Read a subset for faster testing
X_train_texts, y_train = read_fasttext_file("train.ft.txt.bz2", max_lines=10000)
X_test_texts, y_test = read_fasttext_file("test.ft.txt.bz2", max_lines=2000)


In [6]:
import re

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text

X_train_texts = [clean_text(t) for t in X_train_texts]
X_test_texts = [clean_text(t) for t in X_test_texts]


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(X_train_texts)
X_test = vectorizer.transform(X_test_texts)


In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

model = LogisticRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.8595702148925537
              precision    recall  f1-score   support

           0       0.85      0.86      0.85       955
           1       0.87      0.86      0.86      1046

    accuracy                           0.86      2001
   macro avg       0.86      0.86      0.86      2001
weighted avg       0.86      0.86      0.86      2001



In [9]:
sample = "This product is awesome and works perfectly!"
sample_cleaned = clean_text(sample)
sample_vec = vectorizer.transform([sample_cleaned])
prediction = model.predict(sample_vec)
print("Sentiment:", "Positive" if prediction[0] == 1 else "Negative")


Sentiment: Positive


In [10]:
import joblib

# Save the trained model
joblib.dump(model, "sentiment_model.pkl")

# Save the TF-IDF vectorizer
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")


['tfidf_vectorizer.pkl']

In [13]:
import json
import numpy as np

# Save TF-IDF vectorizer's vocabulary
tfidf_vocab = vectorizer.vocabulary_
# Convert numpy int64 values to standard Python integers
tfidf_vocab_serializable = {k: int(v) for k, v in tfidf_vocab.items()}
with open("tfidf_vocab.json", "w") as f:
    json.dump(tfidf_vocab_serializable, f)

In [14]:
from google.colab import files

files.download("sentiment_model.pkl")
files.download("tfidf_vectorizer.pkl")
files.download("tfidf_vocab.json")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [15]:
# Load model and vectorizer again
model = joblib.load("sentiment_model.pkl")
vectorizer = joblib.load("tfidf_vectorizer.pkl")


In [16]:
def predict_sentiment(text):
    text_clean = clean_text(text)
    vector = vectorizer.transform([text_clean])
    prediction = model.predict(vector)
    return "Positive" if prediction[0] == 1 else "Negative"

predict_sentiment("This product is not good at all")


'Negative'