In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import xgboost as xgb
import joblib
from sklearn.model_selection import train_test_split

# Fetch the NLTK resources used further down: the Punkt tokenizer models
# (needed by word_tokenize) and the stop-word corpus.
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

# Toy review corpus kept as (text, label) pairs so each review sits next to
# its label. Judging by the wording, label 1 marks the negative reviews and
# label 0 the positive ones.
reviews = [
    ("This product is simply awful. It stopped working after a week.", 1),
    ("Excellent quality for a reasonable price. I am very satisfied with the purchase.", 0),
    ("Terrible customer service. They never respond to emails.", 1),
    ("The delivery was fast and the product exceeded my expectations.", 0),
]

# Unzip the pairs into the column-oriented mapping the DataFrame is built from.
texts, labels = zip(*reviews)
data = {'text': list(texts), 'label': list(labels)}

df = pd.DataFrame(data)

# English stop words held in a set for fast per-token membership tests.
stop_words = set(stopwords.words('english'))


def preprocess_text(text):
    """Return *text* lowercased and reduced to its informative words.

    The text is lowercased and tokenized with ``word_tokenize``; tokens that
    are not purely alphabetic (punctuation, numbers, mixed tokens) or that
    appear in ``stop_words`` are dropped. The remaining tokens are joined
    with single spaces.
    """
    lowered = text.lower()
    kept = (
        token
        for token in word_tokenize(lowered)
        if token.isalpha() and token not in stop_words
    )
    return ' '.join(kept)

# Clean every review, then turn the cleaned text into TF-IDF features.
df['processed_text'] = df['text'].apply(preprocess_text)

vectorizer = TfidfVectorizer(max_features=1000)
X_tfidf = vectorizer.fit_transform(df['processed_text'])
y = df['label']

import math
from collections import Counter

class_counts = Counter(y)
print("Original class distribution:", class_counts)

# Do not collapse the labels into a single class here: `y` is the same column
# as df['label'], so overwriting label 1 with 0 leaves only one class and no
# binary classifier can be trained on it.
n_samples = len(y)
n_classes = len(class_counts)
if n_classes < 2:
    raise ValueError(
        f"Need at least 2 classes to train a classifier, got {n_classes}: "
        f"{dict(class_counts)}"
    )

# A stratified split needs at least one sample per class on each side, so the
# 20% hold-out is enlarged when the data set is too small for that.
n_test = max(n_classes, math.ceil(0.2 * n_samples))
if n_samples - n_test < n_classes:
    raise ValueError(
        f"Need at least {2 * n_classes} samples for a stratified split over "
        f"{n_classes} classes, got {n_samples}"
    )

X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, y, test_size=n_test, random_state=42, stratify=y
)

# Re-running the split with the same arguments and random_state always yields
# the same partition, so retrying cannot help; fail loudly instead of looping.
if len(set(y_train)) < 2:
    raise ValueError(
        "Training split contains a single class; add more labelled samples "
        "for the under-represented class"
    )

# --- Logistic regression baseline ---
model_lr = LogisticRegression()
model_lr.fit(X_train, y_train)

y_pred_lr = model_lr.predict(X_test)
print("Classification Report (Logistic Regression with TF-IDF):\n", classification_report(y_test, y_pred_lr))

# NOTE: only the classifier is persisted. Scoring new text also requires the
# fitted `vectorizer`, which is not saved here.
joblib.dump(model_lr, 'sentiment_analysis_model_lr.pkl')

# --- XGBoost on the same split ---
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss'
}

model_xgb = xgb.train(params, dtrain)
# binary:logistic yields probabilities; threshold them at 0.5 to get labels.
y_pred_xgb = model_xgb.predict(dtest)
y_pred_xgb = [1 if pred > 0.5 else 0 for pred in y_pred_xgb]

print("\nClassification Report (XGBoost with TF-IDF):\n", classification_report(y_test, y_pred_xgb))

model_xgb.save_model('sentiment_analysis_model_xgb.model')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Original class distribution: Counter({1: 2, 0: 2})
Merged class distribution: Counter({0: 4})
