In [3]:
import pandas as pd

# Load Amazon review dataset from an online CSV (public link)
url = "https://raw.githubusercontent.com/dD2405/Twitter_Sentiment_Analysis/master/train.csv"
df = pd.read_csv(url)

# Clean dataset
df = df[['label', 'tweet']]
df = df.rename(columns={'label': 'sentiment', 'tweet': 'review'})

# Convert sentiment labels: 0 = Negative, 1 = Positive
df['sentiment'] = df['sentiment'].replace(4, 1)

df.head()


Unnamed: 0,sentiment,review
0,0,@user when a father is dysfunctional and is s...
1,0,@user @user thanks for #lyft credit i can't us...
2,0,bihday your majesty
3,0,#model i love u take with u all the time in ...
4,0,factsguide: society now #motivation


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(df['review'], df['sentiment'], test_size=0.2, random_state=42)

# Vectorization
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Train model
model = LogisticRegression()
model.fit(X_train_vec, y_train)

# Predict and evaluate
y_pred = model.predict(X_test_vec)

print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print("\n📊 Classification Report:\n", classification_report(y_test, y_pred))
print("\n🧾 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


✅ Accuracy: 0.9505709369623025

📊 Classification Report:
               precision    recall  f1-score   support

           0       0.95      1.00      0.97      5937
           1       0.93      0.33      0.49       456

    accuracy                           0.95      6393
   macro avg       0.94      0.67      0.73      6393
weighted avg       0.95      0.95      0.94      6393


🧾 Confusion Matrix:
 [[5925   12]
 [ 304  152]]
