In [1]:
# Step 1: Import Libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Step 2: Load Dataset
file_path = r"C:\Users\DELL\Downloads\twitter_sentiment_cleaned.csv"
df = pd.read_csv(file_path, encoding='latin-1')

# Step 3: Rename columns for clarity
df.rename(columns={'target': 'sentiment', 'text': 'tweet'}, inplace=True)

# Step 4: Drop missing values and take smaller sample
df = df.dropna(subset=['tweet', 'sentiment'])
df = df.sample(5000, random_state=42)   # 💡 use only 5000 rows for faster training

# Step 5: Split data
X = df['tweet']
y = df['sentiment']

# Step 6: Convert text to numeric features
vectorizer = CountVectorizer(stop_words='english', max_features=2000)
X_vectorized = vectorizer.fit_transform(X)

# Step 7: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.2, random_state=42)

# Step 8: Logistic Regression
lr = LogisticRegression(max_iter=200)
lr.fit(X_train, y_train)
lr_acc = lr.score(X_test, y_test)

# Step 9: Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)
rf_acc = rf.score(X_test, y_test)

# Step 10: Results
print(f"\n🔹 Logistic Regression Accuracy: {lr_acc:.3f}")
print(f"🔹 Random Forest Accuracy: {rf_acc:.3f}")

if lr_acc > rf_acc:
    print("\n✅ Logistic Regression performed better.")
elif rf_acc > lr_acc:
    print("\n✅ Random Forest performed better.")
else:
    print("\n⚖️ Both models performed equally well.")



🔹 Logistic Regression Accuracy: 0.699
🔹 Random Forest Accuracy: 0.679

✅ Logistic Regression performed better.
