# Sentiment Analysis — Starter Notebook

In [None]:
import os, numpy as np, pandas as pd, matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# Make sure the folder that later cells save figures into exists, then report
# where the notebook is running so relative paths are easy to debug.
os.makedirs(name="plots", exist_ok=True)
print(
    "Working directory:",
    os.getcwd(),
)

In [None]:
# Load data
# Read the CSV, keep only the two columns the notebook uses, and drop every
# row that has a missing value in either of them.
csv_path = "data/sentiment.csv"
df = pd.read_csv(csv_path)[["text", "label"]].dropna()
print("Shape:", df.shape)
# Last expression of the cell: the notebook renders the first rows as a preview.
df.head()

In [None]:
# Split & vectorize
# Cast both columns to str so the vectorizer and the classifiers see a single type.
X, y = df["text"].astype(str), df["label"].astype(str)

# Hold out 20% for validation; stratifying on y keeps the label proportions
# similar in both splits, and the fixed seed makes the split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# The vectorizer is fitted on the training text only; the validation text is
# merely transformed with that fitted vocabulary.
vectorizer = TfidfVectorizer(stop_words="english", max_features=2000)
X_train_vec = vectorizer.fit_transform(X_train)
X_valid_vec = vectorizer.transform(X_valid)

In [None]:
# Train models
# Fit each candidate on the vectorized training split, then score it on the
# validation split. Accuracy per model is collected in `metrics`.
models = {
    "NaiveBayes": MultinomialNB(),
    "LogReg": LogisticRegression(max_iter=2000),
}
metrics = {}
for name, m in models.items():
    # scikit-learn estimators return themselves from fit(), so the calls chain.
    preds = m.fit(X_train_vec, y_train).predict(X_valid_vec)
    acc = accuracy_score(y_valid, preds)
    metrics[name] = acc
    print(name, acc)
    print(classification_report(y_valid, preds))

# Select the model with the highest validation accuracy (first one wins a tie).
best_name = max(metrics, key=lambda model_name: metrics[model_name])
best_model = models[best_name]

In [None]:
# Confusion matrix
# Plot the validation confusion matrix of the selected model and save it to
# plots/confusion_matrix.png.
preds = best_model.predict(X_valid_vec)

# Index the matrix by every class that occurs in either the true or the
# predicted labels. confusion_matrix() silently ignores samples whose label is
# not in `labels`, so building the list from y_valid alone could drop
# predictions of a class that is absent from the validation split.
labels = sorted(set(y_valid) | set(preds))
cm = confusion_matrix(y_valid, preds, labels=labels)

plt.imshow(cm, cmap="Blues")
plt.title(f"Confusion — {best_name}")
# scikit-learn convention: rows are true labels, columns are predicted labels.
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.xticks(range(len(labels)), labels, rotation=45)
plt.yticks(range(len(labels)), labels)
plt.colorbar()

# Write the count into each cell; switch to white text on dark cells so the
# number stays readable against the "Blues" colormap.
half_max = cm.max() / 2
for row, col in np.ndindex(cm.shape):
    count = cm[row, col]
    plt.text(
        col,
        row,
        str(count),
        ha="center",
        va="center",
        color="white" if count > half_max else "black",
    )

plt.savefig("plots/confusion_matrix.png", dpi=150, bbox_inches="tight")
plt.show()

In [None]:
# Example predictions
# Run a few hand-written sentences through the fitted vectorizer and the
# selected model, and print each sentence next to its predicted label.
examples = ["I love this!", "This is terrible.", "Not bad, could be better."]

# Vectorize and predict once for the whole batch instead of once per sentence;
# each row is transformed independently, so the per-sentence result is the same.
example_preds = best_model.predict(vectorizer.transform(examples))
for t, label in zip(examples, example_preds):
    print(t, "->", label)