<a href="https://colab.research.google.com/github/HillaryDrugs/li7/blob/main/Logistic_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# -----------------------------
# Config
# -----------------------------
# Input file
CSV_PATH = "spam.csv"

# TF-IDF settings (passed to TfidfVectorizer)
MAX_FEATURES = 10000
NGRAM_RANGE = (1, 2)
MIN_DF = 2

# Optimisation settings
BATCH_SIZE = 256
EPOCHS = 15
LR = 1e-3
WEIGHT_DECAY = 1e-4

# Reproducibility: one seed shared by PyTorch and the NumPy generator
SEED = 42
torch.manual_seed(SEED)
rng = np.random.default_rng(SEED)

# -----------------------------
# Load & prep data
# -----------------------------
# Malformed rows are skipped rather than aborting the whole load.
df = pd.read_csv(CSV_PATH, encoding="latin-1", engine="python", on_bad_lines="skip")
if "v1" not in df.columns or "v2" not in df.columns:
    raise ValueError("Expected columns 'v1' (label) and 'v2' (text) not found.")

# Keep only the label/text pair and drop rows missing either one.
df = df.rename(columns={"v1": "label", "v2": "text"})[["label", "text"]].dropna()

# Normalise labels to 0 (ham) / 1 (spam). Any other value maps to NaN, which
# cannot be cast to int, so such rows are removed explicitly before the cast.
labels = df["label"].astype(str).str.strip().str.lower().map({"ham": 0, "spam": 1})
known = labels.notna()
num_unknown = int((~known).sum())
if num_unknown:
    print(f"Dropped {num_unknown} row(s) with labels other than 'ham'/'spam'.")
if not known.any():
    raise ValueError("No rows labelled 'ham' or 'spam' were found.")
df = df.loc[known].assign(label=labels[known].astype(int))

X_text = df["text"].astype(str).values
y_all  = df["label"].values.astype(np.float32)

# Stratified split keeps the ham/spam ratio the same in train and test.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X_text, y_all, test_size=0.2, random_state=SEED, stratify=y_all
)

# Vectorize (fit on train only, so no test-set vocabulary or IDF statistics leak in)
vectorizer = TfidfVectorizer(max_features=MAX_FEATURES, ngram_range=NGRAM_RANGE, min_df=MIN_DF)
X_train = vectorizer.fit_transform(X_train_text)
X_test  = vectorizer.transform(X_test_text)

# Tensors
# Cast to float32 while the matrices are still sparse, then densify once and
# wrap the array without copying. The values are the same as densifying first
# and casting afterwards, but no dense float64 intermediate is allocated.
X_train = torch.from_numpy(X_train.astype(np.float32).toarray())
X_test  = torch.from_numpy(X_test.astype(np.float32).toarray())
y_train = torch.tensor(y_train, dtype=torch.float32)
y_test  = torch.tensor(y_test,  dtype=torch.float32)

# -----------------------------
# Model
# -----------------------------
class LogisticRegression(nn.Module):
    """Binary logistic regression: a single linear layer that outputs raw logits."""

    def __init__(self, input_dim):
        """Create the ``input_dim -> 1`` linear layer."""
        super(LogisticRegression, self).__init__()
        self.linear = nn.Linear(in_features=input_dim, out_features=1)

    def forward(self, x):
        """Return one logit per row of ``x``; no sigmoid is applied here."""
        scores = self.linear(x)
        # Remove only the trailing size-1 dimension so a batch of one keeps its batch axis.
        return scores.squeeze(dim=-1)

# Size the model from the feature dimension of the training matrix
input_dim = X_train.size(1)
model = LogisticRegression(input_dim)

# Class imbalance weight: pos_weight = N_neg / N_pos
num_pos = float(y_train.eq(1).sum())
num_neg = float(y_train.eq(0).sum())
# The denominator is floored at 1 so a split with no positives cannot divide by zero.
imbalance_ratio = num_neg / max(num_pos, 1.0)
pos_weight = torch.tensor(imbalance_ratio)
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

optimizer = torch.optim.Adam(params=model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)

# Mini-batches are reshuffled every epoch
train_ds = TensorDataset(X_train, y_train)
train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)

# -----------------------------
# Train
# -----------------------------
final_loss = None  # stays None when EPOCHS == 0
for epoch in range(1, EPOCHS + 1):
    model.train()
    running = 0.0
    for xb, yb in train_dl:
        optimizer.zero_grad()
        logits = model(xb)
        loss = criterion(logits, yb)
        loss.backward()
        optimizer.step()
        # loss is the batch mean; weight it by batch size so the epoch
        # average is per-sample even when the last batch is smaller.
        running += loss.item() * xb.size(0)
    final_loss = running / len(X_train)
    print(f"Epoch {epoch:02d}/{EPOCHS} - loss: {final_loss:.4f}")

# === Final Loss (last-epoch average training loss) ===
# Formatting None with ':.4f' raises TypeError, so handle the zero-epoch case.
if final_loss is None:
    print("\nFinal Loss: n/a (no epochs were run)")
else:
    print(f"\nFinal Loss: {final_loss:.4f}")

# -----------------------------
# Evaluate (accuracy)
# -----------------------------
model.eval()
with torch.no_grad():  # no gradients are needed for scoring the test set
    te_logits = model(X_test)
    te_probs = te_logits.sigmoid()
    # Probability >= 0.5 is predicted as class 1 (spam)
    te_preds = te_probs.ge(0.5).to(torch.float32)

acc = accuracy_score(y_true=y_test.numpy(), y_pred=te_preds.numpy())
print(f"Test Accuracy: {acc * 100:.2f}%\n")

# -----------------------------
# Formatted predictions on new texts
# -----------------------------
def predict_texts(model, vectorizer, texts):
    """Classify raw texts with the trained model.

    Args:
        model: ``torch.nn.Module`` that maps a float32 feature matrix to one
            logit per row.
        vectorizer: Fitted vectorizer whose ``transform`` returns a matrix
            exposing ``toarray()``.
        texts: Iterable of strings. A single ``str`` is treated as one
            document instead of being iterated character by character.

    Returns:
        Tuple ``(preds, probs)`` of NumPy arrays: ``probs`` holds the sigmoid
        of each logit and ``preds`` is 1 where ``probs >= 0.5``, else 0.

    Note:
        The model is switched to eval mode and left that way.
    """
    if isinstance(texts, str):
        texts = [texts]

    X_new = vectorizer.transform(texts).toarray()
    X_new = torch.tensor(X_new, dtype=torch.float32)

    # Run on whatever device the model lives on; the result is moved back to
    # the CPU below, so the input must make the matching trip the other way.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        X_new = X_new.to(first_param.device)

    model.eval()
    with torch.no_grad():
        probs = torch.sigmoid(model(X_new)).cpu().numpy()
    preds = (probs >= 0.5).astype(int)
    return preds, probs

# Sample messages to classify
new_texts = [
    "Congratulations! You won a prize",
    "Let's schedule a meeting next week",
    "Claim your free gift now",
    "Can you review the document?",
]

preds, probs = predict_texts(model, vectorizer, new_texts)
for idx, text in enumerate(new_texts):
    pred, prob = preds[idx], probs[idx]
    if pred == 1:
        label = "Spam"
    else:
        label = "Not Spam"
    print(f"Text: {text}")
    print(f"Prediction: {label} (probability: {prob:.4f})\n")