# Train a simple TF-IDF + Logistic Regression model for spam detection

This notebook loads `spam.csv`, balances the two classes by downsampling, does inline preprocessing (no function definitions), trains a Logistic Regression model on TF-IDF features, evaluates it on a held-out 20% split, saves the fitted artifacts, and runs several hardcoded message checks.

All steps are top-level code cells (no `def` or `if __name__` blocks) as requested.

In [1]:
# Section 1: Import libraries
# (This cell does not install anything; pandas, numpy, scikit-learn and joblib must already be available in your environment.)
import warnings
warnings.filterwarnings('ignore')

import os
import re
import pandas as pd
import numpy as np
import joblib

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, roc_auc_score

print('Libraries imported')

Libraries imported


In [2]:
# Section 2: Notebook configuration (seed, pandas display options)
# RANDOM_STATE is the single seed reused by later sampling and splitting steps.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Keep long message text readable while capping how many rows are rendered.
for option_name, option_value in (
    ('display.max_colwidth', 200),
    ('display.max_rows', 20),
):
    pd.set_option(option_name, option_value)

print('Random seed and display options set')

Random seed and display options set


In [3]:
# Section 3: Load dataset
# Reads spam.csv, keeps its first two columns as (label, text), drops rows with
# missing text, then balances ham and spam by downsampling the larger class.
csv_path = 'spam.csv'

if not os.path.exists(csv_path):
    raise FileNotFoundError(f"{csv_path} not found in notebook working directory: {os.getcwd()}")

# Many versions of this dataset have extra columns and are encoded latin-1
df = pd.read_csv(csv_path, encoding='latin-1', usecols=[0,1])
df.columns = ['label', 'text']

df = df.dropna(subset=['text']).copy()
print('Dataset shape:', df.shape)
print('\nLabel distribution:')
print(df['label'].value_counts())

# Balance dataset: only rows labelled exactly 'ham' or 'spam' are kept from here on
ham_df = df[df['label'] == 'ham']
spam_df = df[df['label'] == 'spam']

spam_count = len(spam_df)
if spam_count <= 0:
    raise ValueError('No spam samples found in dataset; cannot balance.')
if ham_df.empty:
    raise ValueError('No ham samples found in dataset; cannot balance.')
print(f"Original counts -> ham: {len(ham_df)}, spam: {len(spam_df)}")

# Downsample to the size of the smaller class using the notebook random seed.
# Sampling more rows than a frame holds (without replacement) raises in pandas,
# so the target is the minimum of the two counts rather than always spam_count.
target_count = min(len(ham_df), spam_count)
ham_down = ham_df.sample(n=target_count, random_state=RANDOM_STATE)
if spam_count > target_count:
    spam_down = spam_df.sample(n=target_count, random_state=RANDOM_STATE)
else:
    # spam is already the smaller (or equal) class: keep every spam row as-is
    spam_down = spam_df

# Concatenate, shuffle the rows with the same seed, and renumber the index
df = pd.concat([ham_down, spam_down], axis=0).sample(frac=1.0, random_state=RANDOM_STATE).reset_index(drop=True)
print('\nAfter balancing — label distribution:')
print(df['label'].value_counts())

df.head(5)

Dataset shape: (5572, 2)

Label distribution:
label
ham     4825
spam     747
Name: count, dtype: int64
Original counts -> ham: 4825, spam: 747

After balancing — label distribution:
label
spam    747
ham     747
Name: count, dtype: int64


Unnamed: 0,label,text
0,spam,"URGENT, IMPORTANT INFORMATION FOR O2 USER. TODAY IS YOUR LUCKY DAY! 2 FIND OUT WHY LOG ONTO HTTP://WWW.URAWINNER.COM THERE IS A FANTASTIC SURPRISE AWAITING FOR YOU"
1,spam,Panasonic & BluetoothHdset FREE. Nokia FREE. Motorola FREE & DoubleMins & DoubleTxt on Orange contract. Call MobileUpd8 on 08000839402 or call 2optout
2,spam,Do you want a new Video handset? 750 any time any network mins? UNLIMITED TEXT? Camcorder? Reply or Call now 08000930705 for del Sat AM
3,spam,Hi if ur lookin 4 saucy daytime fun wiv busty married woman Am free all next week Chat now 2 sort time 09099726429 JANINExx Callså£1/minMobsmoreLKPOBOX177HP51FL
4,spam,"09066362231 URGENT! Your mobile No 07xxxxxxxxx won a å£2,000 bonus caller prize on 02/06/03! this is the 2nd attempt to reach YOU! call 09066362231 ASAP!"


In [4]:
# Section 4: Inspect data
# Structural check of the balanced frame: column dtypes and per-column null counts.
column_dtypes = df.dtypes
missing_per_column = df.isnull().sum()

print('Dtypes:\n', column_dtypes)
print('\nAny missing values:\n', missing_per_column)

# Preview the first eight rows (long text stays visible thanks to max_colwidth)
df.head(8)

Dtypes:
 label    object
text     object
dtype: object

Any missing values:
 label    0
text     0
dtype: int64


Unnamed: 0,label,text
0,spam,"URGENT, IMPORTANT INFORMATION FOR O2 USER. TODAY IS YOUR LUCKY DAY! 2 FIND OUT WHY LOG ONTO HTTP://WWW.URAWINNER.COM THERE IS A FANTASTIC SURPRISE AWAITING FOR YOU"
1,spam,Panasonic & BluetoothHdset FREE. Nokia FREE. Motorola FREE & DoubleMins & DoubleTxt on Orange contract. Call MobileUpd8 on 08000839402 or call 2optout
2,spam,Do you want a new Video handset? 750 any time any network mins? UNLIMITED TEXT? Camcorder? Reply or Call now 08000930705 for del Sat AM
3,spam,Hi if ur lookin 4 saucy daytime fun wiv busty married woman Am free all next week Chat now 2 sort time 09099726429 JANINExx Callså£1/minMobsmoreLKPOBOX177HP51FL
4,spam,"09066362231 URGENT! Your mobile No 07xxxxxxxxx won a å£2,000 bonus caller prize on 02/06/03! this is the 2nd attempt to reach YOU! call 09066362231 ASAP!"
5,ham,I'm going out to buy mum's present ar.
6,ham,I thk 530 lor. But dunno can get tickets a not. Wat u doing now?
7,ham,Lol I would but my mom would have a fit and tell the whole family how crazy and terrible I am


In [5]:
# Section 5: Clean & preprocess (inline, no functions)
# Steps, in order: lowercase -> blank out URLs -> replace runs of characters other
# than a-z, 0-9 and space with a single space -> collapse whitespace -> trim.

# Each step is a pandas vectorized string operation over the whole column.
lowered = df['text'].astype(str).str.lower()
without_urls = lowered.str.replace(r"https?://\S+|www\.\S+", ' ', regex=True)
alnum_only = without_urls.str.replace(r"[^a-z0-9 ]+", ' ', regex=True)
cleaned = alnum_only.str.replace(r"\s+", ' ', regex=True).str.strip()

df['text_clean'] = cleaned

# Side-by-side preview of the first six raw and cleaned messages
df[['text', 'text_clean']].head(6).rename(columns={'text': 'original', 'text_clean': 'clean'})

Unnamed: 0,original,clean
0,"URGENT, IMPORTANT INFORMATION FOR O2 USER. TODAY IS YOUR LUCKY DAY! 2 FIND OUT WHY LOG ONTO HTTP://WWW.URAWINNER.COM THERE IS A FANTASTIC SURPRISE AWAITING FOR YOU",urgent important information for o2 user today is your lucky day 2 find out why log onto there is a fantastic surprise awaiting for you
1,Panasonic & BluetoothHdset FREE. Nokia FREE. Motorola FREE & DoubleMins & DoubleTxt on Orange contract. Call MobileUpd8 on 08000839402 or call 2optout,panasonic bluetoothhdset free nokia free motorola free doublemins doubletxt on orange contract call mobileupd8 on 08000839402 or call 2optout
2,Do you want a new Video handset? 750 any time any network mins? UNLIMITED TEXT? Camcorder? Reply or Call now 08000930705 for del Sat AM,do you want a new video handset 750 any time any network mins unlimited text camcorder reply or call now 08000930705 for del sat am
3,Hi if ur lookin 4 saucy daytime fun wiv busty married woman Am free all next week Chat now 2 sort time 09099726429 JANINExx Callså£1/minMobsmoreLKPOBOX177HP51FL,hi if ur lookin 4 saucy daytime fun wiv busty married woman am free all next week chat now 2 sort time 09099726429 janinexx calls 1 minmobsmorelkpobox177hp51fl
4,"09066362231 URGENT! Your mobile No 07xxxxxxxxx won a å£2,000 bonus caller prize on 02/06/03! this is the 2nd attempt to reach YOU! call 09066362231 ASAP!",09066362231 urgent your mobile no 07xxxxxxxxx won a 2 000 bonus caller prize on 02 06 03 this is the 2nd attempt to reach you call 09066362231 asap
5,I'm going out to buy mum's present ar.,i m going out to buy mum s present ar


In [6]:
# Section 6: Feature extraction and train/test split (inline)
# Encode the string labels as integers: ham -> 0, spam -> 1
label_map = {'ham': 0, 'spam': 1}
df['label_num'] = df['label'].map(label_map)

X, y = df['text_clean'], df['label_num']

# Hold out 20% of the rows; stratify so both splits keep the same class ratio
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=y,
)
train_shape, test_shape = X_train.shape, X_test.shape
print('Train shape:', train_shape, 'Test shape:', test_shape)

# TF-IDF vectorizer: vocabulary and IDF weights are learned from the training
# split only, then the fitted vectorizer is reused to transform the test split
vect = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_t = vect.fit_transform(X_train)
X_test_t = vect.transform(X_test)

train_matrix_shape, test_matrix_shape = X_train_t.shape, X_test_t.shape
print('TF-IDF fitted — feature matrix shapes:', train_matrix_shape, test_matrix_shape)

Train shape: (1195,) Test shape: (299,)
TF-IDF fitted — feature matrix shapes: (1195, 3760) (299, 3760)


In [7]:
# Section 7: Train Logistic Regression (inline)
# Fits on the TF-IDF training matrix from Section 6 and scores the held-out split.
# Note: the data was class-balanced before splitting (Section 3), so the metrics
# below describe a roughly 50/50 test set, not the raw ham/spam ratio of spam.csv.
model = LogisticRegression(max_iter=1000)
model.fit(X_train_t, y_train)

# Predictions and evaluation
preds = model.predict(X_test_t)
# Column 1 of predict_proba is the probability of label 1 (spam per label_map)
probs = model.predict_proba(X_test_t)[:, 1]

print('Accuracy:', accuracy_score(y_test, preds))
print('\nClassification report:\n', classification_report(y_test, preds, target_names=['ham', 'spam']))
print('\nConfusion matrix:\n', confusion_matrix(y_test, preds))
try:
    print('\nROC AUC:', roc_auc_score(y_test, probs))
except ValueError as exc:
    # ROC AUC is undefined for some inputs (e.g. a single class in y_test);
    # report the reason instead of silently omitting the metric.
    print('\nROC AUC could not be computed:', exc)

Accuracy: 0.9331103678929766

Classification report:
               precision    recall  f1-score   support

         ham       0.91      0.96      0.94       150
        spam       0.96      0.91      0.93       149

    accuracy                           0.93       299
   macro avg       0.93      0.93      0.93       299
weighted avg       0.93      0.93      0.93       299


Confusion matrix:
 [[144   6]
 [ 14 135]]

ROC AUC: 0.9817002237136465


In [8]:
# Section 8: Save trained artifacts
# Persist the fitted vectorizer and classifier to the working directory with joblib.
artifacts = {
    'tfidf_vectorizer.joblib': vect,
    'logreg_model.joblib': model,
}
for artifact_path, artifact in artifacts.items():
    joblib.dump(artifact, artifact_path)

print('Saved tfidf_vectorizer.joblib and logreg_model.joblib')

Saved tfidf_vectorizer.joblib and logreg_model.joblib


In [9]:
# Section 9: Hardcoded message checks (inline)
# Sanity-check the fitted vectorizer and model on a few hand-written messages.
hardcoded_messages = [
    "Congratulations! You have won a FREE ticket. Call now to claim.",
    "Hi love, are we still meeting for lunch today?",
    "URGENT! Your account has been suspended. Click the link to verify.",
    "I'll be late to class, please start without me.",
    "Win a brand new iPhone by texting WIN to 80085!"
]

# Apply the same cleaning steps as Section 5 (lowercase, strip URLs, keep only
# a-z/0-9/space, collapse whitespace) so inputs match what the vectorizer saw.
# `re` is imported in Section 1. Newlines need no special case: the
# non-alphanumeric pattern already turns them into spaces.
cleaned_messages = []
for m in hardcoded_messages:
    m_clean = m.lower()
    m_clean = re.sub(r"https?://\S+|www\.\S+", ' ', m_clean)
    m_clean = re.sub(r"[^a-z0-9 ]+", ' ', m_clean)
    m_clean = re.sub(r"\s+", ' ', m_clean).strip()
    cleaned_messages.append(m_clean)

# Vectorize and score all messages in one batch instead of two model calls each
message_features = vect.transform(cleaned_messages)
message_probs = model.predict_proba(message_features)

# Pick the most probable column per message, then translate the column position
# to its class label through model.classes_ rather than assuming that the label
# value doubles as the column index.
best_columns = message_probs.argmax(axis=1)

for m, row_probs, col in zip(hardcoded_messages, message_probs, best_columns):
    p = model.classes_[col]
    prob = row_probs[col]
    label = 'spam' if p == 1 else 'ham'
    print(f"Message: {m}\n  -> Prediction: {label} (confidence: {prob:.3f})\n")

Message: Congratulations! You have won a FREE ticket. Call now to claim.
  -> Prediction: spam (confidence: 0.845)

Message: Hi love, are we still meeting for lunch today?
  -> Prediction: ham (confidence: 0.829)

Message: URGENT! Your account has been suspended. Click the link to verify.
  -> Prediction: spam (confidence: 0.630)

Message: I'll be late to class, please start without me.
  -> Prediction: ham (confidence: 0.857)

Message: Win a brand new iPhone by texting WIN to 80085!
  -> Prediction: spam (confidence: 0.749)



## Notes

- This notebook performs the full flow inline (no functions).  
- Run cells top-to-bottom on a fresh kernel to reproduce results; `spam.csv` must be in the notebook's working directory.  
- If you want the same steps moved into a script or wrapped into functions, I can produce that separately.