# SCAM VS HAM

In [6]:
import pandas as pd
import random
"""| Function               | What it does                             |
| ---------------------- | ---------------------------------------- |
| `random.choice(list)`  | Picks a random item from a list          |
| `random.randint(a, b)` | Returns a random integer between a and b |
| `random.random()`      | Returns a float between 0 and 1          |
| `random.shuffle(list)` | Shuffles the list randomly               |"""

scam_messages = [
    "Congratulations! You've won a $1000 Walmart gift card. Go to scam-link.com to claim.",
    "URGENT: Your bank account has been compromised. Visit scam-bank.com now!",
    "You've been selected for a free iPhone. Click scam-offer.net now!"
]

ham_messages = [
    "Hey, are we still on for lunch today?",
    "Don't forget to bring your notebook to class.",
    "Can you call me when you're free?"
]

In [8]:
# Generate dataset
data = []
for _ in range(50):
    data.append([random.choice(scam_messages), 'scam'])
    data.append([random.choice(ham_messages), 'ham'])

df = pd.DataFrame(data, columns=['message', 'label'])
df.to_csv("synthetic_sms_dataset.csv", index=False)
print(df.sample(5))

                                              message label
97              Hey, are we still on for lunch today?   ham
75      Don't forget to bring your notebook to class.   ham
14  You've been selected for a free iPhone. Click ...  scam
10  URGENT: Your bank account has been compromised...  scam
89      Don't forget to bring your notebook to class.   ham


# USING UNSUPERVISED LEARNING TECHNIQUE

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

In [12]:
# Load dataset (use your real or synthetic one)
df = pd.read_csv("synthetic_sms_dataset.csv")

In [15]:
# Drop the label for unsupervised learning
texts = df['message']

In [17]:
# Convert text to numerical features
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(texts)

In [19]:
# Apply KMeans (2 clusters for scam/ham)
model = KMeans(n_clusters=2, random_state=42)
df['cluster'] = model.fit_predict(X)

In [21]:
# Optional: compare with real labels (for evaluation only)
print("\nCluster assignment vs True label:")
print(pd.crosstab(df['cluster'], df['label']))


Cluster assignment vs True label:
label    ham  scam
cluster           
0         50    14
1          0    36


# SMS Scam vs Ham Classifier (Supervised)

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [4]:
# 1. Load dataset with labels
df = pd.read_csv("synthetic_sms_dataset.csv")  # use your labeled dataset

In [6]:
# 2. Split data
X = df['message']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# 3. Convert text to TF-IDF vectors
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [12]:
# 4. Train classifier
model = LogisticRegression()
model.fit(X_train_vec, y_train)

In [14]:
# 5. Predict and evaluate
y_pred = model.predict(X_test_vec)

In [16]:
# Metrics
print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print("\n🧾 Classification Report:\n", classification_report(y_test, y_pred))
print("\n📊 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

✅ Accuracy: 1.0

🧾 Classification Report:
               precision    recall  f1-score   support

         ham       1.00      1.00      1.00         8
        scam       1.00      1.00      1.00        12

    accuracy                           1.00        20
   macro avg       1.00      1.00      1.00        20
weighted avg       1.00      1.00      1.00        20


📊 Confusion Matrix:
 [[ 8  0]
 [ 0 12]]
