<a href="https://colab.research.google.com/github/KAMAL0657/KAMAL-HUSSAIN/blob/main/Copy_of_KamalHusain_28_pp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# KamalHusain
# !pip install scikit-learn pandas matplotlib joblib


In [None]:
import random
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from scipy.sparse import hstack, csr_matrix
import joblib
import os

# Single seed reused by every stochastic step in this notebook.
RANDOM_STATE = 42

# Seed both global generators: the synthetic-email code draws from NumPy's
# legacy global RNG as well as from the standard-library `random` module.
np.random.seed(RANDOM_STATE)
random.seed(RANDOM_STATE)


In [None]:
def generate_email(is_spam, spam_vocab, ham_vocab, neutral_vocab):
    """Synthesize one fake email as a random bag of words.

    The number of body words is drawn uniformly from 5..80 (NumPy global RNG).
    Each word is then drawn (stdlib ``random``) from one of the three
    vocabularies, with class-dependent mixing weights:

    * spam: 45% spam words, 20% neutral words, 35% ham words
    * ham:  60% ham words, 25% neutral words, 15% spam words

    Finally the token ``'http://example.com'`` is appended with probability
    0.45 for spam and 0.05 for ham.

    Parameters
    ----------
    is_spam : truthy to generate a spam email, falsy for ham.
    spam_vocab, ham_vocab, neutral_vocab : non-empty sequences of words.

    Returns
    -------
    tuple ``(text, has_url, n_words)``: the space-joined words, a 0/1 flag
    telling whether the URL token was appended, and the total token count
    (including the URL token when present).
    """
    # Cumulative cut-offs on a uniform draw; anything past the last cut-off
    # falls through to `fallback_vocab`.
    if is_spam:
        tiers = ((0.45, spam_vocab), (0.65, neutral_vocab))
        fallback_vocab = ham_vocab
        url_probability = 0.45
    else:
        tiers = ((0.6, ham_vocab), (0.85, neutral_vocab))
        fallback_vocab = spam_vocab
        url_probability = 0.05

    n_body_words = np.random.randint(5, 81)  # upper bound is exclusive
    tokens = []
    for _ in range(n_body_words):
        draw = random.random()
        source = next((vocab for cutoff, vocab in tiers if draw < cutoff), fallback_vocab)
        tokens.append(random.choice(source))

    url_added = random.random() < url_probability
    if url_added:
        tokens.append('http://example.com')

    return ' '.join(tokens), int(url_added), len(tokens)

# Word pools for the synthetic emails. 'please' and 'regards' appear in both
# the ham and the neutral pool.
spam_vocab = [
    'free', 'winner', 'credit', 'loan', 'offer', 'click', 'buy', 'cheap',
    'limited', 'deal', 'congratulations', 'urgent', 'trial', 'guarantee',
    'money', 'prize',
]
ham_vocab = [
    'meeting', 'schedule', 'project', 'report', 'family', 'dinner', 'school',
    'assignment', 'invoice', 'regards', 'thanks', 'please', 'update', 'team',
    'tomorrow',
]
neutral_vocab = [
    'today', 'please', 'information', 'attached', 'see', 'note', 'hello',
    'regards', 'email', 'sent',
]

# Class sizes: a 1:9 spam-to-ham imbalance.
n_spam = 500
n_ham = 4500

texts, urls, lengths, labels = [], [], [], []

# Spam rows (label 1) are generated first, then ham rows (label 0), so the
# random draws are consumed in a fixed order.
for class_label, n_rows in ((1, n_spam), (0, n_ham)):
    for _ in range(n_rows):
        body, url_flag, word_count = generate_email(
            class_label == 1, spam_vocab, ham_vocab, neutral_vocab
        )
        texts.append(body)
        urls.append(url_flag)
        lengths.append(word_count)
        labels.append(class_label)

df = pd.DataFrame({'text': texts, 'has_url': urls, 'length': lengths, 'label': labels})
print('Dataset created:', df.shape)
print(df['label'].value_counts())


Dataset created: (5000, 4)
label
0    4500
1     500
Name: count, dtype: int64


In [None]:
Y = df['label'].values

# Fit the preprocessors on training rows only. Fitting TF-IDF and the length
# scaler on all rows would let test-set statistics (IDF weights, mean/std)
# leak into the features the models are evaluated on.
# The split below must mirror the train_test_split call applied to (X, Y) in
# the next cell: with the same number of rows, test_size, stratify labels and
# random_state it selects the same training rows.
train_idx, _ = train_test_split(
    np.arange(len(df)), test_size=0.2, stratify=Y, random_state=RANDOM_STATE
)

# Text features: TF-IDF vocabulary and IDF weights learned from training rows,
# then applied to every row.
vectorizer = TfidfVectorizer(max_features=3000, stop_words='english')
vectorizer.fit(df['text'].iloc[train_idx])
X_text = vectorizer.transform(df['text'])

# Length feature: standardized with the training rows' mean and std.
X_len = df[['length']].values.astype(float)
scaler_len = StandardScaler()
scaler_len.fit(X_len[train_idx])
X_len_scaled = scaler_len.transform(X_len)

# URL flag is already 0/1; only cast to float.
X_url = df[['has_url']].astype(float).values

# Column layout: [tf-idf terms..., scaled length, has_url].
X = hstack([X_text, csr_matrix(X_len_scaled), csr_matrix(X_url)])

print('Feature matrix shape:', X.shape)


Feature matrix shape: (5000, 42)


In [None]:
# Hold out 20% of the rows for testing; stratifying on Y keeps the spam/ham
# ratio the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=RANDOM_STATE,
)
print('Train:', X_train.shape, 'Test:', X_test.shape)


Train: (4000, 42) Test: (1000, 42)


In [None]:
# Metrics of every evaluated model, keyed by display name; filled in by
# evaluate_model and read later to pick the best model.
results = {}

def _positive_class_scores(name, model, X_te):
    """Return one ranking score per test row for the positive class, or None.

    Uses ``predict_proba`` when the model has it, otherwise falls back to
    ``decision_function``. Raw decision values are returned unscaled: ROC AUC
    depends only on how the scores rank the rows, so rescaling them is
    unnecessary.
    """
    if hasattr(model, 'predict_proba'):
        return model.predict_proba(X_te)[:, 1]
    if hasattr(model, 'decision_function'):
        try:
            return np.ravel(model.decision_function(X_te))
        except Exception as exc:
            # Best effort: ROC AUC is optional, but report why it was skipped
            # instead of hiding the failure.
            print(f"ROC AUC skipped for {name}: decision_function failed ({exc!r})")
            return None
    return None

def evaluate_model(name, model, X_tr, y_tr, X_te, y_te):
    """Fit ``model`` on the training split, then record and print test metrics.

    Parameters
    ----------
    name : str
        Display name; also the key under which the metrics are stored in the
        module-level ``results`` dict (an existing entry is overwritten).
    model : estimator with ``fit`` and ``predict``
        Fitted in place by this function. ``predict_proba`` or
        ``decision_function`` is used for ROC AUC when available.
    X_tr, y_tr : training features and binary labels.
    X_te, y_te : test features and binary labels.

    Returns
    -------
    None. Metrics go to ``results[name]`` and to stdout. Nothing is returned so
    that calling this as the last statement of a cell does not echo a value.
    """
    model.fit(X_tr, y_tr)
    y_pred = model.predict(X_te)
    y_score = _positive_class_scores(name, model, X_te)

    acc = accuracy_score(y_te, y_pred)
    # zero_division=0 yields the same 0.0 that the default produces when a
    # metric is undefined (e.g. no positive predictions), without the warning.
    prec = precision_score(y_te, y_pred, zero_division=0)
    rec = recall_score(y_te, y_pred, zero_division=0)
    f1 = f1_score(y_te, y_pred, zero_division=0)
    auc = roc_auc_score(y_te, y_score) if y_score is not None else None
    cm = confusion_matrix(y_te, y_pred)

    results[name] = {'model': model, 'accuracy': acc, 'precision': prec, 'recall': rec, 'f1': f1, 'roc_auc': auc, 'confusion_matrix': cm}

    print(f"=== {name} ===")
    print('Accuracy:', round(acc,4))
    print('Precision:', round(prec,4))
    print('Recall:', round(rec,4))
    print('F1:', round(f1,4))
    if auc is not None:
        print('ROC AUC:', round(auc,4))
    print('Confusion matrix:\n', cm)
    print()


In [None]:
# Untuned baselines. The SVM uses balanced class weights and has probability
# estimates enabled.
knn = KNeighborsClassifier(n_neighbors=5)
svm = SVC(kernel='linear', class_weight='balanced', probability=True, random_state=RANDOM_STATE)
dt = DecisionTreeClassifier(random_state=RANDOM_STATE, max_depth=10)

# Evaluated in insertion order: k-NN, SVM, decision tree.
baselines = {
    'k-NN (k=5)': knn,
    'SVM (linear)': svm,
    'Decision Tree (max_depth=10)': dt,
}
for baseline_name, baseline_model in baselines.items():
    evaluate_model(baseline_name, baseline_model, X_train, y_train, X_test, y_test)


=== k-NN (k=5) ===
Accuracy: 0.972
Precision: 0.9737
Recall: 0.74
F1: 0.8409
ROC AUC: 0.9646
Confusion matrix:
 [[898   2]
 [ 26  74]]

=== SVM (linear) ===
Accuracy: 0.968
Precision: 0.7656
Recall: 0.98
F1: 0.8596
ROC AUC: 0.9973
Confusion matrix:
 [[870  30]
 [  2  98]]

=== Decision Tree (max_depth=10) ===
Accuracy: 0.911
Precision: 0.5534
Recall: 0.57
F1: 0.5616
ROC AUC: 0.691
Confusion matrix:
 [[854  46]
 [ 43  57]]



In [None]:
# 3-fold stratified CV shared by both searches; candidates are ranked by F1.
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)

svm_grid = {'C':[0.1,1,10]}
# probability is left off during the search: F1 scoring only calls predict(),
# while probability=True adds an internal 5-fold calibration to every fit.
svm_for_grid = SVC(kernel='linear', class_weight='balanced', random_state=RANDOM_STATE)

gs_svm = GridSearchCV(svm_for_grid, svm_grid, scoring='f1', cv=skf, n_jobs=-1)
print('GridSearch SVM...')
gs_svm.fit(X_train, y_train)
print('Best SVM params:', gs_svm.best_params_)
# Rebuild the winning configuration with probability=True so the tuned model
# exposes predict_proba like the baseline SVM; evaluate_model fits it on the
# training split.
best_svm = SVC(
    kernel='linear',
    class_weight='balanced',
    probability=True,
    random_state=RANDOM_STATE,
    **gs_svm.best_params_,
)
evaluate_model('SVM (tuned)', best_svm, X_train, y_train, X_test, y_test)


dt_grid = {'max_depth':[5,10,20], 'min_samples_split':[2,5,10]}
dt_for_grid = DecisionTreeClassifier(random_state=RANDOM_STATE)
gs_dt = GridSearchCV(dt_for_grid, dt_grid, scoring='f1', cv=skf, n_jobs=-1)
print('GridSearch Decision Tree...')
gs_dt.fit(X_train, y_train)
print('Best DT params:', gs_dt.best_params_)
# best_estimator_ is already fitted on X_train; evaluate_model fits it again
# on the same data before scoring.
best_dt = gs_dt.best_estimator_
evaluate_model('Decision Tree (tuned)', best_dt, X_train, y_train, X_test, y_test)


GridSearch SVM...


In [None]:
# Pick the entry of `results` with the highest F1.
# NOTE(review): these F1 values come from the held-out test split, so the
# selection itself uses test data. Confirm that is acceptable for this exercise.
best_name = max(results, key=lambda model_name: results[model_name]['f1'])
print('Best by F1:', best_name)
best_model = results[best_name]['model']

# Persist the model together with the fitted preprocessors it needs at
# prediction time.
os.makedirs('models', exist_ok=True)
bundle = {
    'model': best_model,
    'vectorizer': vectorizer,
    'scaler_len': scaler_len,
}
joblib.dump(bundle, 'models/best_spam_model.joblib')
print('Saved model to models/best_spam_model.joblib')


In [None]:
def predict_email(text, has_url_flag=None):
    """Classify one raw email text with the selected best model.

    The feature row is built the same way as the training matrix: TF-IDF
    terms from the module-level ``vectorizer``, the whitespace token count
    scaled by ``scaler_len``, and a 0/1 URL flag.

    Parameters
    ----------
    text : str
        Raw email body.
    has_url_flag : bool or None, optional
        Whether the email contains a URL. When ``None`` (the default) the flag
        is inferred from ``text``. Pass an explicit value to override.

    Returns
    -------
    tuple ``(pred, prob)``: the predicted label from ``best_model.predict``,
    and the positive-class probability from ``predict_proba``, or ``None`` if
    the model has no ``predict_proba``.
    """
    X_t = vectorizer.transform([text])
    length_val = len(text.split())
    length_scaled = scaler_len.transform([[length_val]])

    if has_url_flag is None:
        # Training rows carry has_url=1 exactly when a URL token is in the
        # text, so an unspecified flag is derived from the text rather than
        # assumed to be 0.
        lowered = text.lower()
        url_val = 1 if ('http://' in lowered or 'https://' in lowered) else 0
    else:
        url_val = 1 if has_url_flag else 0

    # Same column order as the training matrix: [tf-idf, scaled length, has_url].
    X_comb = hstack([X_t, csr_matrix(length_scaled), csr_matrix([[url_val]])])
    pred = best_model.predict(X_comb)[0]
    prob = None
    if hasattr(best_model, 'predict_proba'):
        prob = best_model.predict_proba(X_comb)[0,1]
    return pred, prob

# Smoke test: a text built only from spam-vocabulary words plus a few
# out-of-vocabulary fillers, with the URL flag forced on.
example = 'congratulations you are a winner click buy limited offer'
example_prediction = predict_email(example, has_url_flag=True)
print('Example prediction (spam-like):', example_prediction)
