In [None]:
from google.colab import drive
drive.mount('/content/drive')

!pip install scikit-learn fpdf -q

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for fpdf (setup.py) ... [?25l[?25hdone


In [None]:
import os
import pandas as pd

# Input CSV and the folder that receives every artifact written by later cells.
output_folder = '/content/drive/MyDrive/spam sms/'
data_path = '/content/drive/MyDrive/spam sms/spam.csv'

# The file is decoded as latin1; report the raw shape before any column is dropped.
df = pd.read_csv(data_path, encoding='latin1')
print("Loaded data shape:", df.shape)

# Keep only the two columns used downstream and give them descriptive names
# in a single chained step.
df = df[['v1', 'v2']].rename(columns={'v1': 'label', 'v2': 'message'})
print(df.head())
print("\nLabel distribution:\n", df['label'].value_counts())


Loaded data shape: (5572, 5)
  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...

Label distribution:
 label
ham     4825
spam     747
Name: count, dtype: int64


In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Learn the label -> integer mapping, then store the encoded labels as a new column.
# The fitted encoder is kept in `le` so the mapping remains available afterwards.
le = LabelEncoder().fit(df['label'])
df['label_num'] = le.transform(df['label'])

# Hold out 20% of the messages for testing. Stratifying on the encoded label
# keeps the class ratio the same in both parts; the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['message'],
    df['label_num'],
    test_size=0.2,
    stratify=df['label_num'],
    random_state=42,
)

print(f"Train size: {len(X_train)}, Test size: {len(X_test)}")


Train size: 4457, Test size: 1115


In [None]:
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the vectorizer on the training messages only (at most 5000 features)
# and produce the training matrix in the same pass.
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)

# Persist the fitted vectorizer alongside the other artifacts.
joblib.dump(tfidf, os.path.join(output_folder, 'tfidf_vectorizer.pkl'))

# The test messages are only transformed, never used for fitting.
X_test_tfidf = tfidf.transform(X_test)

print(" TF‑IDF done. Shape:", X_train_tfidf.shape)


 TF‑IDF done. Shape: (4457, 5000)


In [None]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial Naive Bayes classifier on the TF-IDF training matrix;
# fit() returns the estimator itself, so construction and training are chained.
nb = MultinomialNB().fit(X_train_tfidf, y_train)

# Save the trained model to the output folder.
joblib.dump(
    nb,
    os.path.join(output_folder, 'naive_bayes_model.pkl'),
)
print("Naive Bayes trained & saved.")


Naive Bayes trained & saved.


In [None]:
from sklearn.linear_model import LogisticRegression

# Train logistic regression on the TF-IDF training matrix with the solver's
# iteration cap raised to 1000; fit() returns the estimator, so the two
# steps are chained.
lr = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)

# Save the trained model to the output folder.
joblib.dump(
    lr,
    os.path.join(output_folder, 'logistic_regression_model.pkl'),
)
print(" Logistic Regression trained & saved.")


 Logistic Regression trained & saved.


In [None]:
from sklearn.svm import LinearSVC

# Train a linear support vector classifier on the TF-IDF training matrix.
# The seed is fixed (same value as the train/test split) so that repeated
# runs of the notebook produce the same fitted model.
svm = LinearSVC(random_state=42)
svm.fit(X_train_tfidf, y_train)

# Save the trained model to the output folder.
joblib.dump(svm, os.path.join(output_folder, 'svm_model.pkl'))
print("SVM trained & saved.")


SVM trained & saved.


In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Evaluate each trained model on the held-out test matrix and collect its
# accuracy under a short display name.
results = {}
for name, model in {'NaiveBayes': nb, 'LogReg': lr, 'SVM': svm}.items():
    y_pred = model.predict(X_test_tfidf)
    acc = accuracy_score(y_test, y_pred)
    results[name] = acc
    print(f"\n{name} Accuracy: {acc:.4f}")
    # Take the class names from the fitted LabelEncoder instead of repeating
    # them by hand, so the report rows always match the integer encoding.
    print(classification_report(y_test, y_pred, target_names=list(le.classes_)))

# Write one "name: accuracy" line per model; the encoding is stated explicitly
# so the file does not depend on the platform default.
with open(os.path.join(output_folder, 'model_accuracies.txt'), 'w', encoding='utf-8') as f:
    for name, acc in results.items():
        f.write(f"{name}: {acc:.4f}\n")
print(" Accuracies saved.")



NaiveBayes Accuracy: 0.9641
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98       966
        spam       1.00      0.73      0.84       149

    accuracy                           0.96      1115
   macro avg       0.98      0.87      0.91      1115
weighted avg       0.97      0.96      0.96      1115


LogReg Accuracy: 0.9749
              precision    recall  f1-score   support

         ham       0.97      1.00      0.99       966
        spam       0.99      0.82      0.90       149

    accuracy                           0.97      1115
   macro avg       0.98      0.91      0.94      1115
weighted avg       0.98      0.97      0.97      1115


SVM Accuracy: 0.9865
              precision    recall  f1-score   support

         ham       0.99      1.00      0.99       966
        spam       0.99      0.91      0.95       149

    accuracy                           0.99      1115
   macro avg       0.99      0.95      0.97      111