<a href="https://colab.research.google.com/github/ManasiA/MLproject/blob/main/Spamdetectorproject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install numpy pandas scikit-learn nltk



In [4]:
!wget https://raw.githubusercontent.com/mohitgupta-omg/Kaggle-SMS-Spam-Collection-Dataset-/master/spam.csv

--2025-07-31 05:50:51--  https://raw.githubusercontent.com/mohitgupta-omg/Kaggle-SMS-Spam-Collection-Dataset-/master/spam.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 503663 (492K) [application/octet-stream]
Saving to: ‘spam.csv’


2025-07-31 05:50:51 (10.0 MB/s) - ‘spam.csv’ saved [503663/503663]



In [5]:
import pandas as pd

# Read the raw CSV, keep only the label column (v1) and the message column (v2),
# and give both descriptive names.
data = (
    pd.read_csv('spam.csv', encoding='latin-1')
    .loc[:, ['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'text'})
)

In [6]:
# Preview the first five rows to confirm the columns loaded as expected
print(data.head(n=5))

  label                                               text
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [7]:
# Class balance: number of messages carrying each label
print("\nSpam vs Ham Counts:", data['label'].value_counts(), sep="\n")


Spam vs Ham Counts:
label
ham     4825
spam     747
Name: count, dtype: int64


In [8]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Fetch the NLTK stop-word corpus that stopwords.words('english') reads during
# preprocessing; the call's return value is shown as this cell's output.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [9]:
from functools import lru_cache


@lru_cache(maxsize=None)
def _text_tools():
    """Build the English stop-word set and the Porter stemmer once, then reuse them.

    Reading the stop-word corpus and constructing a stemmer are independent of
    the message being processed, so the pair is cached after the first call
    instead of being rebuilt for every row.

    Returns:
        tuple: ``(stop_words, stemmer)`` where ``stop_words`` is a frozenset of
        strings and ``stemmer`` is a ``PorterStemmer`` instance.
    """
    return frozenset(stopwords.words('english')), PorterStemmer()


def preprocess_text(text):
    """Normalise a raw message into a space-joined string of stemmed tokens.

    Steps, in order: lowercase the text, delete every character that is not a
    letter, digit or whitespace, split on whitespace, drop English stop words,
    and stem the remaining words.

    Args:
        text (str): The raw message.

    Returns:
        str: The cleaned tokens joined by single spaces ('' when nothing is left).
    """
    stop_words, stemmer = _text_tools()
    # NOTE(review): punctuation is stripped before the stop-word filter, so a
    # contraction like "don't" becomes "dont" and is kept (see the sample
    # output, "nah dont think ...") -- confirm this is intended.
    cleaned = re.sub(r'[^a-zA-Z0-9\s]', '', text.lower())
    return ' '.join(
        stemmer.stem(word) for word in cleaned.split() if word not in stop_words
    )

# Clean every message and store the result next to the raw text
data['processed_text'] = data['text'].map(preprocess_text)

# Show the first five rows so raw and cleaned text can be compared
print(data.head(n=5))

  label                                               text  \
0   ham  Go until jurong point, crazy.. Available only ...   
1   ham                      Ok lar... Joking wif u oni...   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...   
3   ham  U dun say so early hor... U c already then say...   
4   ham  Nah I don't think he goes to usf, he lives aro...   

                                      processed_text  
0  go jurong point crazi avail bugi n great world...  
1                              ok lar joke wif u oni  
2  free entri 2 wkli comp win fa cup final tkt 21...  
3                u dun say earli hor u c alreadi say  
4          nah dont think goe usf live around though  


In [12]:
from sklearn.model_selection import train_test_split

# Features are the cleaned messages; targets are the spam/ham labels
X, y = data['processed_text'], data['label']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer whose vocabulary is capped at 5000 terms
vectorizer = TfidfVectorizer(max_features=5000)

# Learn the vocabulary and IDF weights from the training messages only, then
# encode the held-out messages with that same fitted vectorizer.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [14]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix

# Fit a multinomial Naive Bayes classifier on the TF-IDF training matrix
model = MultinomialNB().fit(X_train_vec, y_train)

# Score the held-out messages
y_pred = model.predict(X_test_vec)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Confusion matrix: rows are true labels, columns are predicted labels
print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

Accuracy: 0.97

Confusion Matrix:
[[965   0]
 [ 33 117]]


In [15]:
def predict_spam(email_text):
    """Classify one raw message with the fitted vectorizer and model.

    The text goes through ``preprocess_text`` and the module-level
    ``vectorizer`` before being passed to the module-level ``model``.

    Args:
        email_text (str): The raw message to classify.

    Returns:
        tuple: ``(label, probabilities)`` -- the first element of
        ``model.predict`` and the first row of ``model.predict_proba``
        for this message.
    """
    features = vectorizer.transform([preprocess_text(email_text)])
    label = model.predict(features)[0]
    class_probabilities = model.predict_proba(features)[0]
    return label, class_probabilities

# Smoke-test the helper on an obviously promotional message
sample_email = "WIN A FREE IPHONE! CLICK NOW!"
prediction, prob = predict_spam(sample_email)
print(
    f"Prediction: {prediction}",
    f"Probabilities (ham, spam): {prob}",
    sep="\n",
)

Prediction: spam
Probabilities (ham, spam): [0.42332138 0.57667862]


In [16]:
import joblib

# Persist the fitted classifier and the fitted TF-IDF vectorizer to the working
# directory. Both files are needed to score new text: the vectorizer turns
# text into features and the model classifies those features.
joblib.dump(model, 'spam_model.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

# Later, you can load them with (joblib.load unpickles the file, so only load
# artifacts from a source you trust):
# model = joblib.load('spam_model.pkl')
# vectorizer = joblib.load('tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']