<a href="https://colab.research.google.com/github/HijabNaqvi/MachineLearningProjects/blob/main/SpamMessageClassifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# import required libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer  # Term Frequency-Inverse Document Frequency vectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix


In [None]:
# Load the data. The file is read as latin-1 rather than pandas' default utf-8.
raw_data = pd.read_csv('spam.csv', encoding='latin-1')

# Keep only the necessary columns and give them descriptive names.
# .copy() makes `data` an independent frame, so the column assignment below
# cannot trigger a SettingWithCopyWarning on a slice of `raw_data`.
data = raw_data[['v1', 'v2']].rename(columns={'v1': 'label', 'v2': 'text'}).copy()

# Convert the label to binary: 'spam' -> 1, 'ham' -> 0
data['label'] = data['label'].map({'spam': 1, 'ham': 0})

# Series.map() turns every value missing from the dict into NaN. Fail fast here
# instead of letting NaN labels surface later as an obscure model-fitting error.
unmapped_rows = data['label'].isna()
if unmapped_rows.any():
    unexpected = raw_data.loc[unmapped_rows, 'v1'].unique().tolist()
    raise ValueError(f"Unexpected label value(s) in column 'v1': {unexpected}")

# Split the data into features (X) and label (y)
X = data['text']
y = data['label']


In [None]:
# Split the raw messages BEFORE vectorizing, so the TF-IDF vocabulary and IDF
# weights are learned from the training messages only. Fitting the vectorizer on
# the full corpus would leak test-set statistics into the features and can bias
# the evaluation. Splitting the raw text also means X is never overwritten, so
# this cell can be re-run without first re-running the loading cell.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Convert text data to feature vectors: English stop words are removed and the
# vocabulary is capped at 3000 terms.
tfidf = TfidfVectorizer(stop_words='english', max_features=3000)

# Fit on the training text only, then reuse the fitted vocabulary/IDF weights
# to transform the test text.
X_train = tfidf.fit_transform(X_train_text)
X_test = tfidf.transform(X_test_text)


In [None]:
# Instantiate a logistic-regression classifier with scikit-learn's default settings
log_reg = LogisticRegression()

# Fit the classifier on the training features and their labels
log_reg.fit(X=X_train, y=y_train)


In [None]:
# Run the trained classifier on the held-out test features
y_pred = log_reg.predict(X_test)

# Compute both metrics up front, then report them
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Accuracy as a percentage with two decimals
print(f'Accuracy: {accuracy * 100:.2f}%')

# Confusion matrix as returned by confusion_matrix(y_test, y_pred), printed under its heading
print('Confusion Matrix:', conf_matrix, sep='\n')


Accuracy: 95.10%
Confusion Matrix:
[[1450    3]
 [  79  140]]


In [None]:
# Function to predict whether a message is spam or not
def predict_spam(message):
    """Classify a single message and print the verdict.

    The message is vectorized with the notebook-level ``tfidf`` vectorizer and
    passed to the notebook-level ``log_reg`` classifier.

    Args:
        message: The text to classify, as a single string.

    Returns:
        None. Prints "The message is spam." when the predicted label is 1,
        otherwise prints "The message is not spam."
    """
    # transform() is given a one-element list holding the message
    features = tfidf.transform([message])

    # Exactly one message went in, so take the first (only) predicted label
    predicted_label = log_reg.predict(features)[0]

    verdict = "The message is spam." if predicted_label == 1 else "The message is not spam."
    print(verdict)

# Ask for a message interactively, then classify it with predict_spam()
prompt = "Enter a message to check if it's spam or not: "
user_message = input(prompt)
predict_spam(user_message)


Enter a message to check if it's spam or not: How do you say “on a roll” in Arabic? Way to commit to learning Arabic! Make it to a 4 day streak with a quick lesson now.
The message is not spam.
