In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

# Download necessary NLTK resources (run once)
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize lemmatizer and stop words
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Function to preprocess messages
def preprocess_message(text):
    # Convert the message to lowercase
    text = text.lower()
    
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Remove stopwords and apply lemmatization
    words = text.split()
    cleaned_message = ' '.join([lemmatizer.lemmatize(word) for word in words if word not in stop_words])
    
    return cleaned_message

# Load and preprocess dataset (customize path based on your dataset)
df = pd.read_csv('spam1.csv')

# Keep only the necessary columns
df = df[['v1', 'v2']]
df.columns = ['label', 'message']

# Preprocess all messages
df['cleaned_message'] = df['message'].apply(preprocess_message)

# Convert labels: spam = 1, ham = 0
df['label'] = df['label'].map({'spam': 1, 'ham': 0})

# Use TF-IDF Vectorizer to transform text data into numerical features
vectorizer = TfidfVectorizer(max_features=3000)  # Adjust max features based on performance
X = vectorizer.fit_transform(df['cleaned_message']).toarray()  # Convert text to numerical data
y = df['label'].values  # Labels (target variable)

# Split dataset into training and test sets (use 80/20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Logistic Regression model with suitable parameters
logistic_model = LogisticRegression(solver='liblinear', penalty='l2', C=0.5)  # Adjust parameters for your use case
logistic_model.fit(X_train, y_train)

# Evaluate the model (optional, helps in checking performance)
y_pred = logistic_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:\n", classification_report(y_test, y_pred, target_names=['ham', 'spam']))

# Function to predict whether a message is spam or ham
def predict_spam_or_ham(model, input_message, vectorizer):
    # Preprocess the input message (apply the same preprocessing as before)
    processed_message = preprocess_message(input_message)
    
    # Transform the message using the trained TF-IDF vectorizer
    transformed_message = vectorizer.transform([processed_message])
    
    # Predict using the trained model
    prediction = model.predict(transformed_message)
    
    # Return the result (1 = spam, 0 = ham)
    return 'Spam' if prediction == 1 else 'Ham'

# Allow the user to input a message for classification
user_message = input("Enter a message to check if it's spam or ham: ")
result = predict_spam_or_ham(logistic_model, user_message, vectorizer)
print(f"The message is: {result}")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\janha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\janha\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Model Accuracy: 93.54%
Classification Report:
               precision    recall  f1-score   support

         ham       0.93      1.00      0.96       965
        spam       0.96      0.54      0.69       150

    accuracy                           0.94      1115
   macro avg       0.95      0.77      0.83      1115
weighted avg       0.94      0.94      0.93      1115

Enter a message to check if it's spam or ham: Happy birthday!!
The message is: Ham
