<a href="https://colab.research.google.com/github/Mr112004/INTERNSHIP----2/blob/main/TASK_4_EMAIL_SPAM_DETECTION.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# --- Load the dataset --------------------------------------------------------
try:
    df = pd.read_csv('spam.csv', encoding='latin-1')
except FileNotFoundError:
    print("Error: 'spam.csv' not found. Please make sure the CSV file is in the same directory as the script.")
    # Raise SystemExit directly instead of calling exit(): exit() is a helper
    # that the `site` module installs for interactive sessions, and calling it
    # without an argument ends the process with status 0 even though the run
    # failed. A non-zero status lets callers detect the failure.
    raise SystemExit(1)

# --- Clean up columns and encode labels --------------------------------------
# Discard the 'Unnamed: N' columns when present (errors='ignore' makes this a
# no-op if they are missing) and give the remaining columns readable names.
df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], inplace=True, errors='ignore')
df.rename(columns={'v1': 'Category', 'v2': 'Message'}, inplace=True)

# Encode the labels as integers: ham -> 0, spam -> 1. Series.map turns every
# value that is not a key of the mapping into NaN.
df['Category'] = df['Category'].map({'ham': 0, 'spam': 1})

# Fail early, with an explicit message, if any label could not be encoded;
# NaN is not a usable class label for training or for the metrics below.
if df['Category'].isna().any():
    raise ValueError("Column 'Category' contains labels other than 'ham' or 'spam'.")

X = df['Message']
y = df['Category']

# --- Train / test split -------------------------------------------------------
# 80/20 split; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Feature extraction -------------------------------------------------------
vectorizer = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

# The vectorizer is fitted on the training messages only; the test messages
# are transformed with that already-fitted vectorizer.
X_train_features = vectorizer.fit_transform(X_train)
X_test_features = vectorizer.transform(X_test)

# --- Model training -----------------------------------------------------------
model = LogisticRegression()
model.fit(X_train_features, y_train)

# --- Evaluation ---------------------------------------------------------------
prediction_on_train_data = model.predict(X_train_features)
accuracy_on_train_data = accuracy_score(y_train, prediction_on_train_data)

prediction_on_test_data = model.predict(X_test_features)
accuracy_on_test_data = accuracy_score(y_test, prediction_on_test_data)

print("--- Model Performance ---")
print(f'Accuracy on training data: {accuracy_on_train_data:.4f}')
print(f'Accuracy on test data: {accuracy_on_test_data:.4f}\n')

print("--- Classification Report ---")
# target_names are listed in label order: 0 (ham) first, then 1 (spam).
print(classification_report(y_test, prediction_on_test_data, target_names=['Ham (Not Spam)', 'Spam']))

print("--- Confusion Matrix ---")
conf_matrix = confusion_matrix(y_test, prediction_on_test_data)
# Wrap the raw matrix in a DataFrame so rows/columns are labelled when printed.
print(pd.DataFrame(conf_matrix, index=['Actual Ham', 'Actual Spam'], columns=['Predicted Ham', 'Predicted Spam']))
print("\n" + "="*50 + "\n")

def predict_email(email_text):
    """Classify one email text and return a human-readable verdict.

    The text is transformed with the module-level ``vectorizer`` and passed
    to the module-level ``model``. A predicted label of 1 yields the spam
    message; any other label yields the ham message.
    """
    features = vectorizer.transform([email_text])
    predicted_label = model.predict(features)[0]
    return (
        "Prediction: This looks like SPAM."
        if predicted_label == 1
        else "Prediction: This looks like a legitimate email (HAM)."
    )

# Demo: run the classifier on a few hand-written example messages.
print("--- Spam Detector in Action ---")

spam_email = "Congratulations! You've won a $1000 Walmart gift card. Go to http://bit.ly/scam-link to claim now."
ham_email = "Hey, are we still on for the meeting tomorrow at 2 PM? Let me know."
spam_email_2 = "URGENT: Your account has been compromised. Please verify your details immediately to avoid suspension."

# Print each example followed by its verdict, in the same order as before.
for sample_email in (spam_email, ham_email, spam_email_2):
    print(f"Email: '{sample_email}'")
    print(f"{predict_email(sample_email)}\n")


--- Model Performance ---
Accuracy on training data: 0.9693
Accuracy on test data: 0.9525

--- Classification Report ---
                precision    recall  f1-score   support

Ham (Not Spam)       0.95      1.00      0.97       965
          Spam       0.97      0.67      0.79       150

      accuracy                           0.95      1115
     macro avg       0.96      0.83      0.88      1115
  weighted avg       0.95      0.95      0.95      1115

--- Confusion Matrix ---
             Predicted Ham  Predicted Spam
Actual Ham             962               3
Actual Spam             50             100


--- Spam Detector in Action ---
Email: 'Congratulations! You've won a $1000 Walmart gift card. Go to http://bit.ly/scam-link to claim now.'
Prediction: This looks like SPAM.

Email: 'Hey, are we still on for the meeting tomorrow at 2 PM? Let me know.'
Prediction: This looks like a legitimate email (HAM).

Email: 'URGENT: Your account has been compromised. Please verify your details