In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
import nltk
import re

# Download stopwords if not already done
nltk.download('stopwords')
from nltk.corpus import stopwords

# Step 1: Create a more balanced synthetic spam dataset
data = {
    'Text': [
        "Congratulations! You've won a $1000 gift card. Click here to claim now!",
        "Your loan is approved. Get the money fast by clicking here.",
        "Don't miss out on our special offer. Limited time only!",
        "Click this link to get a free iPhone!",
        "Urgent: Your bank account has been compromised. Verify your information now.",
        "Exclusive offer just for you! Get 50% off on all products.",
        "Hey John, let's meet up tomorrow at the cafe.",
        "Can you send me the project files by tomorrow?",
        "How was your weekend? Let’s catch up soon.",
        "Your invoice is attached. Please review it.",
        "Looking forward to our meeting next week.",
        "Let’s go for lunch together.",
        "You won a free trip to Hawaii! Claim your prize now!",
        "You have a new message from your friend.",
        "Get rich quick! Invest now!"
    ],
    'Label': [
        1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1  # Balanced spam and not spam samples
    ]
}

df = pd.DataFrame(data)

# Step 2: Preprocess text data (cleaning, tokenization, and vectorization)
# Function to clean the text
def preprocess_text(text):
    text = re.sub(r'\W', ' ', text)  # Remove non-word characters
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = text.lower()  # Convert to lowercase
    return text

# Apply preprocessing
df['Text'] = df['Text'].apply(preprocess_text)

# Step 3: Feature Extraction using TF-IDF Vectorization
tfidf = TfidfVectorizer(stop_words=stopwords.words('english'))
X = tfidf.fit_transform(df['Text']).toarray()
y = df['Label']

# Step 4: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 5: Train an XGBoost Classifier
xgb_model = XGBClassifier(eval_metric='logloss')
xgb_model.fit(X_train, y_train)

# Step 6: Make predictions on the test data
y_pred = xgb_model.predict(X_test)

# Step 7: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=['Not Spam', 'Spam'], zero_division=0))

# Step 8: Example prediction
sample_text = "You have won a lottery! Claim your prize now."
cleaned_text = preprocess_text(sample_text)
vectorized_text = tfidf.transform([cleaned_text]).toarray()

prediction = xgb_model.predict(vectorized_text)
if prediction == 1:
    print(f"The message '{sample_text}' is classified as SPAM.")
else:
    print(f"The message '{sample_text}' is classified as NOT SPAM.")


Accuracy: 0.40
Classification Report:
              precision    recall  f1-score   support

    Not Spam       0.00      0.00      0.00         3
        Spam       0.40      1.00      0.57         2

    accuracy                           0.40         5
   macro avg       0.20      0.50      0.29         5
weighted avg       0.16      0.40      0.23         5

The message 'You have won a lottery! Claim your prize now.' is classified as SPAM.


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
