In [None]:
# === Imports ===
import pandas as pd
import numpy as np
import re
import boto3
import io
import sagemaker
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sagemaker.inputs import TrainingInput
from sagemaker import image_uris
from sagemaker.estimator import Estimator
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer
from sagemaker.predictor import Predictor

# === Configuration ===
bucket = 'email-spam-bucket'     # Replace with your S3 bucket name
prefix = 'spam-xgboost'
role = sagemaker.get_execution_role()
sagemaker_session = sagemaker.Session()
region = sagemaker_session.boto_region_name

# === Load and Clean Data ===
df = pd.read_csv("spamdata.csv", encoding='latin1')
df = df.rename(columns={'v1': 'label', 'v2': 'text'})[['label', 'text']]
df['label'] = df['label'].map({'ham': 0, 'spam': 1})
df['text'] = df['text'].apply(lambda x: re.sub(r'\W+', ' ', str(x).lower()))

# === Vectorize Text Using TF-IDF ===
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['text'])
y = df['label'].values

# === Split Dataset ===
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.1, random_state=42)

# === Create DataFrames for CSV ===
train_data = np.hstack((y_train.reshape(-1, 1), X_train.toarray()))
val_data = np.hstack((y_val.reshape(-1, 1), X_val.toarray()))

train_df = pd.DataFrame(train_data)
val_df = pd.DataFrame(val_data)

# === Upload to S3 ===
s3 = boto3.resource('s3')

def upload_df_to_s3(df, key):
    csv_buffer = io.StringIO()
    df.to_csv(csv_buffer, header=False, index=False)
    s3.Bucket(bucket).Object(key).put(Body=csv_buffer.getvalue())
    print(f"✅ Uploaded to s3://{bucket}/{key}")

upload_df_to_s3(train_df, f'{prefix}/train/train.csv')
upload_df_to_s3(val_df, f'{prefix}/validation/val.csv')

# === Prepare SageMaker Inputs ===
train_input = TrainingInput(f's3://{bucket}/{prefix}/train/', content_type='text/csv')
val_input = TrainingInput(f's3://{bucket}/{prefix}/validation/', content_type='text/csv')

# === Retrieve XGBoost Container ===
container = image_uris.retrieve('xgboost', region=region, version='1.2-1')

# === Handle Class Imbalance ===
ham_count = len(df[df['label'] == 0])
spam_count = len(df[df['label'] == 1])
if spam_count == 0:
    print("⚠️ Warning: No spam samples found. Using default scale_pos_weight = 1.")
    pos_weight = 1
else:
    pos_weight = ham_count / spam_count

# === Define Estimator ===
estimator = Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type='ml.m5.xlarge',
    output_path=f's3://{bucket}/{prefix}/output',
    sagemaker_session=sagemaker_session,
    hyperparameters={
        "max_depth": 6,
        "eta": 0.1,
        "gamma": 4,
        "min_child_weight": 6,
        "objective": "binary:logistic",
        "num_round": 200,
        "verbosity": 1,
        "scale_pos_weight": pos_weight
    }
)

# === Train Model ===
estimator.fit({'train': train_input, 'validation': val_input})
print("✅ Model training complete.")

# === Deploy Endpoint ===
predictor = estimator.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.large',
    serializer=CSVSerializer(),
    deserializer=JSONDeserializer()
)
print("✅ Endpoint deployed.")

# === Predict New Message ===
sample_text = ["Congratulations! You’ve won a $1000 Walmart gift card. Call now!"]
sample_vectorized = vectorizer.transform(sample_text).toarray()
response = predictor.predict(sample_vectorized)
score = float(response)

print("📩 Spam Score:", score)
print("📢 Prediction:", "Spam" if score > 0.5 else "Not spam")

# === OPTIONAL: Clean up endpoint ===
# predictor.delete_endpoint()


✅ Uploaded to s3://email-spam-bucket/spam-xgboost/train/train.csv


INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2025-05-27-18-16-37-815


✅ Uploaded to s3://email-spam-bucket/spam-xgboost/validation/val.csv
2025-05-27 18:16:39 Starting - Starting the training job...
2025-05-27 18:17:08 Downloading - Downloading input data...
2025-05-27 18:17:28 Downloading - Downloading the training image.....[34m[2025-05-27 18:18:25.602 ip-10-0-175-215.ap-south-1.compute.internal:7 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:De

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2025-05-27-18-19-24-551


Training seconds: 124
Billable seconds: 124
✅ Model training complete.


INFO:sagemaker:Creating endpoint-config with name sagemaker-xgboost-2025-05-27-18-19-24-551
INFO:sagemaker:Creating endpoint with name sagemaker-xgboost-2025-05-27-18-19-24-551


------

In [3]:
sample_text = ["Congratulations! You’ve won a $1000 Walmart gift card. Call now!"]
sample_vectorized = vectorizer.transform(sample_text).toarray()
response = predictor.predict(sample_vectorized)
score = float(response)

print("📩 Spam Score:", score)
print("📢 Prediction:", "Spam" if score > 0.5 else "Not spam")


📩 Spam Score: 2.437997500237543e-05
📢 Prediction: Not spam
