# Training Notebook 2
## Dropping Non-Important Features

Based on feature importance analysis, we identified the following features as having zero importance:

- `IsDomainIP` (Index 3)
- `NoOfEqualsInURL` (Index 6)
- `NoOfQMarkInURL` (Index 7)
- `URLEntropy` (Index 12)

Since the dataset has no headers, we use these indices to drop the columns directly. This ensures that the final dataset is smaller, more efficient, and contains only the features that actually contribute to the model’s predictions.

The remaining columns are:

- `label`
- `URLLength`
- `DomainLength`
- `NoOfSubDomain`
- `LetterRatioInURL`
- `NoOfAmpersandInURL`
- `SpacialCharRatioInURL`
- `IsHTTPS`
- `CharContinuationRate`


## Import Libraries

In [None]:
# Standard library
import os
import tarfile

# Data handling
import pandas as pd
import numpy as np

# AWS and SageMaker
import boto3
import sagemaker
from sagemaker import Session, get_execution_role
from sagemaker.inputs import TrainingInput
from sagemaker.estimator import Estimator
from sagemaker.model import Model
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer
from sagemaker import image_uris

# Machine learning and evaluation
import xgboost as xgb
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [None]:
# Where the full-feature splits live and where the reduced copies are written
input_dir = '../data/initial_processed_data'
output_dir = '../data/data_reduced_features'
os.makedirs(output_dir, exist_ok=True)

# Positional labels of the zero-importance features to remove
columns_to_drop = [3,6,7,12]

# Every split receives exactly the same treatment
files = ['train.csv', 'validation.csv', 'test.csv']

for name in files:
    src_path = os.path.join(input_dir, name)
    dst_path = os.path.join(output_dir, name)

    # The files have no header row, so pandas labels the columns 0..N-1
    # and the integer labels above address the columns directly.
    frame = pd.read_csv(src_path, header=None)
    reduced = frame.drop(columns=columns_to_drop)

    # Write back in the same header-less, index-less layout as the input
    reduced.to_csv(dst_path, index=False, header=False)
    print(f"Processed and saved {name} -> {dst_path}")

In [None]:
# Destination bucket/prefix and the client used for the uploads
bucket = 'bucket_name'
s3_prefix = 'data/data_reduced_features'
s3 = boto3.client('s3')

# Push each reduced split to S3 under the shared prefix
for name in files:
    s3_key = f"{s3_prefix}/{name}"
    local_path = os.path.join(output_dir, name)

    s3.upload_file(local_path, bucket, s3_key)
    print(f"Uploaded {name} to s3://{bucket}/{s3_key}")

In [None]:
# Compare the training split before and after the feature reduction

# Locations of the two versions of the file
original_path = '../data/initial_processed_data/train.csv'
cleaned_path = '../data/data_reduced_features/train.csv'

# Neither file has a header row, so the first record must not be consumed as one
df_original, df_cleaned = (
    pd.read_csv(csv_path, header=None)
    for csv_path in (original_path, cleaned_path)
)

# Preview the first five rows of each version under a matching banner
for banner, frame in (
    ("\n===== Original Dataset Head (first 5 rows) =====", df_original),
    ("\n===== Cleaned Dataset Head (first 5 rows) =====", df_cleaned),
):
    print(banner)
    print(frame.head())

# Column positions in the original file that remain after the drop
original_cols_to_keep = [0, 1, 2, 4, 5, 8, 9, 10, 11]
print("\n===== Original Columns Kept Indices =====")
print(original_cols_to_keep)

# Retrain with the new dataset

In [None]:
# Execution role, session and region used by the training job
role = get_execution_role()
session = Session()
region = session.boto_region_name

# XGBoost hyperparameters (passed to the container as strings)
hyperparameters = dict(
    max_depth="5",
    eta="0.2",
    gamma="4",
    min_child_weight="6",
    subsample="0.7",
    objective="binary:logistic",
    num_round="300",
)

# S3 location that receives the trained model artifact
bucket = 'bucket_name'
s3_output_key = 'models/xgboost/v2'
output_path = f's3://{bucket}/{s3_output_key}'

# Look up the built-in XGBoost container image for this region;
# the version string selects which release of the container is used.
xgboost_container = image_uris.retrieve(
    framework="xgboost",
    region=region,
    version="1.7-1",
)

# Estimator that runs the XGBoost container as a SageMaker training job
estimator = Estimator(
    image_uri=xgboost_container,
    role=role,
    instance_count=2,  # two instances to demonstrate multi-instance training
    instance_type='ml.m5.large',
    volume_size=5,  # 5 GB
    output_path=output_path,
    hyperparameters=hyperparameters,
)


In [None]:
# Content type and S3 location shared by the training and validation channels
content_type = "text/csv"
bucket = 'bucket_name'
prefix = 'data/data_reduced_features'

# Both splits sit side by side under the same S3 folder
s3_data_root = f"s3://{bucket}/{prefix}"

train_input = TrainingInput(
    f"{s3_data_root}/train.csv",
    content_type=content_type,
)
validation_input = TrainingInput(
    f"{s3_data_root}/validation.csv",
    content_type=content_type,
)

In [None]:
# Sanity-check the S3 URI of the training file built from `bucket` and `prefix`.
# This is a bare expression: as the last statement of a notebook cell its value is displayed.
f"s3://{bucket}/{prefix}/train.csv"

In [None]:
# Map each channel name to its input, then launch training and block until it
# finishes while streaming all logs.
channels = {
    'train': train_input,
    'validation': validation_input,
}
estimator.fit(channels, wait=True, logs="All")

The trained model data is saved to `path_to_model_tar`.

# Test XGBoost Model
- Need to show evaluation metrics for the retrained model

In [None]:
# Download the reduced-feature test split from S3 for local evaluation.
bucket = 'bucket_name'
# NOTE: despite its name, `prefix` holds the full object key of the test file here
# (it overrides the folder-style `prefix` assigned in the training-input cell).
prefix = 'data/data_reduced_features/test.csv'
local_file = '../data/local_test_data/test_v2.csv'

# download_file does not create missing parent directories, so make sure the
# target folder exists before downloading.
os.makedirs(os.path.dirname(local_file), exist_ok=True)

s3 = boto3.client('s3')
s3.download_file(bucket, prefix, local_file)
print(f"Downloaded {prefix} from S3 to {local_file}")


In [None]:
# Read the downloaded test split; it has no header row and the label is column 0
df_test = pd.read_csv(local_file, header=None)

# Show the first five rows as a quick sanity check
preview = df_test.head(5)
print(preview)

In [None]:
# Model location in S3 (artifact written by the training job) and local destination
bucket = 'bucket_name'
model_key = 'models/xgboost/v2/sagemaker-xgboost-2025-06-03-14-53-12-660/output/model.tar.gz'
local_file = '../data/local_model_data/xgboost-v2/model.tar.gz'

# download_file does not create missing parent directories, so make sure the
# target folder exists before downloading.
os.makedirs(os.path.dirname(local_file), exist_ok=True)

# Download model file
s3 = boto3.client('s3')
s3.download_file(bucket, model_key, local_file)
print(f"Downloaded {model_key} from S3 to {local_file}")

In [None]:
# Directory that receives the contents of the downloaded model archive
target_dir = "../data/local_model_data/xgboost-v2/"

with tarfile.open(local_file) as tar:
    # Prefer the "data" extraction filter where the interpreter supports it:
    # it rejects absolute paths, path traversal and special files, and avoids
    # the DeprecationWarning raised by unfiltered extraction on Python 3.12+.
    if hasattr(tarfile, "data_filter"):
        tar.extractall(path=target_dir, filter="data")
    else:
        # Older interpreters have no filter support; keep the plain extraction.
        tar.extractall(path=target_dir)

print(f"Model extracted to {target_dir}")

In [None]:
# The built-in XGBoost container stores the trained model under this file name
model_path = '../data/local_model_data/xgboost-v2/xgboost-model'

# Start from an empty booster and populate it from the extracted model file
booster = xgb.Booster()
booster.load_model(model_path)
print("Model loaded!")

In [None]:
# Split the test frame into labels and features
label_column = df_test.iloc[:, 0]    # first column holds the label
X_test = df_test.iloc[:, 1:]         # every remaining column is a feature
y_test = label_column.astype(int)

In [None]:
# Score the test features with the loaded booster
dtest = xgb.DMatrix(X_test)
y_pred_prob = booster.predict(dtest)

# Turn the predicted scores into 0/1 labels using a 0.5 cut-off
threshold = 0.5
is_positive = y_pred_prob >= threshold
y_pred = is_positive.astype(int)

In [None]:
# Per-class precision / recall / F1
print("\n===== Classification Report =====")
report_text = classification_report(y_test, y_pred)
print(report_text)

# Overall fraction of correct predictions, shown to four decimals
print("\n===== Accuracy Score =====")
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Counts of true labels versus predicted labels
print("\n===== Confusion Matrix =====")
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

# Deploy Endpoint
- You can continue to deployment from here; there is no need to retrain if the model is already trained.

In [None]:
# Execution role, session and region needed for deployment
role = get_execution_role()
session = Session()
region = session.boto_region_name

# Location of the trained model artifact (model.tar.gz)
model_data = 'path_to_model_tar'

# URI of SageMaker's built-in XGBoost image that will serve the model
xgboost_image_uri = image_uris.retrieve(
    framework='xgboost',
    region=region,
    version='1.7-1',
)

In [None]:
# Wrap the trained artifact in a deployable SageMaker Model served by the
# built-in XGBoost image.
model = Model(
    image_uri=xgboost_image_uri,
    model_data=model_data,
    role=role,
    # A generic Model has no predictor class by default; without one,
    # Model.deploy() returns None and ignores any serializer/deserializer
    # passed to it. Setting it makes deploy() return a usable Predictor.
    predictor_cls=sagemaker.predictor.Predictor,
    sagemaker_session=session
)

In [None]:
# Endpoint configuration: a single ml.m5.large instance that accepts CSV
# requests and decodes JSON responses.
deploy_config = dict(
    initial_instance_count=1,
    instance_type='ml.m5.large',
    serializer=CSVSerializer(),
    deserializer=JSONDeserializer(),
)

# Create the real-time endpoint.
# NOTE: Model.deploy() only returns a predictor object when the Model was
# created with a `predictor_cls`; otherwise `predictor` is None.
predictor = model.deploy(**deploy_config)

print("Endpoint deployed and ready for real-time inference!")

## Endpoint Deployed
- Now create the Lambda function using the code in `lambda_functions/lambda_functions.py`
- Follow the instructions in `README` to create and test the function, and finally to create the API and Web UI