# Training Notebook 1
- In this notebook, the first iteration of the XGBoost model is trained and tested.

## Import Libraries

In [None]:
# Standard library
import tarfile
from pathlib import Path

# AWS & SageMaker
import boto3
import sagemaker
from sagemaker import Session, get_execution_role, image_uris
from sagemaker.inputs import TrainingInput
from sagemaker.estimator import Estimator

# Data handling & processing
import pandas as pd

# Model training & evaluation
import xgboost as xgb
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Visualisation
from matplotlib import pyplot as plt

In [None]:
# execution role, SageMaker session and AWS region used by this notebook
role = get_execution_role()
session = Session()
region = session.boto_region_name

# XGBoost hyperparameters for the first model iteration (all values passed as strings)
hyperparameters = {
    "max_depth": "5",
    "eta": "0.2",
    "gamma": "4",
    "min_child_weight": "6",
    "subsample": "0.7",
    "objective": "binary:logistic",
    "num_round": "300",
}

# S3 location under which the trained model artefact is written
bucket = 'bucket_name'
s3_output_key = 'models/xgboost/v1'
output_path = f's3://{bucket}/{s3_output_key}'

# look up the URI of the XGBoost container image for this region;
# change `version` to pick a different release of the image
xgboost_container = image_uris.retrieve(
    framework="xgboost",
    region=region,
    version="1.7-1",
)

# estimator that runs the training job inside the XGBoost container
estimator = Estimator(
    image_uri=xgboost_container,
    role=role,
    hyperparameters=hyperparameters,
    instance_type='ml.m5.large',
    instance_count=2,  # demonstrating multi instance training
    volume_size=5,  # 5 GB
    output_path=output_path,
)

In [None]:
# format and S3 location of the training and validation datasets
content_type = "text/csv"
bucket = 'bucket_name'
prefix = 'data/initial_processed_data'

# NOTE(review): the estimator uses instance_count=2 — confirm how each channel
# should be distributed across instances (see the `distribution` argument of
# TrainingInput); none is set explicitly here.
train_input = TrainingInput(
    s3_data=f"s3://{bucket}/{prefix}/train.csv",
    content_type=content_type,
)
validation_input = TrainingInput(
    s3_data=f"s3://{bucket}/{prefix}/validation.csv",
    content_type=content_type,
)

In [None]:
# sanity check: display the resolved S3 URI of the training file
f"s3://{bucket}/{prefix}/train.csv"

In [None]:
# run the XGBoost training job on both channels; wait=True blocks until the
# job has finished and logs="All" requests the full log output
data_channels = {
    'train': train_input,
    'validation': validation_input,
}
estimator.fit(data_channels, wait=True, logs="All")

Model data saved to `s3://bucket_name/models/xgboost/v1/sagemaker-xgboost-2025-06-03-14-33-29-138/output/model.tar.gz`

# Test XGBoost Model


In [None]:
# download the held-out test data from S3 to the local disk
bucket = 'bucket_name'
# use a dedicated name for the object key so the training-data `prefix`
# defined earlier is not overwritten (re-running earlier cells stays safe)
test_key = 'data/initial_processed_data/test.csv'
local_file = '../data/local_test_data/test.csv'

# download_file does not create missing parent folders, so make sure the
# target directory exists (e.g. on a fresh checkout)
Path(local_file).parent.mkdir(parents=True, exist_ok=True)

s3 = boto3.client('s3')
s3.download_file(bucket, test_key, local_file)
print(f"Downloaded {test_key} from S3 to {local_file}")

In [None]:
# load the downloaded test data into a DataFrame
# NOTE(review): read_csv treats the first row as a header by default. If
# test.csv has no header row (like the CSVs passed to the training job —
# TODO confirm), pass header=None so the first sample is not lost.
df_test = pd.read_csv(local_file)

In [None]:
# split the test frame into label and features
label_position = 0                 # first column = label
feature_positions = slice(1, None)  # rest = features

y_test = df_test.iloc[:, label_position].astype(int)
X_test = df_test.iloc[:, feature_positions]

In [None]:
# download the trained model artefact from S3 to the local disk
bucket = 'bucket_name'
model_key = 'models/xgboost/v1/sagemaker-xgboost-2025-06-03-14-33-29-138/output/model.tar.gz'
# note: `local_file` now points at the model archive (it previously held the
# test CSV path), so run the cells in order
local_file = '../data/local_model_data/xgboost-v1/model.tar.gz'

# download_file does not create missing parent folders, so make sure the
# target directory exists (e.g. on a fresh checkout)
Path(local_file).parent.mkdir(parents=True, exist_ok=True)

s3 = boto3.client('s3')
s3.download_file(bucket, model_key, local_file)
print(f"Downloaded {model_key} from S3 to {local_file}")

In [None]:
# directory the model archive is unpacked into
target_dir = "../data/local_model_data/xgboost-v1/"

with tarfile.open(local_file) as tar:
    if hasattr(tarfile, "data_filter"):
        # the "data" extraction filter refuses members that would land outside
        # target_dir (absolute paths, ".." components, unsafe links)
        tar.extractall(path=target_dir, filter="data")
    else:
        # older Python releases without extraction filters: plain extraction
        tar.extractall(path=target_dir)

print(f"Model extracted to {target_dir}")

In [None]:
# load the extracted model into an XGBoost Booster
model_path = '../data/local_model_data/xgboost-v1/xgboost-model'  # built-in XGBoost saves as this name

booster = xgb.Booster()
booster.load_model(model_path)
print("Model loaded!")

In [None]:
# score the test features and turn the predicted values into 0/1 labels
decision_threshold = 0.5

dtest = xgb.DMatrix(X_test)
y_pred_prob = booster.predict(dtest)
y_pred = (y_pred_prob >= decision_threshold).astype(int)

In [None]:
# compute all evaluation metrics first, then print them section by section
report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

sections = [
    ("\n===== Classification Report =====", report),
    ("\n===== Accuracy Score =====", f"Accuracy: {accuracy:.4f}"),
    ("\n===== Confusion Matrix =====", conf_matrix),
]
for header, body in sections:
    print(header)
    print(body)

# Feature Importance
- We will explore which features were the most important to the model.
- This helps identify features that do not contribute, so they can be removed, reducing the feature engineering and pre-processing required at inference.

In [None]:
# get the feature importance scores as a dict (feature key -> score),
# using the 'weight' importance type
importance_dict = booster.get_score(importance_type='weight')

In [None]:
# inspect the raw importance scores
importance_dict

In [None]:
# readable feature names, ordered by position; the list index is used to
# translate importance keys of the form 'f<index>' into names
feature_names = [
    'URLLength',              # 0
    'DomainLength',           # 1
    'IsDomainIP',             # 2
    'NoOfSubDomain',          # 3
    'LetterRatioInURL',       # 4
    'NoOfEqualsInURL',        # 5
    'NoOfQMarkInURL',         # 6
    'NoOfAmpersandInURL',     # 7
    'SpacialCharRatioInURL',  # 8
    'IsHTTPS',                # 9
    'CharContinuationRate',   # 10
    'URLEntropy',             # 11
]

In [None]:
# build a (feature name, score) table from the importance dict, highest score first
# assumes every key has the form 'f<index>' with <index> a valid position in
# feature_names — TODO confirm against the keys shown above
importance_rows = [
    (feature_names[int(key[1:])], score)
    for key, score in importance_dict.items()
]
importance_df = pd.DataFrame(importance_rows, columns=['Feature', 'Importance'])
importance_df = importance_df.sort_values(by='Importance', ascending=False)

print(importance_df)

In [None]:
# horizontal bar chart of the importance scores, most important feature on top
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(importance_df['Feature'], importance_df['Importance'])
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
ax.set_title('Feature Importance (Weight)')
ax.invert_yaxis()  # importance_df is sorted descending, so flip the axis to read top-down
plt.show()

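### We will drop features not present here, as they did not contribute to the model
These features are (using XGBoost's zero-based `f<index>` keys, which match the positions in the `feature_names` list above):
- f2: IsDomainIP
- f5: NoOfEqualsInURL
- f6: NoOfQMarkInURL
- f11: URLEntropy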

# We will continue in `training_notebook_2`

