# Regression with Amazon SageMaker XGBoost algorithm

In [None]:
# !pip3 install -U sagemaker
!pip install pyarrow fastparquet
!pip install xgboost


In [1]:
# Import Libraries
import os
import boto3
from boto3 import session
import pandas as pd
import pyarrow.parquet as pq
from boto3.s3.transfer import S3Transfer
import matplotlib.pyplot as plt
import pickle

print("done import")


done import


In [3]:
# Load Training Data
bucket_name = 'pipeline2-data-storage'
file_name = 'part-00003-9a62fedd-6eb0-4804-a32b-302a11cd6a91-c000.snappy.parquet'
file_key = 'historic-processed/df-cascade/'+ file_name


s3 = boto3.client('s3')
transfer = S3Transfer(s3)
transfer.download_file(bucket_name, file_key, file_name)

# Load Parquet file into a DataFrame
table = pq.read_table(file_name)
df = table.to_pandas()



# # Sample 1000000 records for training due to resource limitations
df = df.sample(n=1000000, random_state=42)
      
print("Total number of records:", df.shape[0])


Total number of records: 1000000


In [None]:

df.columns
df.dtypes



In [15]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split

df['crsdeptime'] = df['crsdeptime'].astype(str).str.zfill(4)  # Ensure all times are in "HHMM"
df['crsdephour'] = df['crsdeptime'].str[:2].astype(int)       # Extract hour as integer
df['crsdepminute'] = df['crsdeptime'].str[2:].astype(int)     # Extract minute as integer


# Assuming your dataframe is named df
X = df[['dayofmonth', 'year', 'month', 'origin', 'dest', 'crsdephour','crsdepminute', 'uniquecarrier']]

y = df['arrdelay']

# One-hot encoding for categorical features
X = pd.get_dummies(X, columns=['origin', 'dest', 'uniquecarrier'])

# Assuming your preprocessed data is in X and y
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the XGBRegressor
xgb_model = XGBRegressor(
    objective='reg:squarederror',  # For regression tasks
    n_estimators=100,              # Number of trees
    learning_rate=0.1,             # Step size shrinkage
    max_depth=6,                   # Maximum depth of a tree
    random_state=42
)

# Train the model
xgb_model.fit(X_train, y_train)

# Make predictions
y_pred = xgb_model.predict(X_test)

print(y_pred)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f"Mean Absolute Error: {mae}")
print(f"Root Mean Squared Error: {rmse}")



[23.208885   6.1441884 14.356416  ... 11.580776   6.9681387 11.414705 ]
Mean Absolute Error: 20.71948922856197
Root Mean Squared Error: 35.9846092948558




In [None]:
# Gridsearch to optimize model

from sklearn.model_selection import GridSearchCV

# Define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# Grid search
grid_search = GridSearchCV(XGBRegressor(objective='reg:squarederror', random_state=42),
                           param_grid, scoring='neg_mean_absolute_error', cv=3)
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)


In [19]:
# Save the model to S3
# Save the features to S3 as well

import joblib

# Define bucket name and file name in S3
bucket_name = 'pipeline2-data-storage'
model_path = 'models/xgboost_model.pkl'
columns_path = 'models/feature_columns.pkl'

# Save the trained model locally as a .pkl file
joblib.dump(xgb_model, 'xgboost_model.pkl')

# Save the feature columns locally as a .pkl file
joblib.dump(X.columns, 'feature_columns.pkl')

# Function to upload file to S3
def upload_file_to_s3(local_path, bucket, s3_path):
    with open(local_path, 'rb') as file_data:
        s3.upload_fileobj(file_data, bucket, s3_path)
    print(f"{local_path} uploaded to s3://{bucket}/{s3_path}")

# Upload the model and feature columns to S3
upload_file_to_s3('xgboost_model.pkl', bucket_name, model_path)
upload_file_to_s3('feature_columns.pkl', bucket_name, columns_path)

xgboost_model.pkl uploaded to s3://pipeline2-data-storage/models/xgboost_model.pkl
feature_columns.pkl uploaded to s3://pipeline2-data-storage/models/feature_columns.pkl


In [21]:
# Testing the model with mock data mimicking live API data input

test_data = [
    {
        'flightnum':123,
        'origin':'ABE',
        'dest':'ABI',
        'uniquecarrier': 'B6',
        'deptime': 1620,
        'dayofmonth':5,
        'year':2019,
        'month':3,
    },
    {
        'flightnum':345,
        'origin':'ABI',
        'dest':'ABQ',
        'uniquecarrier': 'DL',
        'deptime': 1620,
        'dayofmonth':12,
        'year':2012,
        'month':12,
    },
    {
        'flightnum':678,
        'origin':'ABQ',
        'dest':'CDC',
        'uniquecarrier': 'F9',
        'deptime': 1620,
        'dayofmonth':2,
        'year':2003,
        'month':6,
    }
]

new_data_df = pd.DataFrame(test_data)
flightnums = new_data_df['flightnum']
origins = new_data_df['origin']
dests = new_data_df['dest']
uniquecarriers = new_data_df['uniquecarrier']


# Process the input to match training features
# Ensure 'crsdeptime' has 4 characters by padding with leading zeros if necessary
new_data_df['deptime'] = new_data_df['deptime'].astype(str).str.zfill(4)
# new_data_df['deptime'] = new_data_df['deptime'].str[:2].astype(int)
new_data_df['crsdepminute'] = new_data_df['deptime'].str[2:].astype(int)
new_data_df = new_data_df.drop(columns=['deptime','flightnum'])
new_data_df = pd.get_dummies(new_data_df, columns=['origin', 'dest', 'uniquecarrier'])

# Reindex to match training feature columns
new_data_df = new_data_df.reindex(columns=X.columns, fill_value=0)

# Make predictions for all rows
predictions = xgb_model.predict(new_data_df)


result_df = pd.DataFrame({
        'flightnum': flightnums,
        'orgin': origins,
        'destination': dests,
        'carrier': uniquecarriers,
        'expected_delay': predictions
    })

# Convert result to JSON format for return
result_json = result_df.to_dict(orient='records')
print(result_json)


[{'flightnum': 123, 'orgin': 'ABE', 'destination': 'ABI', 'carrier': 'B6', 'predicted_delay': 3.362814426422119}, {'flightnum': 345, 'orgin': 'ABI', 'destination': 'ABQ', 'carrier': 'DL', 'predicted_delay': 2.300037145614624}, {'flightnum': 678, 'orgin': 'ABQ', 'destination': 'CDC', 'carrier': 'F9', 'predicted_delay': -0.5688889622688293}]
