# Install necessary packages
This block installs the s3fs package using the %pip magic command.

In [1]:
%pip install s3fs

Collecting fsspec==2024.3.1 (from s3fs)
  Using cached fsspec-2024.3.1-py3-none-any.whl.metadata (6.8 kB)
Using cached fsspec-2024.3.1-py3-none-any.whl (171 kB)
Installing collected packages: fsspec
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2023.6.0
ERROR: Cannot uninstall fsspec 2023.6.0, RECORD file not found. You might be able to recover from this via: 'pip install --force-reinstall --no-deps fsspec==2023.6.0'.
Note: you may need to restart the kernel to use updated packages.


# Import libraries
Here, necessary libraries and modules are imported, including boto3 for AWS interactions, sagemaker for using SageMaker functionalities,
 pandas for data manipulation, and various functions from sklearn for machine learning metrics.

In [2]:
import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput

import numpy as np
import io
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sagemaker.serializers import CSVSerializer

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


# Load data from S3
This section loads training and testing datasets from Amazon S3 into pandas DataFrames.

In [3]:
# Read the training split from the team bucket into a DataFrame
train_data = pd.read_csv(filepath_or_buffer='s3://team7-bucket/Airline_Dataset/train.csv')

In [4]:
# Read the held-out file; it is later divided into validation and test halves
test_val_data = pd.read_csv(filepath_or_buffer='s3://team7-bucket/Airline_Dataset/test.csv')

In [5]:
# Preview the first five raw training rows
train_data.head(n=5)

Unnamed: 0.1,Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,0,70172,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,...,5,4,3,4,4,5,5,25,18.0,neutral or dissatisfied
1,1,5047,Male,disloyal Customer,25,Business travel,Business,235,3,2,...,1,1,5,3,1,4,1,1,6.0,neutral or dissatisfied
2,2,110028,Female,Loyal Customer,26,Business travel,Business,1142,2,2,...,5,4,3,4,4,4,5,0,0.0,satisfied
3,3,24026,Female,Loyal Customer,25,Business travel,Business,562,2,5,...,2,2,5,3,1,4,2,11,9.0,neutral or dissatisfied
4,4,119299,Male,Loyal Customer,61,Business travel,Business,214,3,3,...,3,3,4,4,3,3,3,0,0.0,satisfied


# Define transformation functions
Here, transformation functions are defined to preprocess categorical variables into numerical format.

In [6]:
def transform_gender(x):
    """Encode a gender label as an integer.

    Returns 1 for 'Female', 0 for 'Male', and -1 for any other value.
    """
    return 1 if x == 'Female' else (0 if x == 'Male' else -1)
    
def transform_customer_type(x):
    """Encode a customer-type label as an integer.

    Returns 1 for 'Loyal Customer', 0 for 'disloyal Customer', and -1 for
    any other value.
    """
    if x == 'Loyal Customer':
        return 1
    if x == 'disloyal Customer':
        return 0
    # Anything unrecognised (including missing values) maps to the sentinel
    return -1
    
def transform_travel_type(x):
    """Encode a travel-type label as an integer.

    Returns 1 for 'Business travel', 0 for 'Personal Travel', and -1 for
    any other value.
    """
    for label, code in (('Business travel', 1), ('Personal Travel', 0)):
        if x == label:
            return code
    return -1
    
def transform_class(x):
    """Encode a cabin-class label as an integer.

    Returns 2 for 'Business', 1 for 'Eco Plus', 0 for 'Eco', and -1 for
    any other value.
    """
    for label, code in (('Business', 2), ('Eco Plus', 1), ('Eco', 0)):
        if x == label:
            return code
    return -1
    
def transform_satisfaction(x):
    """Encode the satisfaction label as an integer.

    Returns 1 for 'satisfied', 0 for 'neutral or dissatisfied', and -1 for
    any other value.
    """
    if x == 'satisfied':
        return 1
    return 0 if x == 'neutral or dissatisfied' else -1
    
def process_data(df):
    """Return a model-ready copy of a raw airline-satisfaction DataFrame.

    Drops the 'Unnamed: 0' and 'id' columns, encodes 'Gender',
    'Customer Type', 'Type of Travel', 'Class' and 'satisfaction' as integers
    with the ``transform_*`` helpers, and fills missing
    'Arrival Delay in Minutes' values with that column's median.

    The median is computed from the frame that is passed in, so each dataset
    is imputed with its own median.

    Args:
        df: Raw DataFrame containing the columns listed above.

    Returns:
        A new DataFrame; the input frame is not modified.
    """
    # drop() returns a new frame, so the assignments below never touch the caller's data
    df = df.drop(['Unnamed: 0', 'id'], axis=1)
    df['Gender'] = df['Gender'].apply(transform_gender)
    df['Customer Type'] = df['Customer Type'].apply(transform_customer_type)
    df['Type of Travel'] = df['Type of Travel'].apply(transform_travel_type)
    df['Class'] = df['Class'].apply(transform_class)
    df['satisfaction'] = df['satisfaction'].apply(transform_satisfaction)

    # Assign the filled column back instead of calling fillna(inplace=True) on a
    # column selection: the chained in-place form does not update the frame
    # under pandas Copy-on-Write.
    arrival_delay = df['Arrival Delay in Minutes']
    df['Arrival Delay in Minutes'] = arrival_delay.fillna(arrival_delay.median())

    return df

# Apply transformations to data
The defined transformation functions are applied to preprocess the loaded datasets.

In [7]:
# Run the same preprocessing over the training file and the held-out file
train = train_data.pipe(process_data)
test_val = test_val_data.pipe(process_data)

In [8]:
# Check the encoded columns on the first five processed rows
train.head(n=5)

Unnamed: 0,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,0,1,13,0,1,460,3,4,3,1,...,5,4,3,4,4,5,5,25,18.0,0
1,0,0,25,1,2,235,3,2,3,3,...,1,1,5,3,1,4,1,1,6.0,0
2,1,1,26,1,2,1142,2,2,2,2,...,5,4,3,4,4,4,5,0,0.0,1
3,1,1,25,1,2,562,2,5,5,5,...,2,2,5,3,1,4,2,11,9.0,0
4,0,1,61,1,2,214,3,3,3,3,...,3,3,4,4,3,3,3,0,0.0,1


# Split data into train, validation, and test sets
The held-out test file is split evenly into validation and test sets using train_test_split; the training set is the processed train.csv data, used in full.

In [9]:
# Halve the held-out data: one half for validation, the other for the final test.
# The fixed random_state keeps the split reproducible.
val, test = train_test_split(
    test_val,
    random_state=1200,
    train_size=0.5,
)

In [10]:
# (rows, columns) of the train, validation and test frames, in that order
tuple(frame.shape for frame in (train, val, test))

((103904, 23), (12988, 23), (12988, 23))

# Prepare train, validation, and test sets
This block prepares the training, validation, and test sets for modeling and aligns their columns.

In [11]:
# Separate the 'satisfaction' label from the feature columns of each split
y_train = train['satisfaction']
X_train = train.drop(columns='satisfaction')

y_val = val['satisfaction']
X_val = val.drop(columns='satisfaction')

y_test = test['satisfaction']
X_test = test.drop(columns='satisfaction')

In [12]:
# Rebuild the train and validation frames with the label as the first column
train = pd.concat([y_train, X_train], axis='columns')
val = pd.concat([y_val, X_val], axis='columns')

In [13]:
# Restrict both frames to the columns they have in common so they share one schema
train, val = train.align(val, axis='columns', join='inner')

In [14]:
# Confirm that 'satisfaction' now leads the column order
train.head(n=5)

Unnamed: 0,satisfaction,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,...,Seat comfort,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes
0,0,0,1,13,0,1,460,3,4,3,...,5,5,4,3,4,4,5,5,25,18.0
1,0,0,0,25,1,2,235,3,2,3,...,1,1,1,5,3,1,4,1,1,6.0
2,1,1,1,26,1,2,1142,2,2,2,...,5,5,4,3,4,4,4,5,0,0.0
3,0,1,1,25,1,2,562,2,5,5,...,2,2,2,5,3,1,4,2,11,9.0
4,1,0,1,61,1,2,214,3,3,3,...,5,3,3,4,4,3,3,3,0,0.0


# Upload train and validation data to S3
Train and validation datasets are uploaded to S3 for training the model.

In [15]:
# Shared S3 resource handle used by the upload helper
s3 = boto3.resource(service_name='s3')

In [16]:
def upload_to_s3(df, bucket, filename, s3_resource=None):
    """Upload a DataFrame to S3 as a CSV object with no header row and no index.

    Args:
        df: DataFrame to serialise.
        bucket: Name of the destination S3 bucket.
        filename: Object key to write inside the bucket.
        s3_resource: Optional object exposing ``Object(bucket, key)``. When
            omitted, the notebook-level ``s3`` resource is used.
    """
    resource = s3 if s3_resource is None else s3_resource
    # to_csv returns the CSV text when no target is given, so no buffer is needed
    csv_text = df.to_csv(header=False, index=False)
    # Named s3_object (not `object`) so the builtin is not shadowed
    s3_object = resource.Object(bucket, filename)
    s3_object.put(Body=csv_text.encode('utf-8'))

In [17]:
# Bucket that receives the prepared CSV files and the model artifacts
bucket = "team7-bucket"

In [18]:
# Publish the label-first training frame
upload_to_s3(df=train, bucket=bucket, filename='sagemaker-data/train.csv')

In [19]:
# Publish the label-first validation frame
upload_to_s3(df=val, bucket=bucket, filename='sagemaker-data/val.csv')

In [20]:
# Where the training job writes its model artifacts
output_location = f's3://{bucket}/sagemaker-models/'

# S3 URIs of the CSV files uploaded for training and validation
val_data_location = f's3://{bucket}/sagemaker-data/val.csv'
train_data_location = f's3://{bucket}/sagemaker-data/train.csv'

# Define training input
Training inputs are defined using TrainingInput for training and validation datasets.

In [21]:
# Describe each uploaded file as a CSV input channel for the training job
train_input = TrainingInput(
    content_type='csv',
    s3_data=train_data_location,
)
val_input = TrainingInput(
    content_type='csv',
    s3_data=val_data_location,
)

# Set up SageMaker environment
The SageMaker environment is set up, including getting the execution role and retrieving the XGBoost image URI.

In [22]:
# Look up the execution role, then the XGBoost 1.0-1 image URI for the
# region of the default boto3 session
role = get_execution_role()
xgboost_image = sagemaker.image_uris.retrieve(
    framework='xgboost',
    version='1.0-1',
    region=boto3.Session().region_name,
)


# Define XGBoost model
An XGBoost model is defined using the Estimator class, specifying parameters like instance type, instance count, etc.

In [23]:
# Configure the estimator: a single ml.m4.xlarge instance running the XGBoost
# image, with model artifacts written under output_location
xgboost_model = Estimator(
    image_uri=xgboost_image,
    role=role,
    sagemaker_session=sagemaker.Session(),
    instance_type='ml.m4.xlarge',
    instance_count=1,
    output_path=output_location,
)


# Set hyperparameters and launch training job
Hyperparameters for the XGBoost model are set, and the training job is launched with fit().

In [24]:
# Binary classification with a logistic objective, trained for 1000 boosting rounds
xgboost_model.set_hyperparameters(
    num_round=1000,
    objective='binary:logistic',
)


In [25]:
# Start the training job with the 'train' and 'validation' channels
xgboost_model.fit(
    inputs={
        'train': train_input,
        'validation': val_input,
    }
)

INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2024-04-01-20-21-28-858


2024-04-01 20:21:29 Starting - Starting the training job...
2024-04-01 20:21:44 Starting - Preparing the instances for training...
2024-04-01 20:22:24 Downloading - Downloading input data...
2024-04-01 20:22:59 Downloading - Downloading the training image......
2024-04-01 20:23:44 Training - Training image download completed. Training in progress...[2024-04-01 20:24:00.047 ip-10-0-203-165.ec2.internal:7 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None
INFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training
INFO:sagemaker-containers:Failed to parse hyperparameter objective value binary:logistic to Json.
Returning the value itself
INFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)
INFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode
INFO:root:Determined delimiter of CSV input is ','
INFO:root:Determined delimiter of CSV input is ','

# Get training job name
Retrieve the name of the training job that was just launched.

In [26]:
# Read the job name through the public training-job handle rather than a private attribute
xgboost_model.latest_training_job.name

'sagemaker-xgboost-2024-04-01-20-21-28-858'

# Deploy the trained model
The trained model is deployed to a SageMaker endpoint for making predictions.

In [29]:
# Host the trained model on a single ml.m4.xlarge endpoint that accepts CSV payloads
predictor = xgboost_model.deploy(
    instance_type='ml.m4.xlarge',
    initial_instance_count=1,
    serializer=CSVSerializer(),
)

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2024-04-01-20-26-12-964
INFO:sagemaker:Creating endpoint-config with name sagemaker-xgboost-2024-04-01-20-26-12-964
INFO:sagemaker:Creating endpoint with name sagemaker-xgboost-2024-04-01-20-26-12-964


-------!

# Make predictions and calculate evaluation metrics
Predictions are made using the deployed model, and evaluation metrics like accuracy, precision, recall, F1 score, and ROC-AUC are calculated and printed.

In [30]:
# Make sure request payloads are serialised as CSV
predictor.serializer = sagemaker.serializers.CSVSerializer()

In [34]:
# Send the test features as header-less CSV and decode the raw response bytes to text
predictions = predictor.predict(
    X_test.to_csv(index=False, header=False)
).decode('utf-8')

In [35]:
# Strip only surrounding whitespace and optional brackets. A blind [1:-1] slice
# would cut real digits off the first and last scores of a plain CSV response.
prediction_values = np.fromstring(predictions.strip().strip('[]'), sep=',')
# Every test row must have produced exactly one score before metrics are computed
assert prediction_values.shape[0] == len(X_test), "prediction count does not match X_test"

In [36]:
# Threshold the predicted scores at 0.5 to obtain hard class labels
predicted_labels = (prediction_values > 0.5).astype(int)

# Label-based metrics compare the true labels with the thresholded predictions
accuracy = accuracy_score(y_true=y_test, y_pred=predicted_labels)
precision = precision_score(y_true=y_test, y_pred=predicted_labels)
recall = recall_score(y_true=y_test, y_pred=predicted_labels)
f1 = f1_score(y_true=y_test, y_pred=predicted_labels)
# ROC-AUC ranks the raw scores, so it takes the unthresholded values
roc_auc = roc_auc_score(y_true=y_test, y_score=prediction_values)

print(
    f"Accuracy: {accuracy}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"F1 Score: {f1}",
    f"ROC-AUC: {roc_auc}",
    sep="\n",
)


Accuracy: 0.9608869725900832
Precision: 0.9653860094237042
Recall: 0.9438341601700921
F1 Score: 0.9544884429313744
ROC-AUC: 0.9944481163793969
