In [1]:
import pandas as pd
import numpy as np

In [12]:
df = pd.read_csv("data/combined_csv_v1.csv")
df.head(3)

Unnamed: 0,target,Distance,Month_2,Month_3,Month_4,Month_5,Month_6,Month_7,Month_8,Month_9,...,Dest_DEN,Dest_DFW,Dest_IAH,Dest_LAX,Dest_ORD,Dest_PHX,Dest_SFO,DepHourofDay_Early Morning,DepHourofDay_Afternoon,DepHourofDay_Evening
0,0.0,1464.0,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,True,False,False
1,1.0,1464.0,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,True,False,False
2,0.0,1464.0,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,True,False,False


In [13]:
print(df.columns[df.dtypes == 'object'])


Index([], dtype='object')


In [14]:
# Convert boolean columns to 0 and 1
for col in df.columns:
    if df[col].dtype == 'bool':
        df[col] = df[col].astype(float)  # Converting True to 1 and False to 0


In [16]:
from sklearn.model_selection import train_test_split

train, validation = train_test_split(df, test_size=0.3)
validation, test = train_test_split(validation, test_size = 0.5)

In [17]:
train.to_csv("data/q1a/train.csv", index = False, header = False)
validation.to_csv("data/q1a/validation.csv", index = False, header = False)
test.to_csv("data/q1a/test.csv", index = False, header = False)

In [37]:
import sagemaker
import boto3
from sagemaker import image_uris

sess = sagemaker.Session()
bucket = sess.default_bucket()


region = boto3.Session().region_name
linear_learner_container = image_uris.retrieve("linear-learner", region)

INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: 1.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


In [38]:
prefix = "flights_delay/q1a"
train_path = sess.upload_data(path="./data/q1a/train.csv", key_prefix=prefix + "/input/training")
valid_path = sess.upload_data(path="./data/q1a/validation.csv", key_prefix=prefix + "/input/validation")
test_path = sess.upload_data(path="./data/q1a/test.csv", key_prefix=prefix + "/input/test")

In [39]:
from sagemaker.estimator import Estimator

linear_estimator = Estimator(
    linear_learner_container,
    role=sagemaker.get_execution_role(),
    instance_count=1,
    instance_type='ml.m4.xlarge',
    output_path='s3://{}/{}/output'.format(bucket,prefix)
)
# setting mini_batch_size to 100 since my dataset is large
linear_estimator.set_hyperparameters(predictor_type='binary_classifier', mini_batch_size=100)

In [40]:
training_data_channel = sagemaker.TrainingInput(s3_data=train_path,content_type='text/csv')
validation_data_channel = sagemaker.TrainingInput(s3_data=valid_path,content_type='text/csv')

In [41]:
linear_estimator.fit({'train': training_data_channel,'validation': validation_data_channel})


INFO:sagemaker:Creating training-job with name: linear-learner-2024-11-01-11-39-44-121


2024-11-01 11:39:45 Starting - Starting the training job...
2024-11-01 11:40:10 Starting - Preparing the instances for training......
2024-11-01 11:40:58 Downloading - Downloading input data...
2024-11-01 11:41:23 Downloading - Downloading the training image.........
2024-11-01 11:42:55 Training - Training image download completed. Training in progress.[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[11/01/2024 11:43:10 INFO 140556695951168] Reading default configuration from /opt/amazon/lib/python3.8/site-packages/algorithm/resources/default-input.json: {'mini_batch_size': '1000', 'epochs': '15', 'feature_dim': 'auto', 'use_bias': 'true', 'binary_classifier_model_selection_criteria': 'accuracy', 'f_beta': '1.0', 'target_recall': '0.8', 'target_precision': '0.8', 'num_models': 'auto', 'num_calibration_samples': '10000000', 'init_method': 'uniform', 'init_scale': '0.07', 'init_sigma': '0.01', 'init_bias': '0.0', 'o

In [42]:
transformer = linear_estimator.transformer(instance_count=1, 
                                           instance_type="ml.m4.xlarge", 
                                           assemble_with="Line", 
                                           output_path=f"s3://{bucket}/{prefix}/batch_output")

INFO:sagemaker:Creating model with name: linear-learner-2024-11-01-12-10-25-196


In [43]:
test_df = pd.read_csv("data/q1a/test.csv")
test_batch = test_df[test_df.columns[1:]]

In [46]:
test_batch.to_csv("data/batch/batch_input_q1a.csv", index = False, header = False)


In [47]:
batch_test_path = sess.upload_data(path="data/batch/batch_input_q1a.csv", key_prefix=prefix + "/batch_input")


In [48]:
transformer.transform(batch_test_path, content_type = "text/csv", split_type="Line")
transformer.wait()

INFO:sagemaker:Creating transform job with name: linear-learner-2024-11-01-12-14-16-229


...............................................[34mDocker entrypoint called with argument(s): serve[0m
[34mRunning default environment configuration script[0m
[34m[11/01/2024 12:22:09 INFO 139668160649024] Memory profiler is not enabled by the environment variable ENABLE_PROFILER.[0m
  if num_device is 1 and 'dist' not in kvstore:[0m
  if cons['type'] is 'ineq':[0m
  if len(self.X_min) is not 0:[0m
[34m[11/01/2024 12:22:13 INFO 139668160649024] loaded entry point class algorithm.serve.server_config:config_api[0m
[34m[11/01/2024 12:22:13 INFO 139668160649024] loading entry points[0m
[34m[11/01/2024 12:22:13 INFO 139668160649024] loaded request iterator application/json[0m
[34m[11/01/2024 12:22:13 INFO 139668160649024] loaded request iterator application/jsonlines[0m
[34m[11/01/2024 12:22:13 INFO 139668160649024] loaded request iterator application/x-recordio-protobuf[0m
[34m[11/01/2024 12:22:13 INFO 139668160649024] loaded request iterator text/csv[0m
[34m[11/01/20

In [52]:
print(prefix)

flights_delay/q1a


In [54]:
import io

y_file = boto3.client("s3").get_object(Bucket = bucket, Key = "flights_delay/q1a/batch_output/batch_input_q1a.csv.out")
y_pred = pd.read_csv(io.BytesIO(y_file["Body"].read()), header = None, names = ["Predicted"])

In [51]:
import boto3

s3 = boto3.client("s3")
response = s3.list_objects_v2(Bucket=bucket, Prefix=prefix)
if 'Contents' in response:
    for obj in response['Contents']:
        print(obj['Key'])
else:
    print("No objects found in the specified prefix.")


flights_delay/q1a/batch_input/batch_input_q1a.csv
flights_delay/q1a/batch_output/batch_input_q1a.csv.out
flights_delay/q1a/input/test/test.csv
flights_delay/q1a/input/training/train.csv
flights_delay/q1a/input/validation/validation.csv
flights_delay/q1a/output/linear-learner-2024-11-01-11-22-43-218/debug-output/training_job_end.ts
flights_delay/q1a/output/linear-learner-2024-11-01-11-22-43-218/profiler-output/framework/training_job_end.ts
flights_delay/q1a/output/linear-learner-2024-11-01-11-22-43-218/profiler-output/system/incremental/2024110111/1730460180.algo-1.json
flights_delay/q1a/output/linear-learner-2024-11-01-11-22-43-218/profiler-output/system/incremental/2024110111/1730460240.algo-1.json
flights_delay/q1a/output/linear-learner-2024-11-01-11-22-43-218/profiler-output/system/incremental/2024110111/1730460300.algo-1.json
flights_delay/q1a/output/linear-learner-2024-11-01-11-22-43-218/profiler-output/system/training_job_end.ts
flights_delay/q1a/output/linear-learner-2024-11-01-

In [55]:
y_pred["target"] = y_pred.index
y_pred

Unnamed: 0,Predicted,target
"{""predicted_label"":0",score:0.254663109779357},"{""predicted_label"":0"
"{""predicted_label"":0",score:0.19239018857479},"{""predicted_label"":0"
"{""predicted_label"":0",score:0.34109279513359},"{""predicted_label"":0"
"{""predicted_label"":0",score:0.126033708453178},"{""predicted_label"":0"
"{""predicted_label"":0",score:0.233907371759414},"{""predicted_label"":0"
...,...,...
"{""predicted_label"":0",score:0.1641056984663},"{""predicted_label"":0"
"{""predicted_label"":0",score:0.40089264512062},"{""predicted_label"":0"
"{""predicted_label"":0",score:0.172371342778205},"{""predicted_label"":0"
"{""predicted_label"":0",score:0.224455416202545},"{""predicted_label"":0"


In [56]:
from sklearn.metrics import accuracy_score

predicted_values = y_pred['target'].apply(lambda x: 1 if x == 1 else 0)

known_labels = test_df.iloc[:, 0]
accuracy = accuracy_score(known_labels, predicted_values)
print("Accuracy:", accuracy)

Accuracy: 0.7743978054188118
