In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("data/combined_csv_v2.csv")
df.head(3)

Unnamed: 0,target,Distance,DepHourofDay,AWND_O,AWND_O.1,AWND_O.2,AWND_O.3,AWND_O.4,AWND_O.5,PRCP_O,...,Origin_SFO,Dest_CLT,Dest_DEN,Dest_DFW,Dest_IAH,Dest_LAX,Dest_ORD,Dest_PHX,Dest_SFO,is_holiday_True
0,0.0,1464.0,7,57,57,57,57,57,57,0,...,False,False,False,False,False,False,False,False,True,False
1,0.0,1464.0,7,49,49,49,49,49,49,0,...,False,False,False,False,False,False,False,False,True,False
2,0.0,1464.0,7,29,29,29,29,29,29,0,...,False,False,False,False,False,False,False,False,True,False


In [3]:
# Convert boolean columns to 0 and 1
for col in df.columns:
    if df[col].dtype == 'bool':
        df[col] = df[col].astype(float)  # Converting True to 1 and False to 0


In [4]:
# Convert boolean columns to 0 and 1
for col in df.columns:
    if df[col].dtype == 'int64':
        df[col] = df[col].astype(float)  # Converting True to 1 and False to 0


In [5]:
df.dtypes

target             float64
Distance           float64
DepHourofDay       float64
AWND_O             float64
AWND_O.1           float64
                    ...   
Dest_LAX           float64
Dest_ORD           float64
Dest_PHX           float64
Dest_SFO           float64
is_holiday_True    float64
Length: 126, dtype: object

In [6]:
from sklearn.model_selection import train_test_split

train, validation = train_test_split(df, test_size=0.3)
validation, test = train_test_split(validation, test_size = 0.5)

In [7]:
train.to_csv("data/q1b/train.csv", index = False, header = False)
validation.to_csv("data/q1b/validation.csv", index = False, header = False)
test.to_csv("data/q1b/test.csv", index = False, header = False)

In [8]:
import sagemaker
import boto3
from sagemaker import image_uris

sess = sagemaker.Session()
bucket = sess.default_bucket()


region = boto3.Session().region_name
linear_learner_container = image_uris.retrieve("linear-learner", region)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [9]:
prefix = "final_assignment/q1b"
train_path = sess.upload_data(path="./data/q1b/train.csv", key_prefix=prefix + "/input/training")
valid_path = sess.upload_data(path="./data/q1b/validation.csv", key_prefix=prefix + "/input/validation")
test_path = sess.upload_data(path="./data/q1b/test.csv", key_prefix=prefix + "/input/test")

In [10]:
from sagemaker.estimator import Estimator

linear_estimator = Estimator(
    linear_learner_container,
    role=sagemaker.get_execution_role(),
    instance_count=1,
    instance_type='ml.c5.4xlarge',
    output_path='s3://{}/{}/output'.format(bucket,prefix)
)
# setting mini_batch_size to 100 since my dataset is large
linear_estimator.set_hyperparameters(predictor_type='binary_classifier', mini_batch_size=100, epochs = 4)

In [11]:
training_data_channel = sagemaker.TrainingInput(s3_data=train_path,content_type='text/csv')
validation_data_channel = sagemaker.TrainingInput(s3_data=valid_path,content_type='text/csv')

In [12]:
linear_estimator.fit({'train': training_data_channel,'validation': validation_data_channel})

INFO:sagemaker:Creating training-job with name: linear-learner-2024-11-03-05-59-57-821


2024-11-03 05:59:58 Starting - Starting the training job...
2024-11-03 06:00:13 Starting - Preparing the instances for training...
2024-11-03 06:00:57 Downloading - Downloading the training image......
2024-11-03 06:01:48 Training - Training image download completed. Training in progress...[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[11/03/2024 06:02:14 INFO 140668318713664] Reading default configuration from /opt/amazon/lib/python3.8/site-packages/algorithm/resources/default-input.json: {'mini_batch_size': '1000', 'epochs': '15', 'feature_dim': 'auto', 'use_bias': 'true', 'binary_classifier_model_selection_criteria': 'accuracy', 'f_beta': '1.0', 'target_recall': '0.8', 'target_precision': '0.8', 'num_models': 'auto', 'num_calibration_samples': '10000000', 'init_method': 'uniform', 'init_scale': '0.07', 'init_sigma': '0.01', 'init_bias': '0.0', 'optimizer': 'auto', 'loss': 'auto', 'margin': '1.0', 'quantile': 

In [13]:
transformer = linear_estimator.transformer(instance_count=1, 
                                           instance_type="ml.c5.4xlarge", 
                                           assemble_with="Line", 
                                           output_path=f"s3://{bucket}/{prefix}/batch_output")

INFO:sagemaker:Creating model with name: linear-learner-2024-11-03-06-34-58-792


In [14]:
test_df = pd.read_csv("data/q1b/test.csv")
test_batch = test_df[test_df.columns[1:]]

In [16]:
test_batch.to_csv("data/batch/batch_input_q1b.csv", index = False, header = False)


In [17]:
batch_test_path = sess.upload_data(path="data/batch/batch_input_q1b.csv", key_prefix=prefix + "/batch_input")


In [18]:
transformer.transform(batch_test_path, content_type = "text/csv", split_type="Line")
transformer.wait()

INFO:sagemaker:Creating transform job with name: linear-learner-2024-11-03-06-36-29-968


...................................[34mDocker entrypoint called with argument(s): serve[0m
[34mRunning default environment configuration script[0m
[34m[11/03/2024 06:42:24 INFO 140059492329280] Memory profiler is not enabled by the environment variable ENABLE_PROFILER.[0m
  if num_device is 1 and 'dist' not in kvstore:[0m
  if cons['type'] is 'ineq':[0m
  if len(self.X_min) is not 0:[0m
[34m[11/03/2024 06:42:26 INFO 140059492329280] loaded entry point class algorithm.serve.server_config:config_api[0m
[34m[11/03/2024 06:42:26 INFO 140059492329280] loading entry points[0m
[34m[11/03/2024 06:42:26 INFO 140059492329280] loaded request iterator application/json[0m
[34m[11/03/2024 06:42:26 INFO 140059492329280] loaded request iterator application/jsonlines[0m
[34m[11/03/2024 06:42:26 INFO 140059492329280] loaded request iterator application/x-recordio-protobuf[0m
[34m[11/03/2024 06:42:26 INFO 140059492329280] loaded request iterator text/csv[0m
[34m[11/03/2024 06:42:26 

In [19]:
print(prefix)

final_assignment/q1b


In [21]:
import io

y_file = boto3.client("s3").get_object(Bucket = bucket, Key = "final_assignment/q1b/batch_output/batch_input_q1b.csv.out")
y_pred = pd.read_csv(io.BytesIO(y_file["Body"].read()), header = None, names = ["Predicted"])

In [20]:
import boto3

s3 = boto3.client("s3")
response = s3.list_objects_v2(Bucket=bucket, Prefix=prefix)
if 'Contents' in response:
    for obj in response['Contents']:
        print(obj['Key'])
else:
    print("No objects found in the specified prefix.")


final_assignment/q1b/batch_input/batch_input_q1b.csv
final_assignment/q1b/batch_output/batch_input_q1b.csv.out
final_assignment/q1b/input/test/test.csv
final_assignment/q1b/input/training/train.csv
final_assignment/q1b/input/validation/validation.csv
final_assignment/q1b/output/linear-learner-2024-11-03-05-18-28-404/debug-output/training_job_end.ts
final_assignment/q1b/output/linear-learner-2024-11-03-05-18-28-404/profiler-output/framework/training_job_end.ts
final_assignment/q1b/output/linear-learner-2024-11-03-05-18-28-404/profiler-output/system/incremental/2024110305/1730611140.algo-1.json
final_assignment/q1b/output/linear-learner-2024-11-03-05-18-28-404/profiler-output/system/incremental/2024110305/1730611200.algo-1.json
final_assignment/q1b/output/linear-learner-2024-11-03-05-18-28-404/profiler-output/system/incremental/2024110305/1730611260.algo-1.json
final_assignment/q1b/output/linear-learner-2024-11-03-05-18-28-404/profiler-output/system/incremental/2024110305/1730611320.algo

In [22]:
y_pred["target"] = y_pred.index
y_pred

Unnamed: 0,Predicted,target
"{""predicted_label"":0",score:0.468951910734176},"{""predicted_label"":0"
"{""predicted_label"":1",score:0.665442287921905},"{""predicted_label"":1"
"{""predicted_label"":1",score:0.557413637638092},"{""predicted_label"":1"
"{""predicted_label"":1",score:0.739693045616149},"{""predicted_label"":1"
"{""predicted_label"":0",score:0.32567611336708},"{""predicted_label"":0"
...,...,...
"{""predicted_label"":0",score:0.485297739505767},"{""predicted_label"":0"
"{""predicted_label"":1",score:0.600257813930511},"{""predicted_label"":1"
"{""predicted_label"":1",score:0.649752378463745},"{""predicted_label"":1"
"{""predicted_label"":1",score:0.657172977924346},"{""predicted_label"":1"


In [23]:
from sklearn.metrics import accuracy_score

predicted_values = y_pred['target'].apply(lambda x: 1 if x == 1 else 0)

known_labels = test_df.iloc[:, 0]
accuracy = accuracy_score(known_labels, predicted_values)
print("Accuracy:", accuracy)

Accuracy: 0.4992377675229637


In [24]:
from sklearn.metrics import classification_report


In [25]:
# Generate classification report
report = classification_report(known_labels, predicted_values)
print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

         0.0       0.50      1.00      0.67    193543
         1.0       0.00      0.00      0.00    194134

    accuracy                           0.50    387677
   macro avg       0.25      0.50      0.33    387677
weighted avg       0.25      0.50      0.33    387677



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
