In [44]:
import os  # For manipulating filepath names

import boto3
import numpy as np
import sagemaker
from sagemaker import get_execution_role
from sagemaker.feature_store.feature_group import FeatureGroup
from sagemaker.session import Session


# Default S3 bucket of the SageMaker session, and the key prefix used for the
# S3 paths built in later cells.
bucket = sagemaker.Session().default_bucket()
prefix = 'sagemaker/DEMO-linlearn-dm'

# Define IAM role
role = get_execution_role()

# Import the data from SageMaker Feature Store

In [45]:
# Resolve the region from the default boto3 configuration and create a session
# pinned to that region.
region = boto3.Session().region_name
boto_session = boto3.Session(region_name=region)

# One client for the SageMaker API and one for the Feature Store runtime API.
sagemaker_client = boto_session.client('sagemaker', region_name=region)
featurestore_runtime = boto_session.client('sagemaker-featurestore-runtime', region_name=region)

# SageMaker session that bundles the boto session with both clients.
feature_store_session = Session(
    sagemaker_featurestore_runtime_client=featurestore_runtime,
    sagemaker_client=sagemaker_client,
    boto_session=boto_session,
)

feature_group_name = "FG-dataflow-8defd904"  # replace with your feature group name
feature_group = FeatureGroup(sagemaker_session=feature_store_session, name=feature_group_name)

In [46]:
# Get an Athena query handle for the feature group and build a statement that
# selects every column of every row from its table.
fs_query = feature_group.athena_query()
fs_table = fs_query.table_name
query_string = f'SELECT * FROM "{fs_table}"'
print(f'Running {query_string}')

Running SELECT * FROM "fg_dataflow_8defd904_1729124938"


In [47]:
# Run the Athena query with its output location under the notebook's S3 prefix,
# wait for it to finish, then load the result into a pandas DataFrame.
query_results_uri = f's3://{bucket}/{prefix}/fs_query_results/'
fs_query.run(output_location=query_results_uri, query_string=query_string)
fs_query.wait()
model_data = fs_query.as_dataframe()

INFO:sagemaker:Query fe40e5b0-191b-4c05-bbdc-9acdcc253caf is being executed.
INFO:sagemaker:Query fe40e5b0-191b-4c05-bbdc-9acdcc253caf successfully executed.


In [48]:
# Display the query result (pandas abbreviates the middle rows of a long frame).
model_data

Unnamed: 0,sensor_id,vehicle_count,avg_speed,incident,weather_condition_fog,weather_condition_rain,weather_condition_clear,weather_condition_snow,timestamp_month,write_time,api_invocation_time,is_deleted
0,5,4.015968,11.034325,1,0.0,0.0,1.0,0.0,1,2024-10-17 00:38:15.888,2024-10-17 00:33:19.000,False
1,3,4.015968,10.088235,1,0.0,0.0,1.0,0.0,1,2024-10-17 00:38:15.888,2024-10-17 00:33:19.000,False
2,5,5.800843,9.139949,1,0.0,1.0,0.0,0.0,1,2024-10-17 00:38:15.888,2024-10-17 00:33:19.000,False
3,5,4.685296,7.285128,1,0.0,0.0,0.0,1.0,1,2024-10-17 00:38:15.888,2024-10-17 00:33:19.000,False
4,1,2.454203,8.131718,1,1.0,0.0,0.0,0.0,1,2024-10-17 00:38:15.888,2024-10-17 00:33:19.000,False
...,...,...,...,...,...,...,...,...,...,...,...,...
692,4,5.577733,8.837058,0,0.0,0.0,1.0,0.0,1,2024-10-17 00:38:15.942,2024-10-17 00:33:21.000,False
693,1,4.462187,10.360192,0,0.0,1.0,0.0,0.0,1,2024-10-17 00:38:15.942,2024-10-17 00:33:21.000,False
694,5,3.346640,8.290972,0,1.0,0.0,0.0,0.0,1,2024-10-17 00:38:15.942,2024-10-17 00:33:21.000,False
695,3,2.900421,7.090557,0,0.0,0.0,0.0,1.0,1,2024-10-17 00:38:15.942,2024-10-17 00:33:21.000,False


In [49]:
# Remove the columns that are not wanted as model inputs.
# errors='ignore' keeps this cell re-runnable: `model_data` is rebound here, so a
# second run would otherwise raise KeyError because the columns are already gone.
model_data = model_data.drop(
    columns=['write_time', 'api_invocation_time', 'is_deleted'],
    errors='ignore',
)

In [50]:
# Display the frame again to confirm the three dropped columns are gone.
model_data

Unnamed: 0,sensor_id,vehicle_count,avg_speed,incident,weather_condition_fog,weather_condition_rain,weather_condition_clear,weather_condition_snow,timestamp_month
0,5,4.015968,11.034325,1,0.0,0.0,1.0,0.0,1
1,3,4.015968,10.088235,1,0.0,0.0,1.0,0.0,1
2,5,5.800843,9.139949,1,0.0,1.0,0.0,0.0,1
3,5,4.685296,7.285128,1,0.0,0.0,0.0,1.0,1
4,1,2.454203,8.131718,1,1.0,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...
692,4,5.577733,8.837058,0,0.0,0.0,1.0,0.0,1
693,1,4.462187,10.360192,0,0.0,1.0,0.0,0.0,1
694,5,3.346640,8.290972,0,1.0,0.0,0.0,0.0,1
695,3,2.900421,7.090557,0,0.0,0.0,0.0,1.0,1


In [51]:
# Preview the first rows of the frame.
model_data.head()

Unnamed: 0,sensor_id,vehicle_count,avg_speed,incident,weather_condition_fog,weather_condition_rain,weather_condition_clear,weather_condition_snow,timestamp_month
0,5,4.015968,11.034325,1,0.0,0.0,1.0,0.0,1
1,3,4.015968,10.088235,1,0.0,0.0,1.0,0.0,1
2,5,5.800843,9.139949,1,0.0,1.0,0.0,0.0,1
3,5,4.685296,7.285128,1,0.0,0.0,0.0,1.0,1
4,1,2.454203,8.131718,1,1.0,0.0,0.0,0.0,1


In [52]:
# Prepare data for SageMaker's Linear Learner
# Amazon SageMaker's Linear Learner container expects data in CSV data format.
# Note that the first column must be the target variable and the CSV should not include headers.

# Move the target (i.e., incident) to the first column. Plain column selection is
# used rather than reindex() so that a misspelled or missing column raises KeyError
# instead of silently showing up as an all-NaN column in the training data.
linear_learner_columns = [
    'incident',
    'sensor_id',
    'vehicle_count',
    'avg_speed',
    'weather_condition_fog',
    'weather_condition_rain',
    'weather_condition_clear',
    'weather_condition_snow',
    'timestamp_month',
]
model_data2 = model_data[linear_learner_columns]
model_data2

Unnamed: 0,incident,sensor_id,vehicle_count,avg_speed,weather_condition_fog,weather_condition_rain,weather_condition_clear,weather_condition_snow,timestamp_month
0,1,5,4.015968,11.034325,0.0,0.0,1.0,0.0,1
1,1,3,4.015968,10.088235,0.0,0.0,1.0,0.0,1
2,1,5,5.800843,9.139949,0.0,1.0,0.0,0.0,1
3,1,5,4.685296,7.285128,0.0,0.0,0.0,1.0,1
4,1,1,2.454203,8.131718,1.0,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...
692,0,4,5.577733,8.837058,0.0,0.0,1.0,0.0,1
693,0,1,4.462187,10.360192,0.0,1.0,0.0,0.0,1
694,0,5,3.346640,8.290972,1.0,0.0,0.0,0.0,1
695,0,3,2.900421,7.090557,0.0,0.0,0.0,1.0,1


# For best results, ensure your data is shuffled before training. Training with unshuffled data may cause training to fail.



In [53]:
# Randomize/shuffle the rows so the incident=1 examples are not grouped together
# (in the unshuffled frame the first rows shown are all incident=1 and the last
# rows shown are all incident=0).
# A fixed random_state makes the shuffle, and the positional train/validation/test
# split taken from it, come out the same on every Restart & Run All.
model_data2 = model_data2.sample(frac=1, random_state=42)

In [54]:
# Display the frame to check the row order after shuffling.
model_data2

Unnamed: 0,incident,sensor_id,vehicle_count,avg_speed,weather_condition_fog,weather_condition_rain,weather_condition_clear,weather_condition_snow,timestamp_month
104,0,3,4.239077,9.240443,0.0,0.0,1.0,0.0,1
647,0,5,4.015968,8.721206,1.0,0.0,0.0,0.0,1
52,0,5,5.131515,9.465079,1.0,0.0,0.0,0.0,1
420,0,4,4.015968,9.596323,1.0,0.0,0.0,0.0,1
450,0,2,4.908405,9.162554,0.0,0.0,0.0,1.0,1
...,...,...,...,...,...,...,...,...,...
188,0,3,4.462187,9.066290,0.0,0.0,0.0,1.0,1
197,0,2,5.131515,8.926531,0.0,0.0,0.0,1.0,1
12,1,1,4.685296,9.123541,0.0,0.0,0.0,1.0,1
545,0,3,3.346640,7.768443,0.0,0.0,1.0,0.0,1


In [55]:
# Replace the column names with their integer positions (0, 1, 2, ...).
column_count = model_data2.shape[1]
model_data2.columns = range(column_count)

model_data2

Unnamed: 0,0,1,2,3,4,5,6,7,8
104,0,3,4.239077,9.240443,0.0,0.0,1.0,0.0,1
647,0,5,4.015968,8.721206,1.0,0.0,0.0,0.0,1
52,0,5,5.131515,9.465079,1.0,0.0,0.0,0.0,1
420,0,4,4.015968,9.596323,1.0,0.0,0.0,0.0,1
450,0,2,4.908405,9.162554,0.0,0.0,0.0,1.0,1
...,...,...,...,...,...,...,...,...,...
188,0,3,4.462187,9.066290,0.0,0.0,0.0,1.0,1
197,0,2,5.131515,8.926531,0.0,0.0,0.0,1.0,1
12,1,1,4.685296,9.123541,0.0,0.0,0.0,1.0,1
545,0,3,3.346640,7.768443,0.0,0.0,1.0,0.0,1


# Separate data into train/validation/test data split

In [56]:
# The model is trained on the first 70% of the rows, the next 20% are used to
# estimate its accuracy, and the final 10% are held back as a test set for later.

# Row positions where one split ends and the next begins (truncated to int).
row_count = len(model_data2)
train_end = int(0.7 * row_count)
val_end = int(0.9 * row_count)

# Slice the frame by position into three consecutive, non-overlapping splits.
train_data, validation_data, test_data = (
    model_data2.iloc[:train_end],
    model_data2.iloc[train_end:val_end],
    model_data2.iloc[val_end:],
)

In [57]:
# Number of rows in the training split.
len(train_data)

487

In [58]:
# Preview the first rows of the training split.
train_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
104,0,3,4.239077,9.240443,0.0,0.0,1.0,0.0,1
647,0,5,4.015968,8.721206,1.0,0.0,0.0,0.0,1
52,0,5,5.131515,9.465079,1.0,0.0,0.0,0.0,1
420,0,4,4.015968,9.596323,1.0,0.0,0.0,0.0,1
450,0,2,4.908405,9.162554,0.0,0.0,0.0,1.0,1


In [59]:
# Number of rows in the validation split.
len(validation_data)

140

In [60]:
# Number of rows in the held-back test split.
len(test_data)

70

In [61]:
# Write each split to a local CSV file with neither a header row nor an index column.
for split_frame, csv_name in (
    (train_data, 'lil-train.csv'),
    (validation_data, 'lil-validation.csv'),
    (test_data, 'lil-test.csv'),
):
    split_frame.to_csv(csv_name, header=False, index=False)

In [62]:
# Show the name of the S3 bucket used for the uploads and job output.
bucket

'sagemaker-us-east-1-241215432415'

# Copy data to S3 for SageMaker to access

In [63]:
# Upload each local CSV to its own folder under the S3 prefix.
# The test object must come from 'lil-test.csv': uploading the validation file
# there would make the test channel a duplicate of the validation data.
# Keys are joined with '/' directly because S3 keys always use forward slashes,
# whereas os.path.join follows the local operating system's separator.
s3_bucket = boto3.Session().resource('s3').Bucket(bucket)
for key_suffix, local_csv in (
    ('train/train.csv', 'lil-train.csv'),
    ('validation/validation.csv', 'lil-validation.csv'),
    ('test/test.csv', 'lil-test.csv'),
):
    s3_bucket.Object(f'{prefix}/{key_suffix}').upload_file(local_csv)

# Train the model using Linear Learner

In [64]:
# Look up the ECR container image for Amazon SageMaker's implementation of
# Linear Learner in the region of the default boto3 session.

current_region = boto3.Session().region_name
container = sagemaker.image_uris.retrieve(framework="linear-learner", region=current_region)

INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: 1.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


In [65]:
# Because we're training with the CSV file format, create TrainingInput objects that
# point at each split's folder in S3 and declare the content type as CSV.
# Every URI ends with '/' so it selects only that folder: a bare '.../train' key
# prefix would also match sibling keys such as '.../train-old/...'.
s3_input_train = sagemaker.inputs.TrainingInput(s3_data=f's3://{bucket}/{prefix}/train/', content_type='text/csv')
s3_input_validation = sagemaker.inputs.TrainingInput(s3_data=f's3://{bucket}/{prefix}/validation/', content_type='text/csv')
s3_input_test = sagemaker.inputs.TrainingInput(s3_data=f's3://{bucket}/{prefix}/test/', content_type='text/csv')

In [66]:
# Display the test channel object (only its default repr is shown).
s3_input_test

<sagemaker.inputs.TrainingInput at 0x7f316aaf3850>

In [None]:
sess = sagemaker.Session()

# S3 location passed to the estimator as its output path.
# NOTE: the variable keeps its original spelling ("artificat") so that any other
# cell referring to it keeps working; only the printed message is corrected.
artificat_output_location = 's3://{}/{}/output'.format(bucket, prefix)
print("The model artifact will be uploaded to: ", artificat_output_location)

# Estimator for the Linear Learner container on a single ml.m4.xlarge instance.
ll = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type='ml.m4.xlarge',
    output_path=artificat_output_location,
    sagemaker_session=sess,
)

# Binary classifier trained with SGD for 30 epochs.
ll.set_hyperparameters(
    predictor_type="binary_classifier",
    optimizer="sgd",
    learning_rate=0.01,
    mini_batch_size=50,
    epochs=30,
)

# Start the training job with one channel per data split.
training_channels = {
    'train': s3_input_train,
    'validation': s3_input_validation,
    'test': s3_input_test,
}
ll.fit(training_channels)

# Tuning

In [68]:
from sagemaker.tuner import (
    CategoricalParameter,
    ContinuousParameter,
    HyperparameterTuner,
    IntegerParameter,
)

# Search space for the tuning job: the learning rate is taken from a continuous
# interval and the mini-batch size from an integer interval.
hyperparameter_ranges = dict(
    learning_rate=ContinuousParameter(0.00001, 1.0),
    mini_batch_size=IntegerParameter(16, 64),
)

In [69]:
# Name of the metric handed to the tuner as its objective.
# NOTE(review): accuracy can look high when the labels are imbalanced — confirm
# this is the right objective for the incident data.
objective_metric_name = "validation:binary_classification_accuracy"

In [70]:
# Configure a tuning job around the `ll` estimator: at most 20 training jobs in
# total, with no more than 3 running at the same time.
tuner = HyperparameterTuner(
    estimator=ll,
    objective_metric_name=objective_metric_name,
    hyperparameter_ranges=hyperparameter_ranges,
    max_parallel_jobs=3,
    max_jobs=20,
)

In [71]:
# Launch the tuning job on the train and validation channels.
tuning_channels = {'train': s3_input_train, 'validation': s3_input_validation}
tuner.fit(tuning_channels)

INFO:sagemaker:Creating hyperparameter tuning job with name: linear-learner-241018-2134


...........................................................................................................!


In [72]:
# Keep the name of the most recent tuning job so it can be looked up again by name.
tuning_job_name = tuner.latest_tuning_job.job_name
tuning_job_name

'linear-learner-241018-2134'

In [None]:
# Re-attach to the tuning job by name and read the objective metric value
# recorded for its best training job.
tuner = HyperparameterTuner.attach(tuning_job_name=tuning_job_name)
best_training_job = tuner.describe()["BestTrainingJob"]
best_training_job["FinalHyperParameterTuningJobObjectiveMetric"]["Value"]