In [164]:
import sagemaker
from sagemaker import get_execution_role
import boto3
import os
from sagemaker.session import Session
from sagemaker.feature_store.feature_group import FeatureGroup  # For manipulating filepath names
import numpy as np


# S3 key prefix under which this notebook stores its files
prefix = "sagemaker/DEMO-linlearn-dm"

# Default S3 bucket of the current SageMaker session
bucket = sagemaker.Session().default_bucket()

# IAM execution role used by this notebook
role = get_execution_role()

# Import the data from SageMaker Feature Store

In [165]:
# Look up the region once and build a boto3 session bound to it
region = boto3.Session().region_name
boto_session = boto3.Session(region_name=region)

# SageMaker client and Feature Store runtime client created from that session
sagemaker_client = boto_session.client(
    service_name="sagemaker",
    region_name=region,
)
featurestore_runtime = boto_session.client(
    service_name="sagemaker-featurestore-runtime",
    region_name=region,
)

# SageMaker session wired to the two clients above
feature_store_session = Session(
    sagemaker_featurestore_runtime_client=featurestore_runtime,
    sagemaker_client=sagemaker_client,
    boto_session=boto_session,
)

# Replace with the name of your own feature group
feature_group_name = "FG-dataflow-8defd904"
feature_group = FeatureGroup(
    name=feature_group_name,
    sagemaker_session=feature_store_session,
)

In [166]:
# Compose a SELECT over the Athena table associated with the feature group
fs_query = feature_group.athena_query()
fs_table = fs_query.table_name
query_string = f'SELECT * FROM "{fs_table}"'
print(f'Running {query_string}')

Running SELECT * FROM "fg_dataflow_8defd904_1729124938"


In [167]:
# Execute the query through Athena, wait for it to finish, then read the
# result into a pandas DataFrame. Query results are written under the
# notebook's S3 prefix.
fs_query.run(
    query_string=query_string,
    output_location=f's3://{bucket}/{prefix}/fs_query_results/',
)
fs_query.wait()
model_data = fs_query.as_dataframe()

INFO:sagemaker:Query 39a48fdf-d682-4cdc-bb4a-782f76177c00 is being executed.
INFO:sagemaker:Query 39a48fdf-d682-4cdc-bb4a-782f76177c00 successfully executed.


In [168]:
# Inspect the DataFrame loaded from the Athena query result
model_data

Unnamed: 0,sensor_id,vehicle_count,avg_speed,incident,weather_condition_fog,weather_condition_rain,weather_condition_clear,weather_condition_snow,timestamp_month,write_time,api_invocation_time,is_deleted
0,2,4.685296,8.589093,0,1.0,0.0,0.0,0.0,1,2024-10-17 00:38:15.942,2024-10-17 00:33:19.000,False
1,4,4.239077,10.535598,0,0.0,0.0,0.0,1.0,1,2024-10-17 00:38:15.942,2024-10-17 00:33:19.000,False
2,2,3.569749,9.831642,0,0.0,0.0,1.0,0.0,1,2024-10-17 00:38:15.942,2024-10-17 00:33:19.000,False
3,3,4.908405,10.714162,0,0.0,1.0,0.0,0.0,1,2024-10-17 00:38:15.942,2024-10-17 00:33:19.000,False
4,1,4.239077,7.996590,0,0.0,0.0,1.0,0.0,1,2024-10-17 00:38:15.942,2024-10-17 00:33:19.000,False
...,...,...,...,...,...,...,...,...,...,...,...,...
692,5,4.685296,8.369228,1,0.0,0.0,0.0,1.0,1,2024-10-17 00:38:15.888,2024-10-17 00:33:20.000,False
693,1,6.023952,8.490530,1,1.0,0.0,0.0,0.0,1,2024-10-17 00:38:15.888,2024-10-17 00:33:20.000,False
694,1,3.792859,9.846547,1,0.0,1.0,0.0,0.0,1,2024-10-17 00:38:15.888,2024-10-17 00:33:20.000,False
695,1,3.123531,9.182528,1,0.0,1.0,0.0,0.0,1,2024-10-17 00:38:15.888,2024-10-17 00:33:20.000,False


In [169]:
# Drop the three columns that are not wanted for modelling
model_data = model_data.drop(columns=['write_time', 'api_invocation_time', 'is_deleted'])

In [170]:
# Confirm that write_time, api_invocation_time and is_deleted are gone
model_data

Unnamed: 0,sensor_id,vehicle_count,avg_speed,incident,weather_condition_fog,weather_condition_rain,weather_condition_clear,weather_condition_snow,timestamp_month
0,2,4.685296,8.589093,0,1.0,0.0,0.0,0.0,1
1,4,4.239077,10.535598,0,0.0,0.0,0.0,1.0,1
2,2,3.569749,9.831642,0,0.0,0.0,1.0,0.0,1
3,3,4.908405,10.714162,0,0.0,1.0,0.0,0.0,1
4,1,4.239077,7.996590,0,0.0,0.0,1.0,0.0,1
...,...,...,...,...,...,...,...,...,...
692,5,4.685296,8.369228,1,0.0,0.0,0.0,1.0,1
693,1,6.023952,8.490530,1,1.0,0.0,0.0,0.0,1
694,1,3.792859,9.846547,1,0.0,1.0,0.0,0.0,1
695,1,3.123531,9.182528,1,0.0,1.0,0.0,0.0,1


In [171]:
# Preview the first rows of the trimmed frame (head() shows five by default)
model_data.head()

Unnamed: 0,sensor_id,vehicle_count,avg_speed,incident,weather_condition_fog,weather_condition_rain,weather_condition_clear,weather_condition_snow,timestamp_month
0,2,4.685296,8.589093,0,1.0,0.0,0.0,0.0,1
1,4,4.239077,10.535598,0,0.0,0.0,0.0,1.0,1
2,2,3.569749,9.831642,0,0.0,0.0,1.0,0.0,1
3,3,4.908405,10.714162,0,0.0,1.0,0.0,0.0,1
4,1,4.239077,7.99659,0,0.0,0.0,1.0,0.0,1


In [172]:
# Prepare the data for SageMaker's Linear Learner.
# The Linear Learner container expects CSV input in which the first column
# is the target variable and there is no header row.

# Reorder the columns so that the target ('incident') comes first.
model_data2 = model_data.reindex(
    columns=[
        "incident",
        "sensor_id",
        "vehicle_count",
        "avg_speed",
        "weather_condition_fog",
        "weather_condition_rain",
        "weather_condition_clear",
        "weather_condition_snow",
        "timestamp_month",
    ]
)
model_data2

Unnamed: 0,incident,sensor_id,vehicle_count,avg_speed,weather_condition_fog,weather_condition_rain,weather_condition_clear,weather_condition_snow,timestamp_month
0,0,2,4.685296,8.589093,1.0,0.0,0.0,0.0,1
1,0,4,4.239077,10.535598,0.0,0.0,0.0,1.0,1
2,0,2,3.569749,9.831642,0.0,0.0,1.0,0.0,1
3,0,3,4.908405,10.714162,0.0,1.0,0.0,0.0,1
4,0,1,4.239077,7.996590,0.0,0.0,1.0,0.0,1
...,...,...,...,...,...,...,...,...,...
692,1,5,4.685296,8.369228,0.0,0.0,0.0,1.0,1
693,1,1,6.023952,8.490530,1.0,0.0,0.0,0.0,1
694,1,1,3.792859,9.846547,0.0,1.0,0.0,0.0,1
695,1,1,3.123531,9.182528,0.0,1.0,0.0,0.0,1


# For best results, ensure your data is shuffled before training. Training with unshuffled data may cause training to fail.



In [173]:
# Randomize/shuffle the rows so that all the 1s aren't grouped at the end.
# A fixed random_state makes the shuffle, and therefore the train/validation/test
# split taken from it by position, reproducible across kernel restarts.
model_data2 = model_data2.sample(frac=1, random_state=42)

In [174]:
# Inspect the shuffled frame; the index column now shows the original row labels out of order
model_data2

Unnamed: 0,incident,sensor_id,vehicle_count,avg_speed,weather_condition_fog,weather_condition_rain,weather_condition_clear,weather_condition_snow,timestamp_month
636,0,2,3.792859,10.106101,0.0,1.0,0.0,0.0,1
377,0,4,3.792859,9.070409,1.0,0.0,0.0,0.0,1
37,0,1,4.685296,9.699326,0.0,1.0,0.0,0.0,1
569,0,2,4.685296,8.506998,0.0,0.0,1.0,0.0,1
322,0,2,6.470171,7.248965,1.0,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...
405,0,5,5.131515,8.767406,1.0,0.0,0.0,0.0,1
360,0,4,3.346640,8.701550,0.0,1.0,0.0,0.0,1
454,0,5,5.354624,8.057039,0.0,1.0,0.0,0.0,1
136,0,4,5.354624,9.989829,0.0,1.0,0.0,0.0,1


In [175]:
# Swap the column names for positional integers (0 .. n_columns - 1)
model_data2.columns = range(len(model_data2.columns))

model_data2

Unnamed: 0,0,1,2,3,4,5,6,7,8
636,0,2,3.792859,10.106101,0.0,1.0,0.0,0.0,1
377,0,4,3.792859,9.070409,1.0,0.0,0.0,0.0,1
37,0,1,4.685296,9.699326,0.0,1.0,0.0,0.0,1
569,0,2,4.685296,8.506998,0.0,0.0,1.0,0.0,1
322,0,2,6.470171,7.248965,1.0,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...
405,0,5,5.131515,8.767406,1.0,0.0,0.0,0.0,1
360,0,4,3.346640,8.701550,0.0,1.0,0.0,0.0,1
454,0,5,5.354624,8.057039,0.0,1.0,0.0,0.0,1
136,0,4,5.354624,9.989829,0.0,1.0,0.0,0.0,1


# Separate data into train/validation/test data split

In [176]:
# 70% of the rows are used for training, the next 20% for validation (to
# estimate accuracy), and the last 10% are held back as a final test set
# that is used later on.

# Row positions at which the training and validation slices end
train_end, val_end = (int(fraction * len(model_data2)) for fraction in (0.7, 0.9))

# Slice the frame by position into the three consecutive splits
train_data = model_data2.iloc[:train_end]
validation_data = model_data2.iloc[train_end:val_end]
test_data = model_data2.iloc[val_end:]

In [177]:
# Number of rows in the training split
len(train_data)

487

In [178]:
# Preview the first rows of the training split
train_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
636,0,2,3.792859,10.106101,0.0,1.0,0.0,0.0,1
377,0,4,3.792859,9.070409,1.0,0.0,0.0,0.0,1
37,0,1,4.685296,9.699326,0.0,1.0,0.0,0.0,1
569,0,2,4.685296,8.506998,0.0,0.0,1.0,0.0,1
322,0,2,6.470171,7.248965,1.0,0.0,0.0,0.0,1


In [179]:
# Number of rows in the validation split
len(validation_data)

140

In [180]:
# Number of rows in the held-back test split
len(test_data)

70

In [181]:
# Write each split to a local CSV file without the index column or a header row
for split_frame, csv_name in (
    (train_data, 'lil-train.csv'),
    (validation_data, 'lil-validation.csv'),
    (test_data, 'lil-test.csv'),
):
    split_frame.to_csv(csv_name, index=False, header=False)

In [None]:
# Show the name of the default S3 bucket that the files are uploaded to
bucket

# Copy data to S3 for SageMaker to access

In [183]:
# Upload each local split file to its own folder under the notebook's S3 prefix.
# The test folder must receive 'lil-test.csv'; uploading the validation file
# there would make the test channel a copy of the validation data.
# Keys are built with '/' explicitly because S3 keys always use forward
# slashes, whereas os.path.join uses the local OS separator.
s3_bucket = boto3.Session().resource('s3').Bucket(bucket)
for channel, local_file in (
    ('train', 'lil-train.csv'),
    ('validation', 'lil-validation.csv'),
    ('test', 'lil-test.csv'),
):
    s3_bucket.Object(f'{prefix}/{channel}/{channel}.csv').upload_file(local_file)

# Train the model using Linear Learner

In [184]:
# Look up the ECR container image of Amazon SageMaker's Linear Learner
# implementation for the region of the current boto3 session.
container = sagemaker.image_uris.retrieve(
    framework="linear-learner",
    region=boto3.Session().region_name,
)

INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: 1.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


In [185]:
# Because we train from CSV files, each channel is described by a TrainingInput
# that points to the channel's location in S3 and declares the content type as CSV.
# Every location ends with '/' so that, as an S3 key prefix, it matches only the
# objects inside that folder and not sibling keys that merely start with the same
# characters (the train location previously lacked the trailing slash).
s3_input_train = sagemaker.inputs.TrainingInput(s3_data=f's3://{bucket}/{prefix}/train/', content_type='text/csv')
s3_input_validation = sagemaker.inputs.TrainingInput(s3_data=f's3://{bucket}/{prefix}/validation/', content_type='text/csv')
s3_input_test = sagemaker.inputs.TrainingInput(s3_data=f's3://{bucket}/{prefix}/test/', content_type='text/csv')

In [186]:
# Confirm the test-channel input was created (only the object's repr is displayed)
s3_input_test

<sagemaker.inputs.TrainingInput at 0x7f07ea010a10>

In [None]:
sess = sagemaker.Session()

# S3 location passed to the estimator as its output path
artificat_output_location = f's3://{bucket}/{prefix}/output'
print("The model artifact will be loaded to: ", artificat_output_location)

# Generic Estimator that runs the Linear Learner container on a single instance
ll = sagemaker.estimator.Estimator(
    container,
    role,
    instance_count=1,
    instance_type='ml.m4.xlarge',
    output_path=artificat_output_location,
    sagemaker_session=sess,
)

# Configure Linear Learner as a binary classifier trained with SGD
ll.set_hyperparameters(
    optimizer="sgd",
    learning_rate=0.01,
    mini_batch_size=50,
    epochs=30,
    predictor_type="binary_classifier",
)

# Start the training job with one input channel per data split
ll.fit(
    {
        'train': s3_input_train,
        'validation': s3_input_validation,
        'test': s3_input_test,
    }
)