In [1]:
import pandas as pd
import numpy as np
import boto3 #AWS SDK for Python; used to connect with AWS services like S3, SageMaker.
from sklearn.model_selection import train_test_split
import sagemaker 
from sagemaker import Session
import io # uploading data to AWS without saving locally.
import sagemaker.amazon.common as smac #used for converting NumPy arrays to Amazon RecordIO format for SageMaker training.
import os #Standard module for interacting with the operating system (file paths, environment variables, etc.).
from sagemaker.amazon.amazon_estimator import get_image_uri #Used to get the container image URI for built-in Amazon SageMaker algorithms.


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [2]:
df = pd.read_csv("student_scores.csv")
df.head()

Unnamed: 0,Hours,Scores
0,2.5,21
1,5.1,47
2,3.2,27
3,8.5,75
4,3.5,30


In [3]:
df.shape
(25, 2)


(25, 2)

In [4]:
x = df[["Hours"]]    # Independent variable (input feature)
y = df[["Scores"]]   # Dependent variable (target/output)


In [5]:
x.dtypes


Hours    float64
dtype: object

In [6]:
# Converts both x and y columns to float32 type.
# AWS SageMaker, as built-in algorithms (e.g., linear learner) often expect float32
x = x.astype("float32")
y = y.astype("float32")


In [7]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)


In [8]:
# Purpose: Reset the indices of each DataFrame after the split.
#drop=True avoids adding the old index as a new column.
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)


In [9]:
# y_train was originally a DataFrame with one column (e.g., "Scores").
# y_train.iloc[:, 0] selects all rows (:) and the first column (0) using .iloc, which is index-based.
#This converts the single-column DataFrame into a Series (1D vector), which is often required for model training functions that expect a vector, not a matrix.
y_train = y_train.iloc[:, 0]


In [10]:
# Convert the label column to vector (1D array)
y_test = y_test.iloc[:, 0]

# Let's create SageMaker session
sagemaker_session = sagemaker.Session()

# Define the S3 bucket name
bucket_name = "gaurobsagemaker"

# Define the prefix (folder in S3)
prefix = "linear-learner"

# Get the execution role (IAM role for SageMaker)
role = sagemaker.get_execution_role()


In [12]:
# Convert x_train into numpy array
X_train = np.array(X_train)

# Create the buffer
buf = io.BytesIO()

# Write numpy arrays to a dense tensor protobuf format compatible with SageMaker
smac.write_numpy_to_dense_tensor(buf, X_train, y_train)

# Reset buffer position to the start
buf.seek(0)


0

In [13]:
# Define the name of the file to be uploaded
key = "students.data"

# Code to upload the buffer object to S3
boto3.resource('s3').Bucket(bucket_name).Object(os.path.join(prefix, 'train', key)).upload_fileobj(buf)

# Construct the full S3 path of the uploaded training data
s3_train_data = f"s3://{bucket_name}/{prefix}/train/{key}"

# Confirmation message
print("Data uploaded:", s3_train_data)


Data uploaded: s3://gaurobsagemaker/linear-learner/train/students.data


In [14]:
# Convert X_test into numpy array
X_test = np.array(X_test)

# Create the buffer
buf = io.BytesIO()
smac.write_numpy_to_dense_tensor(buf, X_test, y_test)
buf.seek(0)

# Define the name of the file to be uploaded
key = "student-data-test"

# Code to upload the buffer content to S3
boto3.resource('s3').Bucket(bucket_name).Object(os.path.join(prefix, 'test', key)).upload_fileobj(buf)

# Construct the S3 path of the test dataset
s3_train_data = f"s3://{bucket_name}/{prefix}/test/{key}"

# Print confirmation
print("Data uploaded:", s3_train_data)


Data uploaded: s3://gaurobsagemaker/linear-learner/test/student-data-test


In [15]:
# Output location for the trained model artifacts in S3
output_location = f"s3://{bucket_name}/{prefix}/output"
output_location


's3://gaurobsagemaker/linear-learner/output'

In [16]:
# Retrieve the container image URI for the Linear Learner algorithm
container = sagemaker.image_uris.retrieve("linear-learner", boto3.Session().region_name)


In [21]:
# define the estimator
linear=sagemaker.estimator.Estimator(container,role,instance_count=1,instance_type="ml.c4.xlarge",
                                     output_path=output_location,sagemaker_session=sagemaker_session)

In [22]:
# setting up the hyperparameters
linear.set_hyperparameters(feature_dim=1,predictor_type="regressor",mini_batch_size=4,epochs=5,
                          num_models=32,loss="absolute_loss")

In [23]:
# fit the model
linear.fit({"train":s3_train_data})

INFO:sagemaker.telemetry.telemetry_logging:SageMaker Python SDK will collect telemetry to help us better understand our user's needs, diagnose issues, and deliver additional features.
To opt out of telemetry, please disable via TelemetryOptOut parameter in SDK defaults config. For more information, refer to https://sagemaker.readthedocs.io/en/stable/overview.html#configuring-and-using-defaults-with-the-sagemaker-python-sdk.
INFO:sagemaker:Creating training-job with name: linear-learner-2025-07-07-18-47-06-676


2025-07-07 18:47:11 Starting - Starting the training job...
2025-07-07 18:47:25 Starting - Preparing the instances for training...
2025-07-07 18:48:09 Downloading - Downloading the training image............
2025-07-07 18:50:10 Training - Training image download completed. Training in progress.
2025-07-07 18:50:10 Uploading - Uploading generated training model[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[07/07/2025 18:50:02 INFO 139947078211392] Reading default configuration from /opt/amazon/lib/python3.8/site-packages/algorithm/resources/default-input.json: {'mini_batch_size': '1000', 'epochs': '15', 'feature_dim': 'auto', 'use_bias': 'true', 'binary_classifier_model_selection_criteria': 'accuracy', 'f_beta': '1.0', 'target_recall': '0.8', 'target_precision': '0.8', 'num_models': 'auto', 'num_calibration_samples': '10000000', 'init_method': 'uniform', 'init_scale': '0.07', 'init_sigma': '0.01', 'init_bias': '0

In [24]:
# Deploy the trained model as an endpoint
linear_regressor = linear.deploy(
    initial_instance_count=1,
    instance_type="ml.m4.xlarge"
)


INFO:sagemaker:Creating model with name: linear-learner-2025-07-07-18-54-49-867
INFO:sagemaker:Creating endpoint-config with name linear-learner-2025-07-07-18-54-49-867
INFO:sagemaker:Creating endpoint with name linear-learner-2025-07-07-18-54-49-867


--------!

In [25]:
# Set up serialization and deserialization for the endpoint
linear_regressor.serializer = sagemaker.serializers.CSVSerializer()
linear_regressor.deserializer = sagemaker.deserializers.JSONDeserializer()

# Make prediction
results = linear_regressor.predict(X_test)


In [26]:
results

{'predictions': [{'score': 32.262969970703125},
  {'score': 57.60004425048828},
  {'score': 38.22463607788086},
  {'score': 47.91233825683594},
  {'score': 26.301307678222656}]}