# ANA680_Week_3:

### Build linear regression model

In [1]:
import pandas as pd

## Load and inspect data

In [2]:
# Read the wine-quality CSV; the first line of the file supplies the column names
# (pandas' default header handling).
wine_quality = pd.read_csv("winequality.csv")

# Preview the first five rows as a quick sanity check on the load.
wine_quality.head(5)

Unnamed: 0,quality,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,6,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8
1,6,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5
2,6,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1
3,6,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9
4,6,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9


## <u>Linear regression WITHOUT container technology</u>

In [3]:
import re
import numpy as np
from sklearn.model_selection import train_test_split
from sagemaker import get_execution_role, Session
from sagemaker.amazon.linear_learner import LinearLearner
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer



sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


## Split into Train(80%) and Test(20%)

In [4]:
# Target vector is the quality score; the feature matrix is every remaining column.
y = wine_quality['quality']
X = wine_quality.drop(columns=['quality'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=680,
)

# Cast the training arrays to float32 (the original note states SageMaker requires it).
y_train_np = y_train.to_numpy().astype('float32')
X_train_np = X_train.to_numpy().astype('float32')

## Start session

In [5]:
# Execution role and SageMaker session used by the cells below.
role = get_execution_role()
session = Session()

# Resolve the bucket at run time instead of hardcoding an account- and
# region-specific name. With no custom SDK configuration, default_bucket()
# yields "sagemaker-<region>-<account-id>", i.e. the same bucket that was
# previously spelled out literally, but the notebook no longer embeds the
# AWS account id and stays runnable from another account or region.
bucket = session.default_bucket()
prefix = "linear-regression-wine"

# S3 location where the Linear Learner training job writes its artifacts.
output_path = f's3://{bucket}/linear-regression-output'

## Train model

In [6]:
# Configure the built-in Linear Learner estimator as a regressor on one
# ml.m5.xlarge instance, writing its output under `output_path`.
linear = LinearLearner(
    role=role,
    predictor_type='regressor',
    instance_type='ml.m5.xlarge',
    instance_count=1,
    output_path=output_path,
)

# Build the record set from the float32 training features and labels,
# then start the training job on it.
record_set = linear.record_set(X_train_np, labels=y_train_np)
linear.fit(record_set)

2025-06-04 16:02:47 Starting - Starting the training job...
2025-06-04 16:03:09 Starting - Preparing the instances for training...
2025-06-04 16:03:49 Downloading - Downloading the training image.........
2025-06-04 16:05:16 Training - Training image download completed. Training in progress...[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[06/04/2025 16:05:25 INFO 140352932898624] Reading default configuration from /opt/amazon/lib/python3.8/site-packages/algorithm/resources/default-input.json: {'mini_batch_size': '1000', 'epochs': '15', 'feature_dim': 'auto', 'use_bias': 'true', 'binary_classifier_model_selection_criteria': 'accuracy', 'f_beta': '1.0', 'target_recall': '0.8', 'target_precision': '0.8', 'num_models': 'auto', 'num_calibration_samples': '10000000', 'init_method': 'uniform', 'init_scale': '0.07', 'init_sigma': '0.01', 'init_bias': '0.0', 'optimizer': 'auto', 'loss': 'auto', 'margin': '1.0', 'quantile

## Deploy model

In [7]:
# Deploy the trained Linear Learner model to a single-instance endpoint whose
# requests are serialized as CSV.
predictor = linear.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.xlarge',
    serializer=CSVSerializer(),
)

-------!

## Run prediction

In [8]:
# Requests are sent as CSV; responses are parsed as JSON.
predictor.serializer = CSVSerializer()
predictor.deserializer = JSONDeserializer()

# One wine sample with eleven feature values
# (presumably in the same column order as X -- confirm against the training data).
QUALITY_CHECK = [
    5, 1.02, 0.04, 1.4, 0.045, 41, 85, 0.9938, 3.75, 0.48, 10.5,
]
result = predictor.predict(QUALITY_CHECK)

# The response carries one entry per input row; report its score to one decimal.
first_prediction = result['predictions'][0]
predicted_quality = round(first_prediction['score'], 1)
print(f"Predicted wine quality: {predicted_quality}")

Predicted wine quality: 5.0


# Run multiple predictions (switch this cell back to a code cell to execute it)
# Three wine samples, one row of eleven feature values each.
batch_input = [
    [5, 1.02, 0.04, 1.4, 0.045, 41, 85, 0.9938, 3.75, 0.48, 10.5],
    [8.5, 0.26, 0.21, 16.2, 0.074, 41, 197, 0.998, 3.02, 0.5, 9.8],
    [6.9, 0.36, 0.34, 4.2, 0.018, 57, 119, 0.9898, 3.28, 0.36, 12.7],
]
result = predictor.predict(batch_input)

# Report every returned score rounded to one decimal, numbering samples from 1.
predictions = result['predictions']
for i, record in enumerate(predictions):
    score = round(record['score'], 1)
    print(f"Predicted wine quality for sample {i+1}: {score}")

## Clean up

In [9]:
# Delete the endpoint that `predictor` was deployed to in this session.
# The endpoint name is taken from the predictor itself rather than a
# hardcoded string: a literal name from an earlier run does not match the
# endpoint created by this run, which would then stay up (and keep billing).
# Predictor.delete_endpoint() also removes the associated endpoint
# configuration by default.
predictor.delete_endpoint()

## <u>Linear regression WITH containers (model_2)</u>

In [10]:
import os
import io
import boto3
import argparse
import time
import numpy as np
from joblib import dump
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sagemaker import get_execution_role, Session
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.sklearn.model import SKLearnModel
from sagemaker.model import Model
from sagemaker.predictor import Predictor
from sagemaker.inputs import TrainingInput
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import CSVDeserializer

In [11]:
# Assemble the training CSV with the target as the first column, followed by
# the features. Both pieces get a fresh 0..n-1 index so they align row by row.
train_frames = [frame.reset_index(drop=True) for frame in (y_train, X_train)]
wine_train = pd.concat(train_frames, axis=1)

# Write without the index column so the file holds only quality + features.
wine_train.to_csv("train.csv", index=False, encoding='utf-8')

In [12]:
# Read the file back to verify what was actually written to disk.
train_2 = pd.read_csv("train.csv")

# Show the first five rows of the written training file.
train_2.head(5)

Unnamed: 0,quality,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,6,9.3,0.36,0.39,1.5,0.08,41.0,55.0,0.99652,3.47,0.73,10.9
1,6,7.6,0.27,0.25,13.9,0.05,45.0,199.0,0.9984,3.34,0.5,9.8
2,5,6.4,0.25,0.57,1.0,0.062,21.0,122.0,0.99238,3.0,0.4,9.5
3,5,7.4,0.67,0.12,1.6,0.186,5.0,21.0,0.996,3.39,0.54,9.5
4,6,9.0,0.26,0.34,6.7,0.029,21.0,162.0,0.99497,3.08,0.5,10.6


### Model_2 session

In [13]:
# Execution role and SageMaker session for the container-based (model_2) workflow.
role = get_execution_role()
session = Session()

# Resolve the bucket at run time instead of hardcoding an account- and
# region-specific name. With no custom SDK configuration, default_bucket()
# yields "sagemaker-<region>-<account-id>", i.e. the same bucket that was
# previously spelled out literally, but the notebook no longer embeds the
# AWS account id and stays runnable from another account or region.
bucket = session.default_bucket()
prefix = "linear-regression-wine"
output_path = f's3://{bucket}/linear-regression-output'

In [14]:
# Upload the local training CSV to the bucket under `prefix` and keep the
# S3 location that upload_data returns.
s3_train2_path = session.upload_data(
    path="train.csv",
    bucket=bucket,
    key_prefix=prefix,
)

# Describe the uploaded file as a CSV training channel.
train2_input = TrainingInput(s3_data=s3_train2_path, content_type="text/csv")

### Train model_2

In [15]:
# Scikit-learn estimator in script mode: train_script.py (taken from the
# current directory) is the training entry point, and artifacts are written
# under s3://<bucket>/<prefix>/output.
sklearn_model = SKLearn(
    entry_point="train_script.py",
    source_dir=".",
    role=role,
    sagemaker_session=session,
    framework_version="0.23-1",
    script_mode=True,
    instance_type="ml.m5.xlarge",
    output_path=f"s3://{bucket}/{prefix}/output",
)

# Start training with the uploaded CSV bound to the "train" channel.
sklearn_model.fit(inputs={"train": train2_input})

2025-06-04 16:11:05 Starting - Starting the training job...
2025-06-04 16:11:20 Starting - Preparing the instances for training...
2025-06-04 16:12:00 Downloading - Downloading the training image...
2025-06-04 16:12:30 Training - Training image download completed. Training in progress..[34m2025-06-04 16:12:38,857 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2025-06-04 16:12:38,860 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2025-06-04 16:12:38,900 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2025-06-04 16:12:39,324 sagemaker-training-toolkit INFO     Installing dependencies from requirements.txt:[0m
[34m/miniconda3/bin/python -m pip install -r requirements.txt[0m
[34m[notice] A new release of pip is available: 23.0 -> 24.0[0m
[34m[notice] To update, run: pip install --upgrade pip[0m
[34m2025-06-04 16:12:40,280 sagemaker-training-toolkit INF

### Deploy model_2

In [16]:

# Turn the trained estimator into a deployable model whose serving code is
# inference.py from the current directory.
deployable_model = sklearn_model.create_model(
    entry_point="inference.py",
    source_dir=".",
)

# Deploy to a single-instance endpoint that exchanges CSV in both directions.
# The name is suffixed with the current Unix time in whole seconds
# (presumably so repeated deployments get distinct endpoint names).
predictor = deployable_model.deploy(
    endpoint_name=f"wine-quality-endpoint-{int(time.time())}",
    instance_type="ml.m5.large",
    initial_instance_count=1,
    serializer=CSVSerializer(),
    deserializer=CSVDeserializer(),
)

-----!

### Predict with model_2

In [17]:
# A single wine sample passed as a flat tuple of eleven feature values.
input_data = (
    7, 0.27, 0.36, 20.7, 0.045, 45, 170, 1.001, 3, 0.45, 8.8,
)

# NOTE(review): the recorded output of this cell is a 500 ModelError from the
# endpoint. The cause is not visible in this notebook -- check inference.py
# and the endpoint's CloudWatch logs for how a single-row payload is parsed.
try:
    prediction = predictor.predict(input_data)
    print("Predicted wine quality:", prediction)
except Exception as err:
    # Report the failure instead of aborting the notebook run.
    print(f"An error occurred during prediction: \n{err}")


An error occurred during prediction: 
An error occurred (ModelError) when calling the InvokeEndpoint operation: Received server error (500) from primary with message "<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
<title>500 Internal Server Error</title>
<h1>Internal Server Error</h1>
<p>The server encountered an internal error and was unable to complete your request. Either the server is overloaded or there is an error in the application.</p>
". See https://us-east-1.console.aws.amazon.com/cloudwatch/home?region=us-east-1#logEventViewer:group=/aws/sagemaker/Endpoints/wine-quality-endpoint-1749053596 in account 421498156986 for more information.


In [18]:
# Exchange CSV with the endpoint in both directions.
predictor.serializer = CSVSerializer()
predictor.deserializer = CSVDeserializer()

# Send the first five held-out rows as header-less, index-less CSV text.
# NOTE(review): the recorded output below holds four values for these five
# rows -- presumably the serving script consumes the first line as a header;
# confirm against inference.py.
sample = X_test.head(5)
payload = sample.to_csv(header=False, index=False)
response = predictor.predict(payload)
print(response)

[['5.194891936235308', '5.857477399025498', '5.738887889233851', '5.645412771567791']]


In [19]:
# One wine sample: eleven feature values.
input_values = [
    7, 0.27, 0.36, 20.7, 0.045, 45, 170, 1.001, 3, 0.45, 8.8,
]

# Option A: wrap the sample in an outer list so the payload is a list of rows.
input_data_for_prediction = [input_values]
response = predictor.predict(input_data_for_prediction)
print("\nPredicted output (List of Lists):", response)

# Option B: the same single row expressed as a 2D NumPy array of shape (1, 11).
input_data_for_prediction_np = np.asarray(input_values).reshape(1, -1)
response_np = predictor.predict(input_data_for_prediction_np)
print("\nPredicted output (NumPy Array):", response_np)

In [None]:
# One wine sample as a flat list of eleven feature values.
# Named `single_sample` rather than `input`, so the built-in input() function
# is not shadowed for the rest of the kernel session.
single_sample = [7, 0.27, 0.36, 20.7, 0.045, 45, 170, 1.001, 3, 0.45, 8.8]
response = predictor.predict(single_sample)

print("\nPredicted output:", response)


# END