# ANA680_Week_3:

### Build linear regression model

In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split

## Load and inspect data

In [15]:
# Read the wine-quality CSV (row 0 holds the column names) and preview the first five rows
csv_path = "winequality.csv"
wine_quality = pd.read_csv(csv_path, header=0)
wine_quality.head(5)

Unnamed: 0,quality,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,6,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8
1,6,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5
2,6,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1
3,6,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9
4,6,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9


## Split into Train (80%) and Test (20%)

In [16]:
# Split features and target by column NAME rather than by position.
# The CSV's first column ("Unnamed: 0") is a saved row index and `quality` is the
# second column, so positional slicing (iloc[:, 0] / iloc[:, 1:]) would use the row
# index as the target and leak `quality` into the features.
TARGET_COLUMN = "quality"
index_columns = [col for col in wine_quality.columns if str(col).startswith("Unnamed")]
X = wine_quality.drop(columns=[TARGET_COLUMN, *index_columns])
y = wine_quality[TARGET_COLUMN]

# Train/test split (80% / 20%); fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=680)

## <u>Linear regression WITHOUT container technology</u>

In [17]:
# Set up environment
import numpy as np
import sagemaker
from sagemaker import get_execution_role, Session
from sagemaker.amazon.linear_learner import LinearLearner
from sagemaker.inputs import TrainingInput
from sagemaker.serializers import CSVSerializer

## Shape data for SageMaker

In [18]:
# Build one 2-D array per split with the label in column 0 followed by the features
train_labels = y_train.to_numpy()[:, np.newaxis]
test_labels = y_test.to_numpy()[:, np.newaxis]
train_array = np.concatenate((train_labels, X_train.to_numpy()), axis=1)
test_array = np.concatenate((test_labels, X_test.to_numpy()), axis=1)

# Write each split to a headerless, comma-separated file in the working directory
for filename, split_array in (("train_split.csv", train_array), ("test_split.csv", test_array)):
    np.savetxt(filename, split_array, delimiter=",")

## Start session

In [19]:
# Start SageMaker session
role = get_execution_role()
session = Session()
bucket = "sagemaker-us-east-1-421498156986"
prefix = "linear-regression-wine"

# Upload train and test splits to S3.
# Previously the URIs were only formatted as strings and nothing was uploaded;
# upload_data() pushes the local file and returns its s3://bucket/prefix/filename URI.
s3_train_path = session.upload_data(path="train_split.csv", bucket=bucket, key_prefix=prefix)
s3_test_path = session.upload_data(path="test_split.csv", bucket=bucket, key_prefix=prefix)

print(s3_train_path)
print(s3_test_path)

s3://sagemaker-us-east-1-421498156986/linear-regression-wine/train_split.csv
s3://sagemaker-us-east-1-421498156986/linear-regression-wine/test_split.csv


## Define Training objects

In [20]:
# Wrap each uploaded CSV location in a TrainingInput channel description
train_input, test_input = (
    TrainingInput(s3_data=s3_uri, content_type="text/csv")
    for s3_uri in (s3_train_path, s3_test_path)
)

# Sanity check: show the type of each channel object
for channel_input in (train_input, test_input):
    print(type(channel_input))

<class 'sagemaker.inputs.TrainingInput'>
<class 'sagemaker.inputs.TrainingInput'>


In [21]:
# Train model
# NOTE(review): confirm 'ml.m3.medium' is an available training instance type in this region.
linear = LinearLearner(
    role=role,
    instance_count=1,
    instance_type='ml.m3.medium',
    predictor_type='regressor',
    output_path=f's3://{bucket}/linear-regression-output')

# LinearLearner.fit() expects a RecordSet, not an S3 URI string (a plain string has
# no `feature_dim`, so the call fails). record_set() serialises the arrays, uploads
# them to S3, and returns the RecordSet for the "train" channel.
train_records = linear.record_set(
    train=X_train.to_numpy().astype("float32"),
    labels=y_train.to_numpy().astype("float32"),
    channel="train",
)

linear.fit(train_records)

In [None]:
# Deploy model
# CSVSerializer is imported from sagemaker.serializers in the environment-setup cell;
# without that import this cell fails with a NameError.
# NOTE(review): confirm 'ml.m3.medium' is an available hosting instance type in this region.
predictor = linear.deploy(
    initial_instance_count=1,
    instance_type='ml.m3.medium',
    serializer=CSVSerializer(),
)


In [None]:
# Run prediction: send the held-out feature matrix to the endpoint and show the response
test_features = X_test.to_numpy()
result = predictor.predict(test_features)
print(result)


In [None]:
# Clean up: delete the endpoint together with its endpoint configuration.
# Uses the predictor's own session instead of constructing a throwaway Session,
# and avoids leaving an orphaned endpoint configuration behind.
predictor.delete_endpoint()

In [None]:
# Count missing values per column in the training split.
# `train_df` was never defined in this notebook (NameError on a fresh kernel),
# so rebuild it here from the split: label first, then the features.
train_df = pd.concat([y_train, X_train], axis=1)
print(train_df.isnull().sum())

## <u>Linear regression WITH containers (from Docker)</u>

# END