<a href="https://clarusway.com/contact-us/"><img align="center" src="https://i.ibb.co/B43qn24/officially-licensed-logo.png" alt="Open in Clarusway LMS" width="110" height="200" title="This notebook is licensed by Clarusway IT training school. Please contact the authorized persons about the conditions under which you can use or share."></a>

# TRAINING AND MODEL/ENDPOINT CREATION FROM NOTEBOOK

In [1]:
import pandas as pd      
import numpy as np 
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

In [2]:
def eval_metrics(actual, pred):
    """Print regression metrics comparing predictions with the true values.

    Parameters
    ----------
    actual : array-like
        Ground-truth target values.
    pred : array-like
        Predicted target values, aligned element-wise with ``actual``.

    Returns
    -------
    None
        The metrics (R^2, MAE, MSE, RMSE) are printed, not returned.
    """
    score = r2_score(actual, pred)
    mae = mean_absolute_error(actual, pred)
    # Compute MSE once and derive RMSE from it instead of calling
    # mean_squared_error twice on the same inputs.
    mse = mean_squared_error(actual, pred)
    rmse = np.sqrt(mse)
    print(" r2_score:", score, "\n","mae:", mae, "\n","mse:",mse, "\n","rmse:",rmse)

In [3]:
# Load the pre-split train and test sets from the working directory.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ("train_data.csv", "test_data.csv")
)

In [4]:
train_data.shape

(12731, 6)

In [5]:
test_data.shape

(3183, 6)

In [6]:
train_data.head()

Unnamed: 0,price,age,hp_kW,km,Gearing_Type,make_model
0,26379,0.0,100.0,5900.0,Manual,Opel Insignia
1,18990,0.0,66.0,1900.0,Manual,Opel Astra
2,12480,2.0,81.0,39792.0,Manual,Opel Astra
3,34490,0.0,154.0,10.0,Automatic,Opel Insignia
4,15888,2.0,60.0,11903.0,Manual,Audi A1


In [7]:
test_data.head()

Unnamed: 0,price,age,hp_kW,km,Gearing_Type,make_model
0,14500,2.0,141.0,80000.0,Automatic,Audi A1
1,16790,3.0,66.0,16200.0,Automatic,Audi A1
2,15090,3.0,85.0,63668.0,Automatic,Audi A1
3,17990,2.0,70.0,16103.0,Automatic,Audi A1
4,17990,3.0,92.0,26415.0,Automatic,Audi A1


In [8]:
train_data.describe()

Unnamed: 0,price,age,hp_kW,km
count,12731.0,12731.0,12731.0,12731.0
mean,17996.286702,1.387244,88.382374,32002.454174
std,7349.138624,1.121765,26.693078,37081.360187
min,4950.0,0.0,40.0,0.0
25%,12850.0,0.0,66.0,1699.5
50%,16890.0,1.0,85.0,20321.0
75%,21910.0,2.0,101.0,46375.0
max,74600.0,3.0,294.0,291800.0


In [9]:
test_data.describe()

Unnamed: 0,price,age,hp_kW,km
count,3183.0,3183.0,3183.0,3183.0
mean,18133.550424,1.399623,88.968269,32441.995475
std,7509.562135,1.119744,26.602421,36567.08854
min,5450.0,0.0,51.0,0.0
25%,12880.0,0.0,66.0,2970.5
50%,16990.0,1.0,85.0,20900.0
75%,21900.0,2.0,103.0,48000.0
max,68320.0,3.0,294.0,317000.0


In [10]:
train_data.make_model.value_counts()

Audi A3           2488
Audi A1           2111
Opel Insignia     2044
Opel Astra        1995
Opel Corsa        1791
Renault Clio      1488
Renault Espace     786
Renault Duster      28
Name: make_model, dtype: int64

In [11]:
test_data.make_model.value_counts()

Audi A3           609
Opel Insignia     554
Opel Astra        530
Audi A1           503
Opel Corsa        425
Renault Clio      351
Renault Espace    205
Renault Duster      6
Name: make_model, dtype: int64

In [12]:
train_data.Gearing_Type.value_counts()

Manual            6496
Automatic         5861
Semi-automatic     374
Name: Gearing_Type, dtype: int64

# Split train_data into train and validation

In [13]:
# Separate the target column ("price") from the feature columns.
y = train_data["price"]
X = train_data.drop(columns=["price"])

In [14]:
# Hold out 25% of the rows for validation; the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

In [15]:
X_train.head()

Unnamed: 0,age,hp_kW,km,Gearing_Type,make_model
8699,2.0,51.0,40000.0,Manual,Opel Corsa
5137,3.0,70.0,97976.0,Manual,Audi A1
536,1.0,66.0,14500.0,Manual,Audi A1
876,3.0,55.0,25329.0,Manual,Opel Corsa
8690,2.0,125.0,37125.0,Automatic,Opel Insignia


In [16]:
y_train.head()

8699     8900
5137    11490
536     16800
876      7899
8690    19499
Name: price, dtype: int64

In [17]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OrdinalEncoder

In [18]:
# Names of the object-typed (categorical) feature columns.
cat = X_train.select_dtypes(include="object").columns
cat

Index(['Gearing_Type', 'make_model'], dtype='object')

In [19]:
# OrdinalEncoder maps each category to an integer code. That is adequate for
# decision-tree based algorithms; for linear or logistic regression use a
# one-hot encoder instead.
ord_enc = OrdinalEncoder()

# Apply ord_enc to the categorical columns only. remainder='passthrough' keeps
# every other column in its original form; to scale those columns instead,
# pass a scaler instance (e.g. MinMaxScaler()) as `remainder`.
column_trans = make_column_transformer(
    (ord_enc, cat),
    remainder='passthrough',
)

In [20]:
# Fit the transformer on the training split only, then reuse the fitted
# encoding for the validation split.
train_numpy = column_trans.fit_transform(X_train)
validation_numpy = column_trans.transform(X_val)

In [21]:
train_numpy

array([[1.00000000e+00, 3.00000000e+00, 2.00000000e+00, 5.10000000e+01,
        4.00000000e+04],
       [1.00000000e+00, 0.00000000e+00, 3.00000000e+00, 7.00000000e+01,
        9.79760000e+04],
       [1.00000000e+00, 0.00000000e+00, 1.00000000e+00, 6.60000000e+01,
        1.45000000e+04],
       ...,
       [1.00000000e+00, 2.00000000e+00, 0.00000000e+00, 6.60000000e+01,
        1.00000000e+01],
       [0.00000000e+00, 1.00000000e+00, 1.00000000e+00, 8.50000000e+01,
        1.90000000e+04],
       [1.00000000e+00, 4.00000000e+00, 0.00000000e+00, 1.03000000e+02,
        1.64736261e+03]])

In [22]:
# The transformer output puts the encoded categorical columns first, followed
# by the passthrough columns, so label the arrays in that order.
train = pd.DataFrame(
    train_numpy,
    index=X_train.index,
    columns=["Gearing_Type", 'make_model', "age", "hp_kW", "km"],
)
validation = pd.DataFrame(
    validation_numpy,
    index=X_val.index,
    columns=["Gearing_Type", 'make_model', "age", "hp_kW", "km"],
)

In [23]:
train.head()

Unnamed: 0,Gearing_Type,make_model,age,hp_kW,km
8699,1.0,3.0,2.0,51.0,40000.0
5137,1.0,0.0,3.0,70.0,97976.0
536,1.0,0.0,1.0,66.0,14500.0
876,1.0,3.0,3.0,55.0,25329.0
8690,0.0,4.0,2.0,125.0,37125.0


In [24]:
# Put the target ("price") in the first column of each split.
# Any existing "price" column is dropped first so the cell is idempotent:
# re-running it no longer prepends a second copy of the target, which would
# otherwise end up among the features written to CSV.
train = pd.concat(
    [
        pd.Series(y_train, index=X_train.index, name="price", dtype=int),
        train.drop(columns="price", errors="ignore"),
    ],
    axis=1,
)
validation = pd.concat(
    [
        pd.Series(y_val, index=X_val.index, name="price", dtype=int),
        validation.drop(columns="price", errors="ignore"),
    ],
    axis=1,
)

In [25]:
train.head()

Unnamed: 0,price,Gearing_Type,make_model,age,hp_kW,km
8699,8900,1.0,3.0,2.0,51.0,40000.0
5137,11490,1.0,0.0,3.0,70.0,97976.0
536,16800,1.0,0.0,1.0,66.0,14500.0
876,7899,1.0,3.0,3.0,55.0,25329.0
8690,19499,0.0,4.0,2.0,125.0,37125.0


In [26]:
validation.head()

Unnamed: 0,price,Gearing_Type,make_model,age,hp_kW,km
3132,15480,0.0,2.0,3.0,100.0,56587.0
8123,24900,1.0,7.0,1.0,96.0,20000.0
10948,27400,0.0,0.0,0.0,85.0,10.0
1986,11975,1.0,2.0,2.0,74.0,38500.0
7487,12450,1.0,4.0,3.0,103.0,45000.0


In [27]:
# Persist both splits as plain CSV: no index column and no header row, with
# the target already sitting in the first column.
train.to_csv(path_or_buf='train.csv', header=False, index=False)
validation.to_csv(path_or_buf='validation.csv', header=False, index=False)

In [28]:
import sagemaker, boto3, os
bucket = 'richard-eu12'
prefix = 'sagemaker-autoscout'

In [29]:
# Upload both CSV files to <bucket>/<prefix>/data/ through one shared S3 bucket resource.
s3_bucket = boto3.Session().resource('s3').Bucket(bucket)
s3_bucket.upload_file('train.csv', prefix + '/data/train.csv')
s3_bucket.upload_file('validation.csv', prefix + '/data/validation.csv')

In [30]:
! aws s3 ls {bucket}/{prefix}/data --recursive

2023-01-11 09:08:17     298744 sagemaker-autoscout/data/train.csv
2023-01-11 09:08:17      99546 sagemaker-autoscout/data/validation.csv


# Train Model

In [31]:
import sagemaker

# Region of the current SageMaker session.
region = sagemaker.Session().boto_region_name
print(f"AWS Region: {region}")

# IAM execution role attached to this notebook environment.
role = sagemaker.get_execution_role()
print(f"RoleArn:{role}")

AWS Region: us-east-1
RoleArn:arn:aws:iam::046402772087:role/service-role/AmazonSageMaker-ExecutionRole-20230102T122328


In [32]:
sagemaker.__version__

'2.117.0'

In [33]:
from sagemaker.debugger import Rule, rule_configs
from sagemaker.session import TrainingInput

In [34]:
# S3 location where the training job writes its output artifacts.
s3_output_location = f's3://{bucket}/{prefix}/output'

In [36]:
# Look up the image URI of the built-in XGBoost algorithm (version 1.2-2) for this region.
container = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=region,
    version="1.2-2",
)
print(container)

683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.2-2


In [38]:
#container2 = sagemaker.image_uris.retrieve("xgboost",region, version="latest")
#print(container2)

811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest


In [55]:
#help(sagemaker.image_uris.retrieve)

In [37]:
# Estimator wrapping the XGBoost container: one ml.m4.xlarge training instance,
# volume_size of 1, with artifacts written to s3_output_location.
xgb_model = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type='ml.m4.xlarge',
    volume_size=1,
    output_path=s3_output_location,
    sagemaker_session=sagemaker.Session(),
    # Debugger rule that requests the XGBoost training report.
    rules=[Rule.sagemaker(rule_configs.create_xgboost_report())],
)

In [57]:
#help(xgb_model.set_hyperparameters)

In [38]:
# Hyperparameters for the XGBoost 1.2-2 container used above.
# NOTE(review): the original author's note suggests the legacy "latest" image
# needs objective="reg:linear" instead of "reg:squarederror" - confirm before
# switching container versions.
xgb_model.set_hyperparameters(
    objective="reg:squarederror",
    num_round=200,
    max_depth=4,
    eta=0.5,
    early_stopping_rounds=10,
)
                                                               

In [39]:
from sagemaker.session import TrainingInput

# CSV input channels pointing at the files uploaded under <prefix>/data/.
train_input = TrainingInput(
    s3_data=f's3://{bucket}/{prefix}/data/train.csv',
    content_type='csv',
)

validation_input = TrainingInput(
    s3_data=f's3://{bucket}/{prefix}/data/validation.csv',
    content_type='csv',
)

In [40]:
data_channels = {'train': train_input, 'validation': validation_input}

In [41]:
xgb_model.fit(data_channels)

2023-01-11 09:10:17 Starting - Starting the training job...CreateXgboostReport: InProgress
ProfilerReport-1673428217: InProgress
...
2023-01-11 09:11:05 Starting - Preparing the instances for training............
2023-01-11 09:13:06 Downloading - Downloading input data...
2023-01-11 09:13:46 Training - Downloading the training image......
2023-01-11 09:14:48 Uploading - Uploading generated training model.[34m[2023-01-11 09:14:43.119 ip-10-2-176-163.ec2.internal:7 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2023-01-11:09:14:43:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2023-01-11:09:14:43:INFO] Failed to parse hyperparameter objective value reg:squarederror to Json.[0m
[34mReturning the value itself[0m
[34m[2023-01-11:09:14:43:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2023-01-11:09:14:43:INFO] Running XGBoost Sagemaker in algorithm mode[0m
[34m[2023-01-11:09:14:43:INFO] Determined delimiter of CSV input is ','


2023-01-11 09:15:06 Completed - Training job completed
Training seconds: 128
Billable seconds: 128


# Create Endpoint

In [42]:
# Deploy the trained model behind a real-time endpoint on one ml.m5.xlarge
# instance; CSVSerializer is used to serialize predictor.predict() inputs.
# NOTE(review): no endpoint teardown is visible in this notebook - call
# predictor.delete_endpoint() once you are finished with the endpoint.
import sagemaker
from sagemaker.serializers import CSVSerializer

predictor = xgb_model.deploy(
    instance_type='ml.m5.xlarge',
    initial_instance_count=1,
    serializer=CSVSerializer(),
)

-----!

In [43]:
predictor.endpoint_name

'sagemaker-xgboost-2023-01-11-09-19-03-337'

# Prepare the test data

In [44]:
test_data.head()

Unnamed: 0,price,age,hp_kW,km,Gearing_Type,make_model
0,14500,2.0,141.0,80000.0,Automatic,Audi A1
1,16790,3.0,66.0,16200.0,Automatic,Audi A1
2,15090,3.0,85.0,63668.0,Automatic,Audi A1
3,17990,2.0,70.0,16103.0,Automatic,Audi A1
4,17990,3.0,92.0,26415.0,Automatic,Audi A1


In [45]:
# Build the test features from exactly the columns the transformer was fitted
# on. Selecting them explicitly (instead of dropping "price") keeps columns
# that later cells add to test_data, such as "predicted_price", out of the
# feature frame when this cell is re-run.
# NOTE: from here on X and y refer to the test set and replace the training
# X / y defined earlier in the notebook.
X = test_data[X_train.columns]
y = test_data['price']

In [46]:
test_numpy=column_trans.transform(X)

In [47]:
test_numpy

array([[0.0000e+00, 0.0000e+00, 2.0000e+00, 1.4100e+02, 8.0000e+04],
       [0.0000e+00, 0.0000e+00, 3.0000e+00, 6.6000e+01, 1.6200e+04],
       [0.0000e+00, 0.0000e+00, 3.0000e+00, 8.5000e+01, 6.3668e+04],
       ...,
       [0.0000e+00, 7.0000e+00, 0.0000e+00, 1.6500e+02, 1.0000e+01],
       [0.0000e+00, 7.0000e+00, 0.0000e+00, 1.1800e+02, 1.0000e+01],
       [0.0000e+00, 7.0000e+00, 0.0000e+00, 1.6500e+02, 9.9000e+03]])

In [49]:
# Label the transformed test array: encoded categorical columns first, then
# the passthrough columns, keeping the original row index.
test = pd.DataFrame(
    test_numpy,
    index=X.index,
    columns=["Gearing_Type", 'make_model', "age", "hp_kW", "km"],
)

In [50]:
test.head()

Unnamed: 0,Gearing_Type,make_model,age,hp_kW,km
0,0.0,0.0,2.0,141.0,80000.0
1,0.0,0.0,3.0,66.0,16200.0
2,0.0,0.0,3.0,85.0,63668.0
3,0.0,0.0,2.0,70.0,16103.0
4,0.0,0.0,3.0,92.0,26415.0


In [51]:
test.shape

(3183, 5)

In [52]:
test.to_csv('test.csv', index=False, header=False)

# Run predictions with the predictor object on NumPy input (Method 1)

In [53]:
test.head()

Unnamed: 0,Gearing_Type,make_model,age,hp_kW,km
0,0.0,0.0,2.0,141.0,80000.0
1,0.0,0.0,3.0,66.0,16200.0
2,0.0,0.0,3.0,85.0,63668.0
3,0.0,0.0,2.0,70.0,16103.0
4,0.0,0.0,3.0,92.0,26415.0


In [62]:
result = predictor.predict([0.0,0.0,2.0,141.0,80000.0])
result

b'17277.77734375\n'

In [65]:
results = predictor.predict(test.to_numpy())

In [67]:
#results

In [68]:
# The endpoint response is raw bytes of delimited text. Parse it into a float
# array, accepting newline- or comma-separated values. The isinstance guard
# makes the cell safe to re-run: once `results` is already an array it is
# left untouched instead of failing on a second parse.
if isinstance(results, bytes):
    results = np.array(results.decode('utf-8').replace(',', '\n').split(), dtype=float)

In [69]:
results

array([17277.77734375, 15529.5546875 , 15698.14648438, ...,
       35725.3203125 , 33972.03125   , 41680.46875   ])

In [70]:
eval_metrics(y,results)

 r2_score: 0.9215298241147185 
 mae: 1258.7995276419651 
 mse: 4423819.441034105 
 rmse: 2103.2877694300664


In [71]:
test_data["predicted_price"] = results

In [73]:
test_data.sample(5, random_state=41)

Unnamed: 0,price,age,hp_kW,km,Gearing_Type,make_model,predicted_price
1846,13990,1.0,66.0,12099.0,Automatic,Opel Corsa,13579.380859
1671,12500,3.0,66.0,52000.0,Automatic,Opel Corsa,10877.949219
1497,13900,1.0,81.0,13149.0,Manual,Opel Astra,14157.275391
149,15950,3.0,66.0,53900.0,Semi-automatic,Audi A1,16644.998047
2352,16800,1.0,103.0,23401.0,Manual,Opel Insignia,19443.669922


# Run predictions only using the endpoint name (Method-2): 

In [74]:
endpoint_name = predictor.endpoint_name
endpoint_name

'sagemaker-xgboost-2023-01-11-09-19-03-337'

In [75]:
# Read the whole test CSV as the request body, trimming leading/trailing newlines.
with open('test.csv', mode='r') as csv_file:
    payload = csv_file.read().strip('\n')

In [253]:
# payload="0.0,0.0,2.0,141.0,80000.0"     # by this one you can try a single value to be predicted

In [76]:
import boto3

# Low-level SageMaker Runtime client, created in the default boto3 session's region.
sagemaker_runtime = boto3.client(
    "sagemaker-runtime",
    region_name=boto3.Session().region_name,
)

# Send the CSV payload to the deployed endpoint (identified by endpoint_name)
# and keep the raw response; the predictions are read from response['Body']
# in a later cell.
response = sagemaker_runtime.invoke_endpoint(
    EndpointName=endpoint_name,
    ContentType='text/csv',
    Body=payload,  # Replace with your own data.
)

# Optional - print the decoded response body so it is human-readable.
#print(response['Body'].read().decode('utf-8'))

In [80]:
#response

In [77]:
results2 = response['Body'].read()   # results2 = response['Body'].read().decode('utf-8') if required

In [71]:
#results2

In [78]:
# The response body is raw bytes of delimited text. Parse it into a float
# array, accepting newline- or comma-separated values. The isinstance guard
# makes the cell safe to re-run: once `results2` is already an array it is
# left untouched instead of failing on a second parse.
if isinstance(results2, bytes):
    results2 = np.array(results2.decode('utf-8').replace(',', '\n').split(), dtype=float)

In [79]:
results2

array([17277.77734375, 15529.5546875 , 15698.14648438, ...,
       35725.3203125 , 33972.03125   , 41680.46875   ])

In [80]:
results2.shape

(3183,)

In [81]:
y.shape

(3183,)

In [82]:
eval_metrics(y,results2)

 r2_score: 0.9215298241147185 
 mae: 1258.7995276419651 
 mse: 4423819.441034105 
 rmse: 2103.2877694300664


In [83]:
test_data["predicted_price2"] = results2

In [84]:
test_data.sample(5, random_state=41)

Unnamed: 0,price,age,hp_kW,km,Gearing_Type,make_model,predicted_price,predicted_price2
1846,13990,1.0,66.0,12099.0,Automatic,Opel Corsa,13579.380859,13579.380859
1671,12500,3.0,66.0,52000.0,Automatic,Opel Corsa,10877.949219,10877.949219
1497,13900,1.0,81.0,13149.0,Manual,Opel Astra,14157.275391,14157.275391
149,15950,3.0,66.0,53900.0,Semi-automatic,Audi A1,16644.998047,16644.998047
2352,16800,1.0,103.0,23401.0,Manual,Opel Insignia,19443.669922,19443.669922


# JSON info:

In [94]:
# `json` is not imported anywhere earlier in the notebook, so import it here;
# without it this cell raises NameError on a fresh kernel.
import json

# Example event carrying one CSV-encoded feature row.
event = {
  "data": "0.0,0.0,2.0,141.0,80000.0"
}

# Round-trip the event through a JSON string and back (presumably to mimic an
# event that arrives serialized).
data = json.loads(json.dumps(event))

# The CSV string is what gets sent to the endpoint.
payload = data["data"]
payload

'0.0,0.0,2.0,141.0,80000.0'

In [95]:
type(event)

dict

In [96]:
type(payload)

str

In [97]:
# One raw (not yet encoded) car record, keyed by feature name.
my_dict = dict(
    age=2,
    hp_kW=141,
    km=80000,
    Gearing_Type='Automatic',
    make_model='Audi A1',
)

In [98]:
# Build a one-row frame from the record. A list of dicts goes to the DataFrame
# constructor; DataFrame.from_dict is documented for dict input only.
df = pd.DataFrame([my_dict])

In [99]:
df2 = column_trans.transform(df)

In [100]:
df2

array([[0.00e+00, 0.00e+00, 2.00e+00, 1.41e+02, 8.00e+04]])

In [101]:
df2 = pd.DataFrame(df2)

In [102]:
df2

Unnamed: 0,0,1,2,3,4
0,0.0,0.0,2.0,141.0,80000.0


In [103]:
df2.to_csv('df2.csv', index=False, header=False)

In [104]:
# Read the single-row CSV back as the request payload, trimming leading/trailing newlines.
with open('df2.csv', mode='r') as csv_file:
    payload = csv_file.read().strip('\n')

In [105]:
payload

'0.0,0.0,2.0,141.0,80000.0'

In [106]:
# Wrap the CSV payload in an event dictionary under the "data" key.
event = {"data": payload}

In [107]:
event

{'data': '0.0,0.0,2.0,141.0,80000.0'}