# Create an S3 bucket

In [6]:
import boto3
# Low-level S3 client; reused by later cells to read objects back.
s3 = boto3.client("s3")
bucket_name = "HousePricePrediction"

# Best-effort creation: any failure is reported on stdout rather than raised,
# so the notebook keeps running even if the bucket could not be created.
try:
    s3.create_bucket(Bucket=bucket_name)
except Exception as err:
    print("there is an error :", err)
else:
    print("the bucket has been created successfully")

the bucket has been created successfully


# Extract, Load, Transform the dataset


In [5]:
import pandas as pd
# Read the training CSV directly from its S3 location into a DataFrame.
House_dataset = pd.read_csv(
    "s3://HousePricePrediction/xgboost-as-a-built-in-algo/train/train.csv"
)


In [6]:
# Discard the 'Id' column, then keep a handle on the remaining column labels.
House_dataset = House_dataset.drop(columns=['Id'])
Columns = House_dataset.columns

In [7]:
# Subset of columns used as model inputs (order matters for the exported CSV).
features = [
    'HouseStyle',
    'YearBuilt',
    'Heating',
    'CentralAir',
    'Electrical',
    'FullBath',
    'OverallQual',
    'YearRemodAdd',
    'TotalBsmtSF',
    'GrLivArea',
    'TotRmsAbvGrd',
    'GarageCars',
    'GarageArea',
]

In [8]:
# Report how many values are missing in each selected feature.
print(House_dataset[features].isnull().sum())

# Fill the missing 'Electrical' entry with 'SBrkr' (noted by the author as the
# most used value). The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the column selection: the chained in-place form
# acts on an intermediate object, emits a FutureWarning in recent pandas and
# leaves the DataFrame unchanged when Copy-on-Write is enabled.
House_dataset['Electrical'] = House_dataset['Electrical'].fillna('SBrkr')

HouseStyle      0
YearBuilt       0
Heating         0
CentralAir      0
Electrical      1
FullBath        0
OverallQual     0
YearRemodAdd    0
TotalBsmtSF     0
GrLivArea       0
TotRmsAbvGrd    0
GarageCars      0
GarageArea      0
dtype: int64


# New dataset with the most important features

In [9]:
# Keep 'SalePrice' (presumably the prediction target) as the first column,
# followed by the chosen feature columns. The selection is derived from
# `features` rather than repeating the same names, so the two lists cannot
# drift apart; the resulting column order is identical to the spelled-out list.
House_dataset = House_dataset[['SalePrice'] + features]


In [11]:
#Splitting the dataset into train and test datasets
from sklearn.model_selection import train_test_split
# 80/20 split with a fixed seed so the partition is reproducible.
train_dataset, test_dataset = train_test_split(
    House_dataset,
    test_size=0.2,
    train_size=0.8,
    random_state=2,
)

In [None]:
#Transforming categorical variables into numerical variables
from sklearn.preprocessing import LabelEncoder
# Work on explicit copies so the column assignments below modify these frames
# themselves rather than a view/slice of House_dataset.
train_dataset = train_dataset.copy()
test_dataset = test_dataset.copy()

categorical_columns = ['HouseStyle', 'Heating', 'CentralAir', 'Electrical']
for column in categorical_columns:
    # Fit ONE encoder per column on the values of both splits and apply it to
    # both. Fitting a separate encoder on each split (as before) can assign
    # different integer codes to the same category in train and test, which
    # makes the validation data inconsistent with the training data.
    label_encoder = LabelEncoder().fit(
        pd.concat([train_dataset[column], test_dataset[column]])
    )
    train_dataset[column] = label_encoder.transform(train_dataset[column])
    test_dataset[column] = label_encoder.transform(test_dataset[column])

# Upload the new dataset to the S3 bucket

In [27]:
# Write the processed splits to the bucket held in `bucket_name`: the training
# channels, the read-back cell and the model output path are all built from
# `bucket_name`, so uploading to a differently named bucket would leave the
# training job without the processed data.
# NOTE(review): the train key is the same object the raw CSV is loaded from
# earlier in the notebook, so this upload overwrites it -- confirm that is
# intended, or move the raw file to a separate key.
train_path = "s3://{}/xgboost-as-a-built-in-algo/train/train.csv".format(bucket_name)
test_path = "s3://{}/xgboost-as-a-built-in-algo/test/test.csv".format(bucket_name)

# Files are written without a header row and without the index column.
train_dataset.to_csv(train_path, header=False, index=False)
test_dataset.to_csv(test_path, header=False, index=False)

# Define the training input channels and read the dataset back from the S3 bucket

In [None]:
import os
import csv
from sagemaker.inputs import TrainingInput
prefix = 'xgboost-as-a-built-in-algo'
content_type = "csv"

# Build one TrainingInput per channel folder ('train' and 'test') under the
# bucket/prefix, both declared as CSV.
train_input, test_input = (
    TrainingInput(
        "s3://{}/{}/{}/".format(bucket_name, prefix, channel),
        content_type=content_type,
    )
    for channel in ('train', 'test')
)

# Fetch the training CSV back from S3 and print it row by row.
obj = s3.get_object(
    Bucket=bucket_name,
    Key='xgboost-as-a-built-in-algo/train/train.csv',
)
data = obj['Body'].read().decode('utf-8').splitlines()
records = csv.reader(data)
for row in records:
    print(row)


# Build XGBoost Model 

In [29]:
import sagemaker
from sagemaker import image_uris
import sagemaker.estimator 
from sagemaker.session import Session


In [37]:
# Look up the XGBoost container image URI (version 1.2-2) for the region of
# the default boto3 session, then show it.
xgboost_container = image_uris.retrieve(
    framework='xgboost',
    region=boto3.Session().region_name,
    version='1.2-2',
)
display(xgboost_container)

'683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.2-2'

In [42]:
# Hyperparameters handed to the estimator; values are kept exactly as
# originally typed (strings for the first six, integers for the last two).
hyperparameters = dict(
    max_depth="5",
    eta="0.2",
    gamma="4",
    min_child_weight="6",
    subsample="0.7",
    objective="reg:squarederror",
    early_stopping_rounds=10,
    num_round=1000,
)

# Set an output path where the trained model will be saved

In [43]:
# S3 location <bucket>/<prefix>/model/ used as the estimator's output path.
model_path = "s3://{}/{}/{}/".format(
    bucket_name,
    prefix,
    "model",
)

# Construct a SageMaker estimator

In [44]:
# IAM role the notebook is running under; passed to the estimator as its role.
execution_role = sagemaker.get_execution_role()

# Estimator for the XGBoost container: a single ml.m4.xlarge instance with a
# volume size of 5, spot instances enabled, and artifacts written to model_path.
estimator = sagemaker.estimator.Estimator(
    image_uri=xgboost_container,
    role=execution_role,
    hyperparameters=hyperparameters,
    output_path=model_path,
    instance_type='ml.m4.xlarge',
    instance_count=1,
    volume_size=5,
    use_spot_instances=True,
    max_run=300,
    max_wait=600,
)

# Execute the XGBoost training job

In [None]:
# Map each channel name to its input: 'train' for fitting, 'validation' for
# the held-out test split.
training_channels = {
    'train': train_input,
    'validation': test_input,
}
estimator.fit(training_channels)

# Deploy XGBoost model as an endpoint

In [46]:
from sagemaker.serializers import CSVSerializer
# Deploy the trained model to a single-instance endpoint that accepts CSV
# payloads.
xgboost_endpoint = estimator.deploy(
    instance_type="ml.m4.xlarge",
    initial_instance_count=1,
    serializer=CSVSerializer(),
)

---------!

In [47]:
# Show the endpoint name; it will be used in the Lambda function.
xgboost_endpoint.endpoint_name

'sagemaker-xgboost-2022-08-03-16-46-04-244'