### Import libraries and dataset

In [1]:
# import libraries
import boto3, re, sys, math, json, os, sagemaker, urllib.request
from sagemaker import get_execution_role
import numpy as np
import pandas as pd
import matplotlib.pyplot as plttea
from IPython.display import Image
from IPython.display import display
from time import gmtime, strftime
from sagemaker.predictor import csv_serializer

In [2]:
# Notebook-wide settings: the region of the default boto3 session, the S3 key
# prefix used by this project, and the SageMaker execution role.
my_region = boto3.session.Session().region_name
prefix = 'sagemaker/C2-capstone'
role = get_execution_role()

In [3]:
# Resolve the XGBoost container image URI for the current region.
# NOTE(review): the "latest" version tag is kept unchanged; consider pinning an
# explicit version so reruns resolve the same image — confirm in SageMaker docs.
xgboost_container = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=my_region,
    version="latest",
)

print(
    f"Success - the MySageMakerInstance is in the {my_region} region. "
    f"You will use the {xgboost_container} container for your SageMaker endpoint."
)


Success - the MySageMakerInstance is in the us-east-1 region. You will use the 811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest container for your SageMaker endpoint.


In [4]:
# Create the S3 bucket that stores the project data.
# Bucket names are global: change `bucket_name` if creation fails.
bucket_name = 'c2capstonesagem' # <--- CHANGE THIS VARIABLE TO A UNIQUE NAME FOR YOUR BUCKET
s3 = boto3.resource('s3')

# us-east-1 is created without a location constraint; every other region
# passes its own name as the LocationConstraint.
create_bucket_kwargs = {'Bucket': bucket_name}
if my_region != 'us-east-1':
    create_bucket_kwargs['CreateBucketConfiguration'] = { 'LocationConstraint': my_region }

try:
    s3.create_bucket(**create_bucket_kwargs)
    print('S3 bucket created successfully')
except Exception as e:
    # Best effort: report the failure and let the notebook continue.
    print('S3 error: ',e)

S3 bucket created successfully


In [5]:
# List the object keys stored under the `input` prefix of the project bucket.
bucket='c2capstonesagem'
subfolder ='input'
from sagemaker import get_execution_role
role=get_execution_role()
conn=boto3.client('s3')
# The list_objects response has no 'Contents' key when nothing matches the
# prefix, so default to an empty list instead of raising KeyError.
contents=conn.list_objects(Bucket=bucket,Prefix=subfolder).get('Contents', [])
if not contents:
    print('No objects found under s3://' + bucket + '/' + subfolder)
for f in contents:
    print(f['Key'])

input/
input/BlackFriday.csv
input/Result.csv


In [9]:
# Step 1: fetch the CSV from S3. A failure is only reported, not raised, so the
# read below still runs against whatever local copy exists.
try:
    urllib.request.urlretrieve ("https://c2capstonesagem.s3.amazonaws.com/input/BlackFriday.csv", "blackfriday.csv")
    print('Success: downloaded blackfriday.csv.')
except Exception as e:
    print('Data load error: ',e)

# Step 2: load the local file, using the first CSV column as the index.
try:
    dataset = pd.read_csv('./blackfriday.csv',index_col=0)
    print('Success: Data loaded into dataframe.')
except Exception as e:
    print('Data load error: ',e)
    # Re-raise: every later cell needs `dataset`, and continuing would either
    # fail with a NameError or silently reuse a stale frame from the kernel.
    raise

Success: downloaded blackfriday.csv.
Success: Data loaded into dataframe.


### Replacing '+' in 'Age' and 'Stay_In_Current_City_Years'

In [11]:
# Cast to text and replace the literal '55+' with '55' (plain match, not a regex).
dataset['Age'] = dataset['Age'].astype(str).str.replace('55+', '55', regex=False)

In [12]:
# Cast to text and replace the literal '4+' with '4' (plain match, not a regex).
dataset['Stay_In_Current_City_Years'] = (
    dataset['Stay_In_Current_City_Years']
    .astype(str)
    .str.replace('4+', '4', regex=False)
)

### Dropping irrelevant features

In [13]:
# Remove the Product_Category_3 column from the working frame.
dataset.drop(columns=['Product_Category_3'], inplace=True)

In [14]:
# Remove the User_ID column from the working frame.
dataset.drop(columns=['User_ID'], inplace=True)

In [15]:
# Remove the Product_ID column from the working frame.
dataset.drop(columns=['Product_ID'], inplace=True)

In [32]:
# Remove the three location-code columns in a single call.
dataset.drop(columns=['State_Code', 'City_Code', 'Zip_Code'], inplace=True)

### Feature Encoding

In [33]:
from sklearn.preprocessing import LabelEncoder

In [34]:
# Learn the Gender classes, then replace the column with their integer codes.
label_encoder_gender = LabelEncoder().fit(dataset['Gender'])
dataset['Gender'] = label_encoder_gender.transform(dataset['Gender'])

In [35]:
# Learn the Age classes, then replace the column with their integer codes.
label_encoder_age = LabelEncoder().fit(dataset['Age'])
dataset['Age'] = label_encoder_age.transform(dataset['Age'])

In [36]:
# Learn the City_Category classes, then replace the column with their integer codes.
label_encoder_city = LabelEncoder().fit(dataset['City_Category'])
dataset['City_Category'] = label_encoder_city.transform(dataset['City_Category'])

### Fixing null values in 'Product_Category_2' 

In [37]:
# Fill missing Product_Category_2 values with the column median.
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: that chained in-place form is deprecated and does not update
# `dataset` when pandas Copy-on-Write is enabled.
dataset['Product_Category_2'] = dataset['Product_Category_2'].fillna(
    dataset['Product_Category_2'].median()
)

### Convert 'Stay_In_Current_City_Years' into numeric data type

In [38]:
# Cast the cleaned Stay_In_Current_City_Years column to integers.
dataset['Stay_In_Current_City_Years'] = dataset['Stay_In_Current_City_Years'].astype(int)

### Separating the dataset into features (X) and target (Y)

In [39]:
# Feature matrix: every column of the dataset except the Purchase target.
X = dataset.drop(columns=["Purchase"])

In [40]:
# Display the feature frame as this cell's output.
X

Unnamed: 0,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Gender,Age,Occupation
1,0,2,0,3,9.0,0,0,10
2,0,2,0,1,6.0,0,0,10
3,0,2,0,12,9.0,0,0,10
4,0,2,0,12,14.0,0,0,10
5,2,4,0,8,9.0,1,6,16
...,...,...,...,...,...,...,...,...
550064,1,1,1,20,9.0,1,5,13
550065,2,3,0,20,9.0,0,2,1
550066,1,4,1,20,9.0,0,2,15
550067,2,2,0,20,9.0,0,6,1


In [28]:
# Target vector: the Purchase column.
Y = dataset.loc[:, "Purchase"]

In [41]:
from sklearn.ensemble import ExtraTreesRegressor
# Extra-trees model used only to rank feature importance. Seeded so the ranking
# (and the feature drop that relies on it) is reproducible across runs,
# matching the fixed seed used for the train/test split.
selector = ExtraTreesRegressor(random_state = 42)

In [42]:
# Fit the importance-ranking model on all features against the target.
selector.fit(X=X, y=Y)

ExtraTreesRegressor()

In [43]:
# Per-feature importance scores from the fitted selector, one entry per column of X.
feature_imp = selector.feature_importances_

In [44]:
# Print each feature's position in X alongside its importance as a percentage.
for position, importance in enumerate(feature_imp):
    importance_pct = round((importance * 100), 2)
    print(position, importance_pct)

0 0.73
1 2.56
2 0.62
3 79.61
4 8.97
5 0.49
6 2.1
7 4.93


In [45]:
# Drop the three features excluded from modelling.
X.drop(columns=['Gender', 'City_Category', 'Marital_Status'], inplace=True)

### Feature Scaling

In [46]:
from sklearn.preprocessing import StandardScaler
# Scaler with default settings; it is fitted in the following cell.
scaler = StandardScaler()

In [47]:
# Standardize all feature columns with a single fit. StandardScaler works
# column-wise, so the values match the previous per-column loop, but `scaler`
# now keeps the statistics of every feature instead of only the last column
# it was refitted on, so it can be reused to transform new data.
# NOTE(review): the scaler is still fitted before the train/test split, so test
# rows influence the scaling statistics — consider fitting on X_train only.
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

### Creating a train test split

In [48]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [49]:
# Report the shape of each split, in the same order as before.
for split_name, split_data in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("Y_train", Y_train),
    ("Y_test", Y_test),
):
    print(f"{split_name} shape:", split_data.shape)

X_train shape: (440054, 5)
X_test shape: (110014, 5)
Y_train shape: (440054,)
Y_test shape: (110014,)


## Data Modelling

### Linear Regression

In [50]:
from sklearn.linear_model import LinearRegression
# Linear regression model with default settings.
lin_reg = LinearRegression()

In [51]:
# Train the linear model on the training split.
lin_reg.fit(X=X_train, y=Y_train)

LinearRegression()

In [52]:
# Predict the target for the held-out test features.
Y_pred_lin_reg = lin_reg.predict(X=X_test)

In [53]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [54]:
# Test-set RMSE and R2 for the linear regression predictions.
print("Linear Regression: ")
rmse_lin_reg = np.sqrt(mean_squared_error(y_true=Y_test, y_pred=Y_pred_lin_reg))
print("RMSE:",rmse_lin_reg)
r2_lin_reg = r2_score(y_true=Y_test, y_pred=Y_pred_lin_reg)
print("R2 score:", r2_lin_reg)

Linear Regression: 
RMSE: 4699.473872577099
R2 score: 0.12103412745405762


### KNN Regression

In [55]:
from sklearn.neighbors import KNeighborsRegressor
# K-nearest-neighbours regressor with default settings.
knn = KNeighborsRegressor()

In [56]:
# Train the KNN model on the training split.
knn.fit(X=X_train, y=Y_train)

KNeighborsRegressor()

In [57]:
# Predict the target for the held-out test features.
Y_pred_knn = knn.predict(X=X_test)

In [58]:
# Test-set RMSE and R2 for the KNN predictions.
print("KNN regression: ")
rmse_knn = np.sqrt(mean_squared_error(y_true=Y_test, y_pred=Y_pred_knn))
print("RMSE:",rmse_knn)
r2_knn = r2_score(y_true=Y_test, y_pred=Y_pred_knn)
print("R2 score:", r2_knn)

KNN regression: 
RMSE: 3271.3403941275433
R2 score: 0.574082968532854


### Decision Tree Regression

In [59]:
from sklearn.tree import DecisionTreeRegressor
# Decision tree with default hyperparameters. Seeded so the fitted tree and its
# reported metrics are reproducible, consistent with the seeded split and XGB model.
dec_tree = DecisionTreeRegressor(random_state = 42)

In [60]:
# Train the decision tree on the training split.
dec_tree.fit(X=X_train, y=Y_train)

DecisionTreeRegressor()

In [61]:
# Predict the target for the held-out test features.
Y_pred_dec = dec_tree.predict(X=X_test)

In [62]:
# Test-set RMSE and R2 for the decision tree predictions.
print("Decision tree regression: ")
rmse_dec = np.sqrt(mean_squared_error(y_true=Y_test, y_pred=Y_pred_dec))
print("RMSE:",rmse_dec)
r2_dec = r2_score(y_true=Y_test, y_pred=Y_pred_dec)
print("R2 score:", r2_dec)

Decision tree regression: 
RMSE: 3064.917314573144
R2 score: 0.6261382285026303


### Random Forest Regressor

In [63]:
from sklearn.ensemble import RandomForestRegressor
# Random forest with default hyperparameters. Seeded so the fitted forest and
# its reported metrics are reproducible, consistent with the seeded split and XGB model.
ran_for = RandomForestRegressor(random_state = 42)

In [64]:
# Train the random forest on the training split.
ran_for.fit(X=X_train, y=Y_train)

RandomForestRegressor()

In [65]:
# Predict the target for the held-out test features.
Y_pred_ran_for = ran_for.predict(X=X_test)

In [66]:
# Test-set RMSE and R2 for the random forest predictions.
print("Random forest regression: ")
rmse_ran_for = np.sqrt(mean_squared_error(y_true=Y_test, y_pred=Y_pred_ran_for))
print("RMSE:",rmse_ran_for)
r2_ran_for = r2_score(y_true=Y_test, y_pred=Y_pred_ran_for)
print("R2 score:", r2_ran_for)

Random forest regression: 
RMSE: 3033.4365847028607
R2 score: 0.6337788898438023


### XGB Regressor

In [68]:
!pip install xgboost

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting xgboost
  Downloading xgboost-1.7.2-py3-none-manylinux2014_x86_64.whl (193.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 MB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: xgboost
Successfully installed xgboost-1.7.2
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m[33m
[0m

In [69]:
from xgboost import XGBRegressor
# Gradient-boosted regressor; only the random seed is overridden.
xgb = XGBRegressor(random_state = 42)

In [70]:
# Train the XGB model on the training split.
xgb.fit(X=X_train, y=Y_train)

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, feature_types=None, gamma=0, gpu_id=-1,
             grow_policy='depthwise', importance_type=None,
             interaction_constraints='', learning_rate=0.300000012, max_bin=256,
             max_cat_threshold=64, max_cat_to_onehot=4, max_delta_step=0,
             max_depth=6, max_leaves=0, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=42, ...)

In [71]:
# Predict the target for the held-out test features.
Y_pred_xgb = xgb.predict(X=X_test)

In [72]:
# Test-set RMSE and R2 for the XGB predictions.
print("XGB regression: ")
rmse_xgb = np.sqrt(mean_squared_error(y_true=Y_test, y_pred=Y_pred_xgb))
print("RMSE:",rmse_xgb)
r2_xgb = r2_score(y_true=Y_test, y_pred=Y_pred_xgb)
print("R2 score:", r2_xgb)

XGB regression: 
RMSE: 2981.465793966605
R2 score: 0.6462200660793491


<div class="alert alert-block alert-info">

<h3 style="font-family:verdana;"> Conclusion:</h3>

<ul>
    
<li><p style="font-family:verdana;">
In this project, we trained and compared five regression models (linear regression, KNN regression, decision tree regression, random forest regression and the XGB regressor) to find the one that best predicts the purchase amount.
</p></li>     
        
<li><p style="font-family:verdana;">
The XGB regressor gives the best results on the held-out test set: the lowest RMSE (about 2981) and the highest R2 score (about 0.646) of the five models.
</p></li>    

   

</ul>

</div>