## Importing the essential libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Importing the dataset

In [2]:
# Read the shopping transactions CSV into a DataFrame.
# The path is relative, so the file must sit in the notebook's working directory.
data=pd.read_csv('customer_shopping_data.csv')

In [3]:
# Preview the first rows to see which columns exist and what their values look like.
data.head()

Unnamed: 0,invoice_no,customer_id,gender,age,category,quantity,price,payment_method,invoice_date,shopping_mall
0,I138884,C241288,Female,28,Clothing,5,1500.4,Credit Card,5/8/2022,Kanyon
1,I317333,C111565,Male,21,Shoes,3,1800.51,Debit Card,12/12/2021,Forum Istanbul
2,I127801,C266599,Male,20,Clothing,1,300.08,Cash,9/11/2021,Metrocity
3,I173702,C988172,Female,66,Shoes,5,3000.85,Credit Card,16/05/2021,Metropol AVM
4,I337046,C189076,Female,53,Books,4,60.6,Cash,24/10/2021,Kanyon


In [4]:
# (number of rows, number of columns) of the loaded frame.
data.shape

(99457, 10)

## Checking for duplicate observations

In [5]:
# Count the rows that repeat an earlier row in every column; 0 means there is nothing to drop.
data.duplicated().sum()

0

## Checking for missing values

In [6]:
# Per-column count of missing entries.
data.isna().sum()

invoice_no        0
customer_id       0
gender            0
age               0
category          0
quantity          0
price             0
payment_method    0
invoice_date      0
shopping_mall     0
dtype: int64

In [7]:
# Names of the columns that contain at least one missing value.
# A column qualifies as soon as it holds a single missing entry, so the test is
# "any missing" (count > 0), not "more than one".
missing_values = data.columns[data.isna().any()].tolist()
for feature in missing_values:
    print(feature)

## Filtering all the numerical features

In [8]:
# Names of the numerical columns, in their original column order.
# Selecting on the numeric dtype family (rather than "anything that is not object")
# keeps datetime, categorical or string-extension columns out of this list.
numerical_features = data.select_dtypes(include="number").columns.tolist()
for feature in numerical_features:
    print(feature)

age
quantity
price


In [9]:
# Show only the columns listed in numerical_features.
data[numerical_features]

Unnamed: 0,age,quantity,price
0,28,5,1500.40
1,21,3,1800.51
2,20,1,300.08
3,66,5,3000.85
4,53,4,60.60
...,...,...,...
99452,45,5,58.65
99453,27,2,10.46
99454,63,2,10.46
99455,56,4,4200.00


## Filtering all the categorical features

In [10]:
# Categorical features are the columns stored with the generic object dtype ("O").
cat_features = [name for name, column_dtype in data.dtypes.items() if column_dtype == "O"]
for feature in cat_features:
    print(feature)

invoice_no
customer_id
gender
category
payment_method
invoice_date
shopping_mall


In [11]:
# Show only the columns listed in cat_features.
data[cat_features]

Unnamed: 0,invoice_no,customer_id,gender,category,payment_method,invoice_date,shopping_mall
0,I138884,C241288,Female,Clothing,Credit Card,5/8/2022,Kanyon
1,I317333,C111565,Male,Shoes,Debit Card,12/12/2021,Forum Istanbul
2,I127801,C266599,Male,Clothing,Cash,9/11/2021,Metrocity
3,I173702,C988172,Female,Shoes,Credit Card,16/05/2021,Metropol AVM
4,I337046,C189076,Female,Books,Cash,24/10/2021,Kanyon
...,...,...,...,...,...,...,...
99452,I219422,C441542,Female,Souvenir,Credit Card,21/09/2022,Kanyon
99453,I325143,C569580,Male,Food & Beverage,Cash,22/09/2021,Forum Istanbul
99454,I824010,C103292,Male,Food & Beverage,Debit Card,28/03/2021,Metrocity
99455,I702964,C800631,Male,Technology,Cash,16/03/2021,Istinye Park


## Encoding the categorical features

In [12]:
# Columns that still hold text and therefore need to be encoded as integers below.
cat_features

['invoice_no',
 'customer_id',
 'gender',
 'category',
 'payment_method',
 'invoice_date',
 'shopping_mall']

In [13]:
# Lookup from each distinct invoice number to an integer code.
# Codes follow the order in which the values first appear in the column.
invoice_mapping = {
    invoice: code for code, invoice in enumerate(data['invoice_no'].unique())
}
print(invoice_mapping)

{'I138884': 0, 'I317333': 1, 'I127801': 2, 'I173702': 3, 'I337046': 4, 'I227836': 5, 'I121056': 6, 'I293112': 7, 'I293455': 8, 'I326945': 9, 'I306368': 10, 'I139207': 11, 'I640508': 12, 'I179802': 13, 'I336189': 14, 'I688768': 15, 'I294687': 16, 'I195744': 17, 'I993048': 18, 'I992454': 19, 'I183746': 20, 'I412481': 21, 'I823067': 22, 'I252275': 23, 'I174250': 24, 'I195396': 25, 'I196704': 26, 'I217053': 27, 'I655874': 28, 'I209744': 29, 'I161949': 30, 'I331891': 31, 'I768348': 32, 'I109053': 33, 'I167211': 34, 'I339732': 35, 'I147062': 36, 'I187519': 37, 'I106674': 38, 'I473411': 39, 'I246550': 40, 'I138674': 41, 'I752693': 42, 'I826174': 43, 'I296025': 44, 'I117291': 45, 'I267193': 46, 'I205366': 47, 'I269690': 48, 'I304265': 49, 'I246562': 50, 'I202367': 51, 'I664787': 52, 'I160777': 53, 'I137794': 54, 'I148377': 55, 'I258195': 56, 'I300213': 57, 'I263803': 58, 'I335713': 59, 'I133061': 60, 'I207205': 61, 'I209289': 62, 'I157285': 63, 'I218590': 64, 'I181109': 65, 'I221715': 66, 'I20

In [14]:
# Replace every invoice number with its integer code from invoice_mapping.
data['invoice_no']=data['invoice_no'].map(invoice_mapping)

In [15]:
# Lookup from each distinct customer id to an integer code (first appearance = 0).
distinct_customer_ids = data['customer_id'].unique()
customer_id_mapping = dict(zip(distinct_customer_ids, range(len(distinct_customer_ids))))
print(customer_id_mapping)

# Swap the text ids in the frame for their integer codes.
data['customer_id'] = data['customer_id'].map(customer_id_mapping)

{'C241288': 0, 'C111565': 1, 'C266599': 2, 'C988172': 3, 'C189076': 4, 'C657758': 5, 'C151197': 6, 'C176086': 7, 'C159642': 8, 'C283361': 9, 'C240286': 10, 'C191708': 11, 'C225330': 12, 'C312861': 13, 'C555402': 14, 'C362288': 15, 'C300786': 16, 'C330667': 17, 'C218149': 18, 'C196845': 19, 'C220180': 20, 'C125696': 21, 'C322947': 22, 'C313348': 23, 'C204553': 24, 'C285161': 25, 'C289625': 26, 'C192344': 27, 'C447138': 28, 'C251229': 29, 'C159164': 30, 'C501658': 31, 'C176727': 32, 'C232624': 33, 'C164092': 34, 'C276887': 35, 'C245456': 36, 'C450287': 37, 'C204279': 38, 'C452806': 39, 'C716788': 40, 'C155059': 41, 'C306662': 42, 'C607615': 43, 'C120164': 44, 'C134449': 45, 'C317818': 46, 'C241642': 47, 'C126436': 48, 'C653385': 49, 'C227070': 50, 'C317478': 51, 'C237330': 52, 'C626042': 53, 'C133687': 54, 'C841663': 55, 'C213742': 56, 'C962515': 57, 'C112279': 58, 'C158837': 59, 'C336576': 60, 'C716161': 61, 'C439382': 62, 'C123427': 63, 'C224743': 64, 'C119549': 65, 'C187266': 66, 'C17

In [16]:
# Lookup from each distinct gender label to an integer code (first appearance = 0).
gender_mapping = {label: code for code, label in enumerate(data['gender'].unique())}
print(gender_mapping)

# Swap the text labels in the frame for their integer codes.
data['gender'] = data['gender'].map(gender_mapping)

{'Female': 0, 'Male': 1}


In [17]:
# Lookup from each distinct product category to an integer code (first appearance = 0).
# The codes are arbitrary labels; their numeric order carries no meaning.
distinct_categories = data['category'].unique()
category_mapping = dict(zip(distinct_categories, range(len(distinct_categories))))
print(category_mapping)

# Swap the category names in the frame for their integer codes.
data['category'] = data['category'].map(category_mapping)

{'Clothing': 0, 'Shoes': 1, 'Books': 2, 'Cosmetics': 3, 'Food & Beverage': 4, 'Toys': 5, 'Technology': 6, 'Souvenir': 7}


In [18]:
# Lookup from each distinct payment method to an integer code (first appearance = 0).
payment_method_mapping = {
    method: code for code, method in enumerate(data['payment_method'].unique())
}
print(payment_method_mapping)

# Swap the payment-method names in the frame for their integer codes.
data['payment_method'] = data['payment_method'].map(payment_method_mapping)

{'Credit Card': 0, 'Debit Card': 1, 'Cash': 2}


In [19]:
# Lookup from each distinct invoice-date string to an integer code (first appearance = 0).
# NOTE(review): the codes follow first appearance, not the calendar - in the printed mapping
# '5/8/2022' gets 0 and '12/12/2021' gets 1 - so the encoded value has no chronological
# meaning. Consider parsing the dates instead; confirm the day/month order of the raw
# strings before doing so.
invoice_date_mapping = {
    date_text: code for code, date_text in enumerate(data['invoice_date'].unique())
}
print(invoice_date_mapping)

# Swap the date strings in the frame for their integer codes.
data['invoice_date'] = data['invoice_date'].map(invoice_date_mapping)

{'5/8/2022': 0, '12/12/2021': 1, '9/11/2021': 2, '16/05/2021': 3, '24/10/2021': 4, '24/05/2022': 5, '13/03/2022': 6, '13/01/2021': 7, '4/11/2021': 8, '22/08/2021': 9, '25/12/2022': 10, '28/10/2022': 11, '31/07/2022': 12, '17/11/2022': 13, '3/6/2022': 14, '7/11/2021': 15, '16/01/2021': 16, '5/1/2022': 17, '26/07/2021': 18, '7/3/2023': 19, '15/02/2023': 20, '1/5/2021': 21, '18/06/2022': 22, '26/10/2021': 23, '16/12/2022': 24, '20/04/2022': 25, '10/10/2022': 26, '23/08/2022': 27, '29/04/2021': 28, '4/7/2022': 29, '21/11/2021': 30, '23/02/2022': 31, '11/7/2021': 32, '30/08/2022': 33, '4/1/2023': 34, '21/06/2022': 35, '8/7/2022': 36, '27/02/2022': 37, '19/12/2022': 38, '10/9/2021': 39, '14/02/2022': 40, '28/04/2022': 41, '20/06/2022': 42, '21/04/2022': 43, '9/12/2022': 44, '12/1/2023': 45, '7/11/2022': 46, '7/2/2022': 47, '13/06/2021': 48, '23/08/2021': 49, '24/02/2023': 50, '22/02/2022': 51, '12/3/2021': 52, '2/1/2022': 53, '26/11/2021': 54, '21/04/2021': 55, '23/06/2021': 56, '14/12/2021'

In [20]:
# Lookup from each distinct shopping mall to an integer code (first appearance = 0).
distinct_malls = data['shopping_mall'].unique()
shopping_mall_mapping = dict(zip(distinct_malls, range(len(distinct_malls))))
print(shopping_mall_mapping)

# Swap the mall names in the frame for their integer codes.
data['shopping_mall'] = data['shopping_mall'].map(shopping_mall_mapping)

{'Kanyon': 0, 'Forum Istanbul': 1, 'Metrocity': 2, 'Metropol AVM': 3, 'Istinye Park': 4, 'Mall of Istanbul': 5, 'Emaar Square Mall': 6, 'Cevahir AVM': 7, 'Viaport Outlet': 8, 'Zorlu Center': 9}


In [21]:
# Inspect the frame after encoding: the former text columns should now hold integer codes.
data

Unnamed: 0,invoice_no,customer_id,gender,age,category,quantity,price,payment_method,invoice_date,shopping_mall
0,0,0,0,28,0,5,1500.40,0,0,0
1,1,1,1,21,1,3,1800.51,1,1,1
2,2,2,1,20,0,1,300.08,2,2,2
3,3,3,0,66,1,5,3000.85,0,3,3
4,4,4,0,53,2,4,60.60,2,4,0
...,...,...,...,...,...,...,...,...,...,...
99452,99452,99452,0,45,7,5,58.65,0,102,0
99453,99453,99453,1,27,4,2,10.46,2,422,1
99454,99454,99454,1,63,4,2,10.46,1,163,2
99455,99455,99455,1,56,6,4,4200.00,2,543,4


In [22]:
# Copy the target into a new `Price` column; a newly assigned column is appended at the end,
# which is what the positional feature/label split further down relies on.
data['Price']=data['price']

In [23]:
# Remove the original lowercase `price` column now that `Price` holds the same values.
# The drop is in place, so running this cell a second time raises KeyError.
data.drop(columns=["price"], inplace=True)

In [34]:
# Remove the encoded invoice number so it is not used as a model feature.
# The drop is in place, so running this cell a second time raises KeyError.
data.drop(columns=["invoice_no"], inplace=True)

In [35]:
# Inspect the frame that feeds the model: `Price` should now be the last column.
data

Unnamed: 0,customer_id,gender,age,category,quantity,payment_method,invoice_date,shopping_mall,Price
0,0,0,28,0,5,0,0,0,1500.40
1,1,1,21,1,3,1,1,1,1800.51
2,2,1,20,0,1,2,2,2,300.08
3,3,0,66,1,5,0,3,3,3000.85
4,4,0,53,2,4,2,4,0,60.60
...,...,...,...,...,...,...,...,...,...
99452,99452,0,45,7,5,0,102,0,58.65
99453,99453,1,27,4,2,2,422,1,10.46
99454,99454,1,63,4,2,1,163,2,10.46
99455,99455,1,56,6,4,2,543,4,4200.00


## Creating the features and labels

In [36]:
# Features: every column except the last. Label: the last column (`Price`).
# The split is positional, so it depends on `Price` having been moved to the end.
# NOTE(review): in the frame displayed above, the customer_id code equals the row index,
# which suggests it is unique per row and carries no predictive signal - consider
# excluding it from X (confirm first).
X = data.iloc[:, :-1].to_numpy()
y = data.iloc[:, -1].to_numpy()

## Splitting the dataset into a training set and a testing set so the model can be evaluated on data it has not seen

In [37]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## Training the model on the training set

In [38]:
# NOTE(review): train_test_split is already imported by the splitting cell; this re-import is redundant.
from sklearn.model_selection import train_test_split
# NOTE(review): of the metrics/estimators imported below, only XGBRegressor is used in the
# visible part of the notebook; the rest look like leftovers from model comparison.
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

# Gradient-boosted tree regressor with the library's default hyper-parameters,
# fitted on the training split only.
regressor=XGBRegressor()
regressor.fit(X_train,y_train)

## Predicting the results on the testing dataset

In [39]:
# Predict the price for every held-out row.
y_pred = regressor.predict(X_test)

# Print actual (left column) next to predicted (right column), rounded to two decimals.
# set_printoptions is a global NumPy setting and stays in effect for later cells.
np.set_printoptions(precision=2)
print(np.column_stack((y_test, y_pred)))

[[  35.84   35.84]
 [  15.69   15.69]
 [2100.   2100.  ]
 ...
 [ 162.64  162.64]
 [1200.32 1200.32]
 [  40.66   40.66]]


## Computing the absolute difference between the actual and predicted values and displaying it as a DataFrame

In [42]:
# One row per test observation: true price, predicted price, and the size of the miss.
actual_vs_predicted = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
signed_error = actual_vs_predicted['Actual'].sub(actual_vs_predicted['Predicted'])
actual_vs_predicted['Absolute Difference'] = signed_error.abs()
actual_vs_predicted

Unnamed: 0,Actual,Predicted,Absolute Difference
0,35.84,35.839989,0.000011
1,15.69,15.689973,0.000027
2,2100.00,2099.999756,0.000244
3,1500.40,1500.399902,0.000098
4,600.16,600.160034,0.000034
...,...,...,...
19887,121.98,121.980026,0.000026
19888,71.68,71.680016,0.000016
19889,162.64,162.639954,0.000046
19890,1200.32,1200.319824,0.000176


## Evaluating the model with the R² score (`r2_score`)

In [40]:
from sklearn.metrics import r2_score
# Coefficient of determination (R²) of the predictions on the held-out test set; 1.0 is a perfect fit.
# NOTE(review): a near-perfect score deserves a second look. In the rows displayed earlier,
# price looks like a fixed per-category unit price times quantity (Clothing: 300.08 for 1 unit,
# 1500.40 for 5; Shoes: 1800.51 for 3, 3000.85 for 5), which would make the target almost
# fully determined by `category` and `quantity`. Confirm before reporting this as model skill.
r2_score(y_test,y_pred)

0.9999999999999852