In [1]:
import pandas as pd
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from datasist.structdata import detect_outliers

In [2]:
# Read the customer data set from a CSV file in the working directory
# and display the resulting frame.
df=pd.read_csv('Ecommerce_Customers.csv')
df

Unnamed: 0,Email,Address,Avatar,Avg. Session Length,Time on App,Time on Website,Length of Membership,Yearly Amount Spent
0,mstephenson@fernandez.com,"835 Frank Tunnel\nWrightmouth, MI 82180-9605",Violet,34.497268,12.655651,39.577668,4.082621,587.951054
1,hduke@hotmail.com,"4547 Archer Common\nDiazchester, CA 06566-8576",DarkGreen,31.926272,11.109461,37.268959,2.664034,392.204933
2,pallen@yahoo.com,"24645 Valerie Unions Suite 582\nCobbborough, D...",Bisque,33.000915,11.330278,37.110597,4.104543,487.547505
3,riverarebecca@gmail.com,"1414 David Throughway\nPort Jason, OH 22070-1220",SaddleBrown,34.305557,13.717514,36.721283,3.120179,581.852344
4,mstephens@davidson-herman.com,"14023 Rodriguez Passage\nPort Jacobville, PR 3...",MediumAquaMarine,33.330673,12.795189,37.536653,4.446308,599.406092
...,...,...,...,...,...,...,...,...
495,lewisjessica@craig-evans.com,"4483 Jones Motorway Suite 872\nLake Jamiefurt,...",Tan,33.237660,13.566160,36.417985,3.746573,573.847438
496,katrina56@gmail.com,"172 Owen Divide Suite 497\nWest Richard, CA 19320",PaleVioletRed,34.702529,11.695736,37.190268,3.576526,529.049004
497,dale88@hotmail.com,"0787 Andrews Ranch Apt. 633\nSouth Chadburgh, ...",Cornsilk,32.646777,11.499409,38.332576,4.958264,551.620145
498,cwilson@hotmail.com,"680 Jennifer Lodge Apt. 808\nBrendachester, TX...",Teal,33.322501,12.391423,36.840086,2.336485,456.469510


In [3]:
# Normalise every column label to lower case, then show the new labels.
df.columns=[label.lower() for label in df.columns]
df.columns

Index(['email', 'address', 'avatar', 'avg. session length', 'time on app',
       'time on website', 'length of membership', 'yearly amount spent'],
      dtype='object')

In [4]:
# Remove the text columns (email, address, avatar) so only numeric columns remain.
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps this cell safe to re-run: a second execution is a no-op rather than
# a KeyError for the columns that are already gone.
df=df.drop(columns=['email', 'address', 'avatar'],errors='ignore')
df

Unnamed: 0,avg. session length,time on app,time on website,length of membership,yearly amount spent
0,34.497268,12.655651,39.577668,4.082621,587.951054
1,31.926272,11.109461,37.268959,2.664034,392.204933
2,33.000915,11.330278,37.110597,4.104543,487.547505
3,34.305557,13.717514,36.721283,3.120179,581.852344
4,33.330673,12.795189,37.536653,4.446308,599.406092
...,...,...,...,...,...
495,33.237660,13.566160,36.417985,3.746573,573.847438
496,34.702529,11.695736,37.190268,3.576526,529.049004
497,32.646777,11.499409,38.332576,4.958264,551.620145
498,33.322501,12.391423,36.840086,2.336485,456.469510


In [5]:
# Scatter plot of membership length (x axis) against yearly amount spent (y axis).
px.scatter(df,x='length of membership',y='yearly amount spent')

In [6]:
# Pairwise correlation between all remaining columns, including the target.
df.corr()

Unnamed: 0,avg. session length,time on app,time on website,length of membership,yearly amount spent
avg. session length,1.0,-0.027826,-0.034987,0.060247,0.355088
time on app,-0.027826,1.0,0.082388,0.029143,0.499328
time on website,-0.034987,0.082388,1.0,-0.047582,-0.002641
length of membership,0.060247,0.029143,-0.047582,1.0,0.809084
yearly amount spent,0.355088,0.499328,-0.002641,0.809084,1.0


In [7]:
# Separate the prediction target from the input features.
target_column='yearly amount spent'
y=df[target_column]
x=df.drop(columns=target_column)

In [8]:
# Hold out 20% of the rows for testing. A fixed random_state makes the split,
# and therefore every score reported below, reproducible across
# Restart & Run All. Scores will differ from those of an unseeded run.
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [9]:
# Learn the scaling parameters from the training rows only, then apply the
# same fitted scaler to both splits so the test rows never influence the fit.
scaler=StandardScaler().fit(x_train)
x_train_scaled=scaler.transform(x_train)
x_test_scaled=scaler.transform(x_test)

In [10]:
# Expand the scaled features into degree-4 polynomial terms.
# The expansion is fitted on the training split and reused for the test split.
poly_obj=PolynomialFeatures(degree=4).fit(x_train_scaled)
x_train_scaled_poly=poly_obj.transform(x_train_scaled)
x_test_scaled_poly=poly_obj.transform(x_test_scaled)

In [11]:
# Baseline estimator: ordinary linear regression with default settings.
model=LinearRegression()

In [12]:
# Fit the linear model on the scaled (non-polynomial) training features.
model.fit(x_train_scaled,y_train)

In [13]:
# Score on the training split (for scikit-learn regressors, score() is R^2).
model.score(x_train_scaled,y_train)

0.9856751943321411

In [14]:
# Score on the held-out test split, to compare against the training score.
model.score(x_test_scaled,y_test)

0.9776438083359987

In [15]:
# Refit the same LinearRegression object on the degree-4 polynomial features
# and report its training score. fit() returns the estimator, so the calls chain.
model.fit(x_train_scaled_poly,y_train).score(x_train_scaled_poly,y_train)

0.9880197154689843

In [16]:
# Test score of the polynomial model. In the recorded run it is lower than the
# plain linear model's test score while the training score is higher, which
# suggests the degree-4 expansion overfits.
model.score(x_test_scaled_poly,y_test)

0.9675163880256479

In [17]:
# Rebind `model` to a k-nearest-neighbours regressor using 5 neighbours.
model=KNeighborsRegressor(n_neighbors=5)

In [18]:
# Fit the KNN regressor on the scaled training features.
model.fit(x_train_scaled,y_train)

In [19]:
# Training-split score of the KNN regressor.
model.score(x_train_scaled,y_train)

0.940836797708728

In [20]:
# Test-split score of the KNN regressor.
model.score(x_test_scaled,y_test)

0.9199774173249804

In [21]:
# Candidate regressors to compare, keyed by a short display name.
# Insertion order (lr, KNN, SVR) is the order in which they are evaluated.
models=dict(
    lr=LinearRegression(),
    KNN=KNeighborsRegressor(n_neighbors=5),
    SVR=SVR(),
)

In [22]:
# Fit every candidate on the scaled training data and report its score on both
# splits. For regressors score() is R^2, even though the labels say "accuracy".
# The loop variable `model` stays bound to the last candidate after the loop ends.
for name,model in models.items():
    print('using:',name)
    model.fit(x_train_scaled,y_train)
    train_score=model.score(x_train_scaled,y_train)
    test_score=model.score(x_test_scaled,y_test)
    print('train accuracy: ',train_score)
    print('test accuracy: ',test_score)
    print('-'*20)

using: lr
train accuracy:  0.9856751943321411
test accuracy:  0.9776438083359987
--------------------
using: KNN
train accuracy:  0.940836797708728
test accuracy:  0.9199774173249804
--------------------
using: SVR
train accuracy:  0.40144920995809164
test accuracy:  0.4715275194465798
--------------------


In [23]:
# Choose the estimator to persist explicitly. After the comparison loop the
# name `model` is bound to the last entry of `models` (SVR), which had by far
# the lowest scores in the comparison. Refitting `model` as-is would train,
# and later save, that weakest estimator. Linear regression had the highest
# test score, so select it by key before the final fit.
model=models['lr']
model.fit(x_train_scaled,y_train)


In [24]:
# Record the input column names, in order, as a plain Python list.
features=x.columns.to_list()

In [25]:
# Display the feature-name list.
features

['avg. session length',
 'time on app',
 'time on website',
 'length of membership']

In [26]:
import joblib

In [27]:
# Persist the estimator currently bound to `model`, the fitted scaler and the
# feature-name list, each to its own file in the working directory.
# NOTE(review): the files use an .h5 extension but joblib.dump writes joblib's
# pickle-based format, not HDF5 — consider renaming if downstream loaders allow.
joblib.dump(model,'ecomm_model.h5')
joblib.dump(scaler,'ecomm_scaler.h5')
joblib.dump(features,'ecomm_features.h5')

['ecomm_features.h5']