# Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

# Loading dataset

In [2]:
data = pd.read_csv('online_shoppers_intention.csv')

In [3]:
data.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


In [4]:
data1 = data.copy()

# Encoding the Categorical Columns to Numerical

In [5]:
from sklearn.preprocessing import LabelEncoder

In [6]:
LB = LabelEncoder()

In [7]:
data1['Month'] = LB.fit_transform(data['Month'])
data1['VisitorType'] = LB.fit_transform(data['VisitorType'])
data1['Weekend'] = LB.fit_transform(data['Weekend'])
data1['Revenue'] = LB.fit_transform(data['Revenue'])

In [8]:
data1.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,2,1,1,1,1,2,0,0
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,2,2,2,1,2,2,0,0
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,2,4,1,9,3,2,0,0
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,2,3,2,2,4,2,0,0
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,2,3,3,1,4,2,1,0


In [11]:
from sklearn.preprocessing import StandardScaler,MinMaxScaler,MaxAbsScaler,RobustScaler,QuantileTransformer,PowerTransformer,Normalizer

In [12]:
sc = StandardScaler()
sc_data = sc.fit_transform(data1)
sc_data = pd.DataFrame(sc_data,columns=data1.columns)

In [13]:
mm = MinMaxScaler()
mm_data = mm.fit_transform(data1)
mm_data = pd.DataFrame(mm_data,columns=data1.columns)

In [51]:
ma = MaxAbsScaler()
ma_data = ma.fit_transform(data1)
ma_data = pd.DataFrame(ma_data,columns=data1.columns)

In [14]:
rs = RobustScaler()
rs_data = rs.fit_transform(data1)
rs_data = pd.DataFrame(rs_data,columns=data1.columns)

In [15]:
qt = QuantileTransformer(output_distribution='normal')
qt_data = qt.fit_transform(data1)
qt_data = pd.DataFrame(qt_data,columns=data1.columns)

In [16]:
qt_u = QuantileTransformer(output_distribution='uniform')
qtu_data = qt_u.fit_transform(data1)
qtu_data = pd.DataFrame(qtu_data,columns=data1.columns)

In [17]:
pt = PowerTransformer(method='yeo-johnson')
pt_data = pt.fit_transform(data1)
pt_data = pd.DataFrame(pt_data,columns=data1.columns)

In [104]:
nm = Normalizer()
nm_data = nm.fit_transform(data1)
nm_data =pd.DataFrame(nm_data,columns=data1.columns)

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV,KFold
import sklearn.metrics as metrics

# Standard Scalar

## Decision Tree

In [19]:
X = sc_data.drop('Revenue',axis=1)
Y = data1.Revenue

In [20]:
DT = DecisionTreeClassifier()

In [21]:
parameter ={'max_depth':np.arange(1,10)}
GS=GridSearchCV(DT,parameter,cv=3)
GS.fit(X,Y)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_depth': array([1, 2, 3, 4, 5, 6, 7, 8, 9])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [22]:
GS.best_params_

{'max_depth': 2}

In [23]:
DT = DecisionTreeClassifier(max_depth=2)

###  K Fold Cross Validation

In [24]:
kf = KFold(n_splits=5,shuffle=True,random_state=0)

In [25]:
precision=[]
recall  =[]
roc_auc = []
for train,test in kf.split(X,Y):
    xtrain,xtest=X.iloc[train,:],X.iloc[test,:]
    ytrain,ytest=Y.iloc[train],Y.iloc[test]
    DT.fit(xtrain,ytrain)
    y_predict = DT.predict(xtest)
    precision.append(metrics.precision_score(ytest,y_predict))
    recall.append(metrics.recall_score(ytest,y_predict))
    fpr,tpr,_=metrics.roc_curve(ytest,y_predict)
    roc_auc.append(metrics.auc(fpr,tpr))
print(' Precision: %0.02f  Recall: %0.02f  ROC-AOC: %0.02F'%(np.mean(precision),np.mean(recall),np.mean(roc_auc)))

 Precision: 0.73  Recall: 0.46  ROC-AOC: 0.71


## Random Forest

In [26]:
RT = RandomForestClassifier(random_state=0)

In [27]:
parameter ={'n_estimators':np.arange(1,10),'max_depth':np.arange(1,10)}
GS=GridSearchCV(RT,parameter,cv=3)
GS.fit(X,Y)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': array([1, 2, 3, 4, 5, 6, 7, 8, 9]), 'max_depth': array([1, 2, 3, 4, 5, 6, 7, 8, 9])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [28]:
GS.best_params_

{'max_depth': 5, 'n_estimators': 3}

In [29]:
RT = RandomForestClassifier(n_estimators=3,max_depth=5,random_state=0)

###  K Fold Cross Validation

In [30]:
kf = KFold(n_splits=5,shuffle=True,random_state=0)

In [31]:
precision=[]
recall  =[]
roc_auc = []
for train,test in kf.split(X,Y):
    xtrain,xtest=X.iloc[train,:],X.iloc[test,:]
    ytrain,ytest=Y.iloc[train],Y.iloc[test]
    RT.fit(xtrain,ytrain)
    y_predict = RT.predict(xtest)
    precision.append(metrics.precision_score(ytest,y_predict))
    recall.append(metrics.recall_score(ytest,y_predict))
    fpr,tpr,_=metrics.roc_curve(ytest,y_predict)
    roc_auc.append(metrics.auc(fpr,tpr))
print(' Precision: %0.02f  Recall: %0.02f  ROC-AOC: %0.02F'%(np.mean(precision),np.mean(recall),np.mean(roc_auc)))

 Precision: 0.77  Recall: 0.32  ROC-AOC: 0.65


# MinMax Scalar

## Decision Tree

In [58]:
X = mm_data.drop('Revenue',axis=1)
Y = data1.Revenue

In [59]:
DT = DecisionTreeClassifier()

In [60]:
parameter ={'max_depth':np.arange(1,10)}
GS=GridSearchCV(DT,parameter,cv=3)
GS.fit(X,Y)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_depth': array([1, 2, 3, 4, 5, 6, 7, 8, 9])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [61]:
GS.best_params_

{'max_depth': 2}

In [62]:
DT = DecisionTreeClassifier(max_depth=2)

###  K Fold Cross Validation

In [63]:
kf = KFold(n_splits=5,shuffle=True,random_state=0)

In [64]:
precision=[]
recall  =[]
roc_auc = []
for train,test in kf.split(X,Y):
    xtrain,xtest=X.iloc[train,:],X.iloc[test,:]
    ytrain,ytest=Y.iloc[train],Y.iloc[test]
    DT.fit(xtrain,ytrain)
    y_predict = DT.predict(xtest)
    precision.append(metrics.precision_score(ytest,y_predict))
    recall.append(metrics.recall_score(ytest,y_predict))
    fpr,tpr,_=metrics.roc_curve(ytest,y_predict)
    roc_auc.append(metrics.auc(fpr,tpr))
print(' Precision: %0.02f  Recall: %0.02f  ROC-AOC: %0.02F'%(np.mean(precision),np.mean(recall),np.mean(roc_auc)))

 Precision: 0.73  Recall: 0.46  ROC-AOC: 0.71


## Random Forest

In [65]:
RT = RandomForestClassifier(random_state=0)

In [66]:
parameter ={'n_estimators':np.arange(1,10),'max_depth':np.arange(1,10)}
GS=GridSearchCV(RT,parameter,cv=3)
GS.fit(X,Y)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': array([1, 2, 3, 4, 5, 6, 7, 8, 9]), 'max_depth': array([1, 2, 3, 4, 5, 6, 7, 8, 9])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [67]:
GS.best_params_

{'max_depth': 5, 'n_estimators': 3}

In [33]:
RT = RandomForestClassifier(n_estimators=3,max_depth=5,random_state=0)

###  K Fold Cross Validation

In [34]:
kf = KFold(n_splits=5,shuffle=True,random_state=0)

In [50]:
precision=[]
recall  =[]
roc_auc = []
for train,test in kf.split(X,Y):
    xtrain,xtest=X.iloc[train,:],X.iloc[test,:]
    ytrain,ytest=Y.iloc[train],Y.iloc[test]
    RT.fit(xtrain,ytrain)
    y_predict = RT.predict(xtest)
    precision.append(metrics.precision_score(ytest,y_predict))
    recall.append(metrics.recall_score(ytest,y_predict))
    fpr,tpr,_=metrics.roc_curve(ytest,y_predict)
    roc_auc.append(metrics.auc(fpr,tpr))
print(' Precision: %0.02f  Recall: %0.02f  ROC-AOC: %0.02F'%(np.mean(precision),np.mean(recall),np.mean(roc_auc)))

 Precision: 0.76  Recall: 0.45  ROC-AOC: 0.71


   # MAXABS Scalar

## Decision Tree

In [52]:
X = ma_data.drop('Revenue',axis=1)
Y = data1.Revenue

In [53]:
DT = DecisionTreeClassifier()

In [54]:
parameter ={'max_depth':np.arange(1,10)}
GS=GridSearchCV(DT,parameter,cv=3)
GS.fit(X,Y)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_depth': array([1, 2, 3, 4, 5, 6, 7, 8, 9])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [55]:
GS.best_params_

{'max_depth': 2}

In [56]:
DT = DecisionTreeClassifier(max_depth=2)

###  K Fold Cross Validation

In [57]:
kf = KFold(n_splits=5,shuffle=True,random_state=0)

In [58]:
precision=[]
recall  =[]
roc_auc = []
for train,test in kf.split(X,Y):
    xtrain,xtest=X.iloc[train,:],X.iloc[test,:]
    ytrain,ytest=Y.iloc[train],Y.iloc[test]
    DT.fit(xtrain,ytrain)
    y_predict = DT.predict(xtest)
    precision.append(metrics.precision_score(ytest,y_predict))
    recall.append(metrics.recall_score(ytest,y_predict))
    fpr,tpr,_=metrics.roc_curve(ytest,y_predict)
    roc_auc.append(metrics.auc(fpr,tpr))
print(' Precision: %0.02f  Recall: %0.02f  ROC-AOC: %0.02F'%(np.mean(precision),np.mean(recall),np.mean(roc_auc)))

 Precision: 0.73  Recall: 0.46  ROC-AOC: 0.71


## Random Forest

In [59]:
RT = RandomForestClassifier(random_state=0)

In [60]:
parameter ={'n_estimators':np.arange(1,10),'max_depth':np.arange(1,10)}
GS=GridSearchCV(RT,parameter,cv=3)
GS.fit(X,Y)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': array([1, 2, 3, 4, 5, 6, 7, 8, 9]), 'max_depth': array([1, 2, 3, 4, 5, 6, 7, 8, 9])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [61]:
GS.best_params_

{'max_depth': 5, 'n_estimators': 3}

In [62]:
RT = RandomForestClassifier(n_estimators=3,max_depth=5,random_state=0)

###  K Fold Cross Validation

In [63]:
kf = KFold(n_splits=5,shuffle=True,random_state=0)

In [64]:
precision=[]
recall  =[]
roc_auc = []
for train,test in kf.split(X,Y):
    xtrain,xtest=X.iloc[train,:],X.iloc[test,:]
    ytrain,ytest=Y.iloc[train],Y.iloc[test]
    RT.fit(xtrain,ytrain)
    y_predict = RT.predict(xtest)
    precision.append(metrics.precision_score(ytest,y_predict))
    recall.append(metrics.recall_score(ytest,y_predict))
    fpr,tpr,_=metrics.roc_curve(ytest,y_predict)
    roc_auc.append(metrics.auc(fpr,tpr))
print(' Precision: %0.02f  Recall: %0.02f  ROC-AOC: %0.02F'%(np.mean(precision),np.mean(recall),np.mean(roc_auc)))

 Precision: 0.72  Recall: 0.54  ROC-AOC: 0.75


# Robust Scalar

## Decision Tree

In [36]:
X = rs_data.drop('Revenue',axis=1)
Y = data1.Revenue

In [37]:
DT = DecisionTreeClassifier()

In [38]:
parameter ={'max_depth':np.arange(1,10)}
GS=GridSearchCV(DT,parameter,cv=3)
GS.fit(X,Y)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_depth': array([1, 2, 3, 4, 5, 6, 7, 8, 9])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [39]:
GS.best_params_

{'max_depth': 2}

In [40]:
DT = DecisionTreeClassifier(max_depth=2)

###  K Fold Cross Validation

In [41]:
kf = KFold(n_splits=5,shuffle=True,random_state=0)

In [42]:
precision=[]
recall  =[]
roc_auc = []
for train,test in kf.split(X,Y):
    xtrain,xtest=X.iloc[train,:],X.iloc[test,:]
    ytrain,ytest=Y.iloc[train],Y.iloc[test]
    DT.fit(xtrain,ytrain)
    y_predict = DT.predict(xtest)
    precision.append(metrics.precision_score(ytest,y_predict))
    recall.append(metrics.recall_score(ytest,y_predict))
    fpr,tpr,_=metrics.roc_curve(ytest,y_predict)
    roc_auc.append(metrics.auc(fpr,tpr))
print(' Precision: %0.02f  Recall: %0.02f  ROC-AOC: %0.02F'%(np.mean(precision),np.mean(recall),np.mean(roc_auc)))

 Precision: 0.73  Recall: 0.46  ROC-AOC: 0.71


## Random Forest

In [43]:
RT = RandomForestClassifier(random_state=0)

In [44]:
parameter ={'n_estimators':np.arange(1,10),'max_depth':np.arange(1,10)}
GS=GridSearchCV(RT,parameter,cv=3)
GS.fit(X,Y)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': array([1, 2, 3, 4, 5, 6, 7, 8, 9]), 'max_depth': array([1, 2, 3, 4, 5, 6, 7, 8, 9])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [45]:
GS.best_params_

{'max_depth': 5, 'n_estimators': 6}

In [47]:
RT = RandomForestClassifier(n_estimators=6,max_depth=5,random_state=0)

###  K Fold Cross Validation

In [48]:
kf = KFold(n_splits=5,shuffle=True,random_state=0)

In [49]:
precision=[]
recall  =[]
roc_auc = []
for train,test in kf.split(X,Y):
    xtrain,xtest=X.iloc[train,:],X.iloc[test,:]
    ytrain,ytest=Y.iloc[train],Y.iloc[test]
    RT.fit(xtrain,ytrain)
    y_predict = RT.predict(xtest)
    precision.append(metrics.precision_score(ytest,y_predict))
    recall.append(metrics.recall_score(ytest,y_predict))
    fpr,tpr,_=metrics.roc_curve(ytest,y_predict)
    roc_auc.append(metrics.auc(fpr,tpr))
print(' Precision: %0.02f  Recall: %0.02f  ROC-AOC: %0.02F'%(np.mean(precision),np.mean(recall),np.mean(roc_auc)))

 Precision: 0.76  Recall: 0.45  ROC-AOC: 0.71


# QuantileTransformer-Normal Scalar

## Decision Tree

In [65]:
X = qt_data.drop('Revenue',axis=1)
Y = data1.Revenue

In [66]:
DT = DecisionTreeClassifier()

In [67]:
parameter ={'max_depth':np.arange(1,10)}
GS=GridSearchCV(DT,parameter,cv=3)
GS.fit(X,Y)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_depth': array([1, 2, 3, 4, 5, 6, 7, 8, 9])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [68]:
GS.best_params_

{'max_depth': 2}

In [69]:
DT = DecisionTreeClassifier(max_depth=2)

###  K Fold Cross Validation

In [70]:
kf = KFold(n_splits=5,shuffle=True,random_state=0)

In [71]:
precision=[]
recall  =[]
roc_auc = []
for train,test in kf.split(X,Y):
    xtrain,xtest=X.iloc[train,:],X.iloc[test,:]
    ytrain,ytest=Y.iloc[train],Y.iloc[test]
    DT.fit(xtrain,ytrain)
    y_predict = DT.predict(xtest)
    precision.append(metrics.precision_score(ytest,y_predict))
    recall.append(metrics.recall_score(ytest,y_predict))
    fpr,tpr,_=metrics.roc_curve(ytest,y_predict)
    roc_auc.append(metrics.auc(fpr,tpr))
print(' Precision: %0.02f  Recall: %0.02f  ROC-AOC: %0.02F'%(np.mean(precision),np.mean(recall),np.mean(roc_auc)))

 Precision: 0.73  Recall: 0.46  ROC-AOC: 0.71


## Random Forest

In [72]:
RT = RandomForestClassifier(random_state=0)

In [73]:
parameter ={'n_estimators':np.arange(1,10),'max_depth':np.arange(1,10)}
GS=GridSearchCV(RT,parameter,cv=3)
GS.fit(X,Y)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': array([1, 2, 3, 4, 5, 6, 7, 8, 9]), 'max_depth': array([1, 2, 3, 4, 5, 6, 7, 8, 9])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [74]:
GS.best_params_

{'max_depth': 5, 'n_estimators': 8}

In [75]:
RT = RandomForestClassifier(n_estimators=8,max_depth=5)

###  K Fold Cross Validation

In [76]:
kf = KFold(n_splits=5,shuffle=True,random_state=0)

In [77]:
precision=[]
recall  =[]
roc_auc = []
for train,test in kf.split(X,Y):
    xtrain,xtest=X.iloc[train,:],X.iloc[test,:]
    ytrain,ytest=Y.iloc[train],Y.iloc[test]
    RT.fit(xtrain,ytrain)
    y_predict = RT.predict(xtest)
    precision.append(metrics.precision_score(ytest,y_predict))
    recall.append(metrics.recall_score(ytest,y_predict))
    fpr,tpr,_=metrics.roc_curve(ytest,y_predict)
    roc_auc.append(metrics.auc(fpr,tpr))
print(' Precision: %0.02f  Recall: %0.02f  ROC-AOC: %0.02F'%(np.mean(precision),np.mean(recall),np.mean(roc_auc)))

 Precision: 0.81  Recall: 0.40  ROC-AOC: 0.69


# QuantileTransformer-Uniform Scalar

## Decision Tree

In [78]:
X = qtu_data.drop('Revenue',axis=1)
Y = data1.Revenue

In [79]:
DT = DecisionTreeClassifier()

In [80]:
parameter ={'max_depth':np.arange(1,10)}
GS=GridSearchCV(DT,parameter,cv=3)
GS.fit(X,Y)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_depth': array([1, 2, 3, 4, 5, 6, 7, 8, 9])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [81]:
GS.best_params_

{'max_depth': 2}

In [82]:
DT = DecisionTreeClassifier(max_depth=2)

###  K Fold Cross Validation

In [83]:
kf = KFold(n_splits=5,shuffle=True,random_state=0)

In [84]:
precision=[]
recall  =[]
roc_auc = []
for train,test in kf.split(X,Y):
    xtrain,xtest=X.iloc[train,:],X.iloc[test,:]
    ytrain,ytest=Y.iloc[train],Y.iloc[test]
    DT.fit(xtrain,ytrain)
    y_predict = DT.predict(xtest)
    precision.append(metrics.precision_score(ytest,y_predict))
    recall.append(metrics.recall_score(ytest,y_predict))
    fpr,tpr,_=metrics.roc_curve(ytest,y_predict)
    roc_auc.append(metrics.auc(fpr,tpr))
print(' Precision: %0.02f  Recall: %0.02f  ROC-AOC: %0.02F'%(np.mean(precision),np.mean(recall),np.mean(roc_auc)))

 Precision: 0.73  Recall: 0.46  ROC-AOC: 0.71


## Random Forest

In [85]:
RT = RandomForestClassifier(random_state=0)

In [86]:
parameter ={'n_estimators':np.arange(1,10),'max_depth':np.arange(1,10)}
GS=GridSearchCV(RT,parameter,cv=3)
GS.fit(X,Y)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': array([1, 2, 3, 4, 5, 6, 7, 8, 9]), 'max_depth': array([1, 2, 3, 4, 5, 6, 7, 8, 9])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [87]:
GS.best_params_

{'max_depth': 5, 'n_estimators': 8}

In [88]:
RT = RandomForestClassifier(n_estimators=8,max_depth=5)

###  K Fold Cross Validation

In [89]:
kf = KFold(n_splits=5,shuffle=True,random_state=0)

In [90]:
precision=[]
recall  =[]
roc_auc = []
for train,test in kf.split(X,Y):
    xtrain,xtest=X.iloc[train,:],X.iloc[test,:]
    ytrain,ytest=Y.iloc[train],Y.iloc[test]
    RT.fit(xtrain,ytrain)
    y_predict = RT.predict(xtest)
    precision.append(metrics.precision_score(ytest,y_predict))
    recall.append(metrics.recall_score(ytest,y_predict))
    fpr,tpr,_=metrics.roc_curve(ytest,y_predict)
    roc_auc.append(metrics.auc(fpr,tpr))
print(' Precision: %0.02f  Recall: %0.02f  ROC-AOC: %0.02F'%(np.mean(precision),np.mean(recall),np.mean(roc_auc)))

 Precision: 0.76  Recall: 0.45  ROC-AOC: 0.71


# PowerTransformer Scalar

## Decision Tree

In [91]:
X = pt_data.drop('Revenue',axis=1)
Y = data1.Revenue

In [92]:
DT = DecisionTreeClassifier()

In [93]:
parameter ={'max_depth':np.arange(1,10)}
GS=GridSearchCV(DT,parameter,cv=3)
GS.fit(X,Y)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_depth': array([1, 2, 3, 4, 5, 6, 7, 8, 9])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [94]:
GS.best_params_

{'max_depth': 2}

In [95]:
DT = DecisionTreeClassifier(max_depth=2)

###  K Fold Cross Validation

In [96]:
kf = KFold(n_splits=5,shuffle=True,random_state=0)

In [97]:
precision=[]
recall  =[]
roc_auc = []
for train,test in kf.split(X,Y):
    xtrain,xtest=X.iloc[train,:],X.iloc[test,:]
    ytrain,ytest=Y.iloc[train],Y.iloc[test]
    DT.fit(xtrain,ytrain)
    y_predict = DT.predict(xtest)
    precision.append(metrics.precision_score(ytest,y_predict))
    recall.append(metrics.recall_score(ytest,y_predict))
    fpr,tpr,_=metrics.roc_curve(ytest,y_predict)
    roc_auc.append(metrics.auc(fpr,tpr))
print(' Precision: %0.02f  Recall: %0.02f  ROC-AOC: %0.02F'%(np.mean(precision),np.mean(recall),np.mean(roc_auc)))

 Precision: 0.73  Recall: 0.46  ROC-AOC: 0.71


## Random Forest

In [98]:
RT = RandomForestClassifier(random_state=0)

In [99]:
parameter ={'n_estimators':np.arange(1,10),'max_depth':np.arange(1,10)}
GS=GridSearchCV(RT,parameter,cv=3)
GS.fit(X,Y)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': array([1, 2, 3, 4, 5, 6, 7, 8, 9]), 'max_depth': array([1, 2, 3, 4, 5, 6, 7, 8, 9])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [100]:
GS.best_params_

{'max_depth': 5, 'n_estimators': 8}

In [101]:
RT = RandomForestClassifier(n_estimators=8,max_depth=5)

###  K Fold Cross Validation

In [102]:
kf = KFold(n_splits=5,shuffle=True,random_state=0)

In [103]:
precision=[]
recall  =[]
roc_auc = []
for train,test in kf.split(X,Y):
    xtrain,xtest=X.iloc[train,:],X.iloc[test,:]
    ytrain,ytest=Y.iloc[train],Y.iloc[test]
    RT.fit(xtrain,ytrain)
    y_predict = RT.predict(xtest)
    precision.append(metrics.precision_score(ytest,y_predict))
    recall.append(metrics.recall_score(ytest,y_predict))
    fpr,tpr,_=metrics.roc_curve(ytest,y_predict)
    roc_auc.append(metrics.auc(fpr,tpr))
print(' Precision: %0.02f  Recall: %0.02f  ROC-AOC: %0.02F'%(np.mean(precision),np.mean(recall),np.mean(roc_auc)))

 Precision: 0.76  Recall: 0.48  ROC-AOC: 0.73


# Normalizer

## Decision Tree

In [105]:
X = nm_data.drop('Revenue',axis=1)
Y = data1.Revenue

In [106]:
DT = DecisionTreeClassifier()

In [107]:
parameter ={'max_depth':np.arange(1,10)}
GS=GridSearchCV(DT,parameter,cv=3)
GS.fit(X,Y)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_depth': array([1, 2, 3, 4, 5, 6, 7, 8, 9])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [108]:
GS.best_params_

{'max_depth': 1}

In [109]:
DT = DecisionTreeClassifier(max_depth=1)

###  K Fold Cross Validation

In [110]:
kf = KFold(n_splits=5,shuffle=True,random_state=0)

In [111]:
precision=[]
recall  =[]
roc_auc = []
for train,test in kf.split(X,Y):
    xtrain,xtest=X.iloc[train,:],X.iloc[test,:]
    ytrain,ytest=Y.iloc[train],Y.iloc[test]
    DT.fit(xtrain,ytrain)
    y_predict = DT.predict(xtest)
    precision.append(metrics.precision_score(ytest,y_predict))
    recall.append(metrics.recall_score(ytest,y_predict))
    fpr,tpr,_=metrics.roc_curve(ytest,y_predict)
    roc_auc.append(metrics.auc(fpr,tpr))
print(' Precision: %0.02f  Recall: %0.02f  ROC-AOC: %0.02F'%(np.mean(precision),np.mean(recall),np.mean(roc_auc)))

 Precision: 0.56  Recall: 0.80  ROC-AOC: 0.84


## Random Forest

In [112]:
RT = RandomForestClassifier(random_state=0)

In [113]:
parameter ={'n_estimators':np.arange(1,10),'max_depth':np.arange(1,10)}
GS=GridSearchCV(RT,parameter,cv=3)
GS.fit(X,Y)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': array([1, 2, 3, 4, 5, 6, 7, 8, 9]), 'max_depth': array([1, 2, 3, 4, 5, 6, 7, 8, 9])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [114]:
GS.best_params_

{'max_depth': 9, 'n_estimators': 9}

In [115]:
RT = RandomForestClassifier(n_estimators=9,max_depth=9)

###  K Fold Cross Validation

In [116]:
kf = KFold(n_splits=5,shuffle=True,random_state=0)

In [117]:
precision=[]
recall  =[]
roc_auc = []
for train,test in kf.split(X,Y):
    xtrain,xtest=X.iloc[train,:],X.iloc[test,:]
    ytrain,ytest=Y.iloc[train],Y.iloc[test]
    RT.fit(xtrain,ytrain)
    y_predict = RT.predict(xtest)
    precision.append(metrics.precision_score(ytest,y_predict))
    recall.append(metrics.recall_score(ytest,y_predict))
    fpr,tpr,_=metrics.roc_curve(ytest,y_predict)
    roc_auc.append(metrics.auc(fpr,tpr))
print(' Precision: %0.02f  Recall: %0.02f  ROC-AOC: %0.02F'%(np.mean(precision),np.mean(recall),np.mean(roc_auc)))

 Precision: 0.73  Recall: 0.54  ROC-AOC: 0.75
