# Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

# Loading dataset

In [2]:
data = pd.read_csv('online_shoppers_intention.csv')

In [3]:
data.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


In [4]:
data1 = data.copy()

# Encoding the Categorical Columns to Numerical

In [5]:
from sklearn.preprocessing import LabelEncoder

In [6]:
LB = LabelEncoder()

In [7]:
data1['Month'] = LB.fit_transform(data['Month'])
data1['VisitorType'] = LB.fit_transform(data['VisitorType'])
data1['Weekend'] = LB.fit_transform(data['Weekend'])
data1['Revenue'] = LB.fit_transform(data['Revenue'])

In [8]:
data1.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,2,1,1,1,1,2,0,0
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,2,2,2,1,2,2,0,0
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,2,4,1,9,3,2,0,0
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,2,3,2,2,4,2,0,0
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,2,3,3,1,4,2,1,0


In [9]:
from sklearn.preprocessing import StandardScaler,MinMaxScaler,MaxAbsScaler,RobustScaler,QuantileTransformer,PowerTransformer,Normalizer

In [10]:
sc = StandardScaler()
sc_data = sc.fit_transform(data1)
sc_data = pd.DataFrame(sc_data,columns=data1.columns)

In [11]:
mm = MinMaxScaler()
mm_data = mm.fit_transform(data1)
mm_data = pd.DataFrame(mm_data,columns=data1.columns)

In [12]:
ma = MaxAbsScaler()
ma_data = ma.fit_transform(data1)
ma_data = pd.DataFrame(ma_data,columns=data1.columns)

In [13]:
rs = RobustScaler()
rs_data = rs.fit_transform(data1)
rs_data = pd.DataFrame(rs_data,columns=data1.columns)

In [14]:
qt = QuantileTransformer(output_distribution='normal')
qt_data = qt.fit_transform(data1)
qt_data = pd.DataFrame(qt_data,columns=data1.columns)

In [15]:
qt_u = QuantileTransformer(output_distribution='uniform')
qtu_data = qt_u.fit_transform(data1)
qtu_data = pd.DataFrame(qtu_data,columns=data1.columns)

In [16]:
pt = PowerTransformer(method='yeo-johnson')
pt_data = pt.fit_transform(data1)
pt_data = pd.DataFrame(pt_data,columns=data1.columns)

In [17]:
nm = Normalizer()
nm_data = nm.fit_transform(data1)
nm_data =pd.DataFrame(nm_data,columns=data1.columns)

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV,KFold
import sklearn.metrics as metrics

# Standard Scalar

## Logistic Regression

In [19]:
X = sc_data.drop('Revenue',axis=1)
Y = data1.Revenue

In [20]:
LR = LogisticRegression()

###  K Fold Cross Validation

In [21]:
kf = KFold(n_splits=5,shuffle=True,random_state=0)

In [22]:
precision=[]
recall  =[]
roc_auc = []
accuracy=[]
for train,test in kf.split(X,Y):
    xtrain,xtest=X.iloc[train,:],X.iloc[test,:]
    ytrain,ytest=Y.iloc[train],Y.iloc[test]
    LR.fit(xtrain,ytrain)
    y_predict = LR.predict(xtest)
    precision.append(metrics.precision_score(ytest,y_predict))
    recall.append(metrics.recall_score(ytest,y_predict))
    fpr,tpr,_=metrics.roc_curve(ytest,y_predict)
    roc_auc.append(metrics.auc(fpr,tpr))
    accuracy.append(metrics.accuracy_score(ytest,y_predict))
print(' Accuracy: %0.02f  Precision: %0.02f  Recall: %0.02f  ROC-AOC: %0.02F'
      %(np.mean(accuracy),np.mean(precision),np.mean(recall),np.mean(roc_auc)))

 Accuracy: 0.88  Precision: 0.75  Recall: 0.38  ROC-AOC: 0.68


In [23]:
# Storing results in a dataframe
resultsDf = pd.DataFrame({'Scaling':['Standard'],'Model':['Logistic Regression'], 
                          'Accuracy': [np.mean(accuracy)], 'Precision': [np.mean(precision)],
                          'Recall':[np.mean(recall)],'ROC-AUC':[np.mean(roc_auc)]})
resultsDf = resultsDf[['Scaling','Model', 'Accuracy', 'Precision','Recall','ROC-AUC']]
resultsDf

Unnamed: 0,Scaling,Model,Accuracy,Precision,Recall,ROC-AUC
0,Standard,Logistic Regression,0.884509,0.753793,0.376252,0.676892


## Decision Tree

In [24]:
DT = DecisionTreeClassifier()

In [25]:
parameter ={'max_depth':np.arange(1,10)}
GS=GridSearchCV(DT,parameter,cv=3)
GS.fit(X,Y)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_depth': array([1, 2, 3, 4, 5, 6, 7, 8, 9])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [26]:
GS.best_params_

{'max_depth': 2}

In [27]:
DT = DecisionTreeClassifier(max_depth=2)

###  K Fold Cross Validation

In [28]:
kf = KFold(n_splits=5,shuffle=True,random_state=0)

In [29]:
precision=[]
recall  =[]
roc_auc = []
accuracy=[]
for train,test in kf.split(X,Y):
    xtrain,xtest=X.iloc[train,:],X.iloc[test,:]
    ytrain,ytest=Y.iloc[train],Y.iloc[test]
    DT.fit(xtrain,ytrain)
    y_predict = DT.predict(xtest)
    precision.append(metrics.precision_score(ytest,y_predict))
    recall.append(metrics.recall_score(ytest,y_predict))
    fpr,tpr,_=metrics.roc_curve(ytest,y_predict)
    roc_auc.append(metrics.auc(fpr,tpr))
    accuracy.append(metrics.accuracy_score(ytest,y_predict))
print(' Accuracy: %0.02f  Precision: %0.02f  Recall: %0.02f  ROC-AOC: %0.02F'
      %(np.mean(accuracy),np.mean(precision),np.mean(recall),np.mean(roc_auc)))

 Accuracy: 0.89  Precision: 0.73  Recall: 0.46  ROC-AOC: 0.71


In [30]:
# Storing results in a dataframe
tempResultsDf = pd.DataFrame({'Scaling':['Standard'],'Model':['Decision Tree'], 
                          'Accuracy': [np.mean(accuracy)], 'Precision': [np.mean(precision)],
                          'Recall':[np.mean(recall)],'ROC-AUC':[np.mean(roc_auc)]})
resultsDf = pd.concat([resultsDf, tempResultsDf])
resultsDf = resultsDf[['Scaling','Model', 'Accuracy', 'Precision','Recall','ROC-AUC']]
resultsDf

Unnamed: 0,Scaling,Model,Accuracy,Precision,Recall,ROC-AUC
0,Standard,Logistic Regression,0.884509,0.753793,0.376252,0.676892
0,Standard,Decision Tree,0.890187,0.731083,0.460727,0.714861


## Random Forest

In [31]:
RT = RandomForestClassifier(random_state=0)

In [32]:
parameter ={'n_estimators':np.arange(1,10),'max_depth':np.arange(1,10)}
GS=GridSearchCV(RT,parameter,cv=3)
GS.fit(X,Y)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': array([1, 2, 3, 4, 5, 6, 7, 8, 9]), 'max_depth': array([1, 2, 3, 4, 5, 6, 7, 8, 9])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [33]:
GS.best_params_

{'max_depth': 5, 'n_estimators': 3}

In [34]:
RT = RandomForestClassifier(n_estimators=3,max_depth=5,random_state=0)

###  K Fold Cross Validation

In [35]:
kf = KFold(n_splits=5,shuffle=True,random_state=0)

In [36]:
precision=[]
recall  =[]
roc_auc = []
accuracy=[]
for train,test in kf.split(X,Y):
    xtrain,xtest=X.iloc[train,:],X.iloc[test,:]
    ytrain,ytest=Y.iloc[train],Y.iloc[test]
    RT.fit(xtrain,ytrain)
    y_predict = RT.predict(xtest)
    precision.append(metrics.precision_score(ytest,y_predict))
    recall.append(metrics.recall_score(ytest,y_predict))
    fpr,tpr,_=metrics.roc_curve(ytest,y_predict)
    roc_auc.append(metrics.auc(fpr,tpr))
    accuracy.append(metrics.accuracy_score(ytest,y_predict))
print(' Accuracy: %0.02f  Precision: %0.02f  Recall: %0.02f  ROC-AOC: %0.02F'
      %(np.mean(accuracy),np.mean(precision),np.mean(recall),np.mean(roc_auc)))

 Accuracy: 0.89  Precision: 0.72  Recall: 0.54  ROC-AOC: 0.75


In [37]:
# Storing results in a dataframe
tempResultsDf = pd.DataFrame({'Scaling':['Standard'],'Model':['Random Forest'], 
                          'Accuracy': [np.mean(accuracy)], 'Precision': [np.mean(precision)],
                          'Recall':[np.mean(recall)],'ROC-AUC':[np.mean(roc_auc)]})
resultsDf = pd.concat([resultsDf, tempResultsDf])
resultsDf = resultsDf[['Scaling','Model', 'Accuracy', 'Precision','Recall','ROC-AUC']]
resultsDf

Unnamed: 0,Scaling,Model,Accuracy,Precision,Recall,ROC-AUC
0,Standard,Logistic Regression,0.884509,0.753793,0.376252,0.676892
0,Standard,Decision Tree,0.890187,0.731083,0.460727,0.714861
0,Standard,Random Forest,0.894809,0.719285,0.5411,0.750618


# MinMax Scalar

## Logistic Regression

In [38]:
X = mm_data.drop('Revenue',axis=1)
Y = data1.Revenue

In [39]:
LR = LogisticRegression()

###  K Fold Cross Validation

In [40]:
kf = KFold(n_splits=5,shuffle=True,random_state=0)

In [41]:
precision=[]
recall  =[]
roc_auc = []
accuracy=[]
for train,test in kf.split(X,Y):
    xtrain,xtest=X.iloc[train,:],X.iloc[test,:]
    ytrain,ytest=Y.iloc[train],Y.iloc[test]
    LR.fit(xtrain,ytrain)
    y_predict = LR.predict(xtest)
    precision.append(metrics.precision_score(ytest,y_predict))
    recall.append(metrics.recall_score(ytest,y_predict))
    fpr,tpr,_=metrics.roc_curve(ytest,y_predict)
    roc_auc.append(metrics.auc(fpr,tpr))
    accuracy.append(metrics.accuracy_score(ytest,y_predict))
print(' Accuracy: %0.02f  Precision: %0.02f  Recall: %0.02f  ROC-AOC: %0.02F'
      %(np.mean(accuracy),np.mean(precision),np.mean(recall),np.mean(roc_auc)))

 Accuracy: 0.88  Precision: 0.77  Recall: 0.28  ROC-AOC: 0.63


In [42]:
# Storing results in a dataframe
tempResultsDf = pd.DataFrame({'Scaling':['MinMax'],'Model':['Logistic Regression'], 
                          'Accuracy': [np.mean(accuracy)], 'Precision': [np.mean(precision)],
                          'Recall':[np.mean(recall)],'ROC-AUC':[np.mean(roc_auc)]})
resultsDf = pd.concat([resultsDf, tempResultsDf])
resultsDf = resultsDf[['Scaling','Model', 'Accuracy', 'Precision','Recall','ROC-AUC']]
resultsDf

Unnamed: 0,Scaling,Model,Accuracy,Precision,Recall,ROC-AUC
0,Standard,Logistic Regression,0.884509,0.753793,0.376252,0.676892
0,Standard,Decision Tree,0.890187,0.731083,0.460727,0.714861
0,Standard,Random Forest,0.894809,0.719285,0.5411,0.750618
0,MinMax,Logistic Regression,0.87502,0.768471,0.275069,0.629997


## Decision Tree

In [43]:
DT = DecisionTreeClassifier()

In [44]:
parameter ={'max_depth':np.arange(1,10)}
GS=GridSearchCV(DT,parameter,cv=3)
GS.fit(X,Y)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_depth': array([1, 2, 3, 4, 5, 6, 7, 8, 9])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [45]:
GS.best_params_

{'max_depth': 2}

In [46]:
DT = DecisionTreeClassifier(max_depth=2)

###  K Fold Cross Validation

In [47]:
kf = KFold(n_splits=5,shuffle=True,random_state=0)

In [48]:
precision=[]
recall  =[]
roc_auc = []
accuracy=[]
for train,test in kf.split(X,Y):
    xtrain,xtest=X.iloc[train,:],X.iloc[test,:]
    ytrain,ytest=Y.iloc[train],Y.iloc[test]
    DT.fit(xtrain,ytrain)
    y_predict = DT.predict(xtest)
    precision.append(metrics.precision_score(ytest,y_predict))
    recall.append(metrics.recall_score(ytest,y_predict))
    fpr,tpr,_=metrics.roc_curve(ytest,y_predict)
    roc_auc.append(metrics.auc(fpr,tpr))
    accuracy.append(metrics.accuracy_score(ytest,y_predict))
print(' Accuracy: %0.02f  Precision: %0.02f  Recall: %0.02f  ROC-AOC: %0.02F'
      %(np.mean(accuracy),np.mean(precision),np.mean(recall),np.mean(roc_auc)))

 Accuracy: 0.89  Precision: 0.73  Recall: 0.46  ROC-AOC: 0.71


In [49]:
# Storing results in a dataframe
tempResultsDf = pd.DataFrame({'Scaling':['MinMax'],'Model':['Decision Tree'], 
                          'Accuracy': [np.mean(accuracy)], 'Precision': [np.mean(precision)],
                          'Recall':[np.mean(recall)],'ROC-AUC':[np.mean(roc_auc)]})
resultsDf = pd.concat([resultsDf, tempResultsDf])
resultsDf = resultsDf[['Scaling','Model', 'Accuracy', 'Precision','Recall','ROC-AUC']]
resultsDf

Unnamed: 0,Scaling,Model,Accuracy,Precision,Recall,ROC-AUC
0,Standard,Logistic Regression,0.884509,0.753793,0.376252,0.676892
0,Standard,Decision Tree,0.890187,0.731083,0.460727,0.714861
0,Standard,Random Forest,0.894809,0.719285,0.5411,0.750618
0,MinMax,Logistic Regression,0.87502,0.768471,0.275069,0.629997
0,MinMax,Decision Tree,0.890187,0.731083,0.460727,0.714861


## Random Forest

In [50]:
RT = RandomForestClassifier(random_state=0)

In [51]:
parameter ={'n_estimators':np.arange(1,10),'max_depth':np.arange(1,10)}
GS=GridSearchCV(RT,parameter,cv=3)
GS.fit(X,Y)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': array([1, 2, 3, 4, 5, 6, 7, 8, 9]), 'max_depth': array([1, 2, 3, 4, 5, 6, 7, 8, 9])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [52]:
GS.best_params_

{'max_depth': 5, 'n_estimators': 3}

In [53]:
RT = RandomForestClassifier(n_estimators=3,max_depth=5,random_state=0)

###  K Fold Cross Validation

In [54]:
kf = KFold(n_splits=5,shuffle=True,random_state=0)

In [55]:
precision=[]
recall  =[]
roc_auc = []
accuracy=[]
for train,test in kf.split(X,Y):
    xtrain,xtest=X.iloc[train,:],X.iloc[test,:]
    ytrain,ytest=Y.iloc[train],Y.iloc[test]
    RT.fit(xtrain,ytrain)
    y_predict = RT.predict(xtest)
    precision.append(metrics.precision_score(ytest,y_predict))
    recall.append(metrics.recall_score(ytest,y_predict))
    fpr,tpr,_=metrics.roc_curve(ytest,y_predict)
    roc_auc.append(metrics.auc(fpr,tpr))
    accuracy.append(metrics.accuracy_score(ytest,y_predict))
print(' Accuracy: %0.02f  Precision: %0.02f  Recall: %0.02f  ROC-AOC: %0.02F'
      %(np.mean(accuracy),np.mean(precision),np.mean(recall),np.mean(roc_auc)))

 Accuracy: 0.89  Precision: 0.72  Recall: 0.54  ROC-AOC: 0.75


In [56]:
# Storing results in a dataframe
tempResultsDf = pd.DataFrame({'Scaling':['MinMax'],'Model':['Random Forest'], 
                          'Accuracy': [np.mean(accuracy)], 'Precision': [np.mean(precision)],
                          'Recall':[np.mean(recall)],'ROC-AUC':[np.mean(roc_auc)]})
resultsDf = pd.concat([resultsDf, tempResultsDf])
resultsDf = resultsDf[['Scaling','Model', 'Accuracy', 'Precision','Recall','ROC-AUC']]
resultsDf

Unnamed: 0,Scaling,Model,Accuracy,Precision,Recall,ROC-AUC
0,Standard,Logistic Regression,0.884509,0.753793,0.376252,0.676892
0,Standard,Decision Tree,0.890187,0.731083,0.460727,0.714861
0,Standard,Random Forest,0.894809,0.719285,0.5411,0.750618
0,MinMax,Logistic Regression,0.87502,0.768471,0.275069,0.629997
0,MinMax,Decision Tree,0.890187,0.731083,0.460727,0.714861
0,MinMax,Random Forest,0.894891,0.719843,0.5411,0.750666


   # MAXABS Scalar

## Logistic Regression

In [57]:
X = ma_data.drop('Revenue',axis=1)
Y = data1.Revenue

In [58]:
LR = LogisticRegression()

###  K Fold Cross Validation

In [59]:
kf = KFold(n_splits=5,shuffle=True,random_state=0)

In [60]:
precision=[]
recall  =[]
roc_auc = []
accuracy=[]
for train,test in kf.split(X,Y):
    xtrain,xtest=X.iloc[train,:],X.iloc[test,:]
    ytrain,ytest=Y.iloc[train],Y.iloc[test]
    LR.fit(xtrain,ytrain)
    y_predict = LR.predict(xtest)
    precision.append(metrics.precision_score(ytest,y_predict))
    recall.append(metrics.recall_score(ytest,y_predict))
    fpr,tpr,_=metrics.roc_curve(ytest,y_predict)
    roc_auc.append(metrics.auc(fpr,tpr))
    accuracy.append(metrics.accuracy_score(ytest,y_predict))
print(' Accuracy: %0.02f  Precision: %0.02f  Recall: %0.02f  ROC-AOC: %0.02F'
      %(np.mean(accuracy),np.mean(precision),np.mean(recall),np.mean(roc_auc)))

 Accuracy: 0.88  Precision: 0.77  Recall: 0.28  ROC-AOC: 0.63


In [61]:
# Storing results in a dataframe
tempResultsDf = pd.DataFrame({'Scaling':['MaxAbs'],'Model':['Logistic Regression'], 
                          'Accuracy': [np.mean(accuracy)], 'Precision': [np.mean(precision)],
                          'Recall':[np.mean(recall)],'ROC-AUC':[np.mean(roc_auc)]})
resultsDf = pd.concat([resultsDf, tempResultsDf])
resultsDf = resultsDf[['Scaling','Model', 'Accuracy', 'Precision','Recall','ROC-AUC']]
resultsDf

Unnamed: 0,Scaling,Model,Accuracy,Precision,Recall,ROC-AUC
0,Standard,Logistic Regression,0.884509,0.753793,0.376252,0.676892
0,Standard,Decision Tree,0.890187,0.731083,0.460727,0.714861
0,Standard,Random Forest,0.894809,0.719285,0.5411,0.750618
0,MinMax,Logistic Regression,0.87502,0.768471,0.275069,0.629997
0,MinMax,Decision Tree,0.890187,0.731083,0.460727,0.714861
0,MinMax,Random Forest,0.894891,0.719843,0.5411,0.750666
0,MaxAbs,Logistic Regression,0.87502,0.768471,0.275069,0.629997


## Decision Tree

In [62]:
DT = DecisionTreeClassifier()

In [63]:
parameter ={'max_depth':np.arange(1,10)}
GS=GridSearchCV(DT,parameter,cv=3)
GS.fit(X,Y)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_depth': array([1, 2, 3, 4, 5, 6, 7, 8, 9])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [64]:
GS.best_params_

{'max_depth': 2}

In [65]:
DT = DecisionTreeClassifier(max_depth=2)

###  K Fold Cross Validation

In [66]:
kf = KFold(n_splits=5,shuffle=True,random_state=0)

In [67]:
precision=[]
recall  =[]
roc_auc = []
accuracy=[]
for train,test in kf.split(X,Y):
    xtrain,xtest=X.iloc[train,:],X.iloc[test,:]
    ytrain,ytest=Y.iloc[train],Y.iloc[test]
    DT.fit(xtrain,ytrain)
    y_predict = DT.predict(xtest)
    precision.append(metrics.precision_score(ytest,y_predict))
    recall.append(metrics.recall_score(ytest,y_predict))
    fpr,tpr,_=metrics.roc_curve(ytest,y_predict)
    roc_auc.append(metrics.auc(fpr,tpr))
    accuracy.append(metrics.accuracy_score(ytest,y_predict))
print(' Accuracy: %0.02f  Precision: %0.02f  Recall: %0.02f  ROC-AOC: %0.02F'
      %(np.mean(accuracy),np.mean(precision),np.mean(recall),np.mean(roc_auc)))

 Accuracy: 0.89  Precision: 0.73  Recall: 0.46  ROC-AOC: 0.71


In [68]:
# Storing results in a dataframe
tempResultsDf = pd.DataFrame({'Scaling':['MaxABS'],'Model':['Decision Tree'], 
                          'Accuracy': [np.mean(accuracy)], 'Precision': [np.mean(precision)],
                          'Recall':[np.mean(recall)],'ROC-AUC':[np.mean(roc_auc)]})
resultsDf = pd.concat([resultsDf, tempResultsDf])
resultsDf = resultsDf[['Scaling','Model', 'Accuracy', 'Precision','Recall','ROC-AUC']]
resultsDf

Unnamed: 0,Scaling,Model,Accuracy,Precision,Recall,ROC-AUC
0,Standard,Logistic Regression,0.884509,0.753793,0.376252,0.676892
0,Standard,Decision Tree,0.890187,0.731083,0.460727,0.714861
0,Standard,Random Forest,0.894809,0.719285,0.5411,0.750618
0,MinMax,Logistic Regression,0.87502,0.768471,0.275069,0.629997
0,MinMax,Decision Tree,0.890187,0.731083,0.460727,0.714861
0,MinMax,Random Forest,0.894891,0.719843,0.5411,0.750666
0,MaxAbs,Logistic Regression,0.87502,0.768471,0.275069,0.629997
0,MaxABS,Decision Tree,0.890187,0.731083,0.460727,0.714861


## Random Forest

In [69]:
RT = RandomForestClassifier(random_state=0)

In [70]:
parameter ={'n_estimators':np.arange(1,10),'max_depth':np.arange(1,10)}
GS=GridSearchCV(RT,parameter,cv=3)
GS.fit(X,Y)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': array([1, 2, 3, 4, 5, 6, 7, 8, 9]), 'max_depth': array([1, 2, 3, 4, 5, 6, 7, 8, 9])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [71]:
GS.best_params_

{'max_depth': 5, 'n_estimators': 3}

In [72]:
RT = RandomForestClassifier(n_estimators=3,max_depth=5,random_state=0)

###  K Fold Cross Validation

In [73]:
kf = KFold(n_splits=5,shuffle=True,random_state=0)

In [74]:
precision=[]
recall  =[]
roc_auc = []
accuracy=[]
for train,test in kf.split(X,Y):
    xtrain,xtest=X.iloc[train,:],X.iloc[test,:]
    ytrain,ytest=Y.iloc[train],Y.iloc[test]
    RT.fit(xtrain,ytrain)
    y_predict = RT.predict(xtest)
    precision.append(metrics.precision_score(ytest,y_predict))
    recall.append(metrics.recall_score(ytest,y_predict))
    fpr,tpr,_=metrics.roc_curve(ytest,y_predict)
    roc_auc.append(metrics.auc(fpr,tpr))
    accuracy.append(metrics.accuracy_score(ytest,y_predict))
print(' Accuracy: %0.02f  Precision: %0.02f  Recall: %0.02f  ROC-AOC: %0.02F'
      %(np.mean(accuracy),np.mean(precision),np.mean(recall),np.mean(roc_auc)))

 Accuracy: 0.89  Precision: 0.72  Recall: 0.54  ROC-AOC: 0.75


In [75]:
# Storing results in a dataframe
tempResultsDf = pd.DataFrame({'Scaling':['MaxABS'],'Model':['Random Forest'], 
                          'Accuracy': [np.mean(accuracy)], 'Precision': [np.mean(precision)],
                          'Recall':[np.mean(recall)],'ROC-AUC':[np.mean(roc_auc)]})
resultsDf = pd.concat([resultsDf, tempResultsDf])
resultsDf = resultsDf[['Scaling','Model', 'Accuracy', 'Precision','Recall','ROC-AUC']]
resultsDf

Unnamed: 0,Scaling,Model,Accuracy,Precision,Recall,ROC-AUC
0,Standard,Logistic Regression,0.884509,0.753793,0.376252,0.676892
0,Standard,Decision Tree,0.890187,0.731083,0.460727,0.714861
0,Standard,Random Forest,0.894809,0.719285,0.5411,0.750618
0,MinMax,Logistic Regression,0.87502,0.768471,0.275069,0.629997
0,MinMax,Decision Tree,0.890187,0.731083,0.460727,0.714861
0,MinMax,Random Forest,0.894891,0.719843,0.5411,0.750666
0,MaxAbs,Logistic Regression,0.87502,0.768471,0.275069,0.629997
0,MaxABS,Decision Tree,0.890187,0.731083,0.460727,0.714861
0,MaxABS,Random Forest,0.894891,0.719843,0.5411,0.750666


# Robust Scalar

## Logistic Regression

In [76]:
X = rs_data.drop('Revenue',axis=1)
Y = data1.Revenue

In [77]:
LR = LogisticRegression()

###  K Fold Cross Validation

In [78]:
kf = KFold(n_splits=5,shuffle=True,random_state=0)

In [79]:
precision=[]
recall  =[]
roc_auc = []
accuracy=[]
for train,test in kf.split(X,Y):
    xtrain,xtest=X.iloc[train,:],X.iloc[test,:]
    ytrain,ytest=Y.iloc[train],Y.iloc[test]
    LR.fit(xtrain,ytrain)
    y_predict = LR.predict(xtest)
    precision.append(metrics.precision_score(ytest,y_predict))
    recall.append(metrics.recall_score(ytest,y_predict))
    fpr,tpr,_=metrics.roc_curve(ytest,y_predict)
    roc_auc.append(metrics.auc(fpr,tpr))
    accuracy.append(metrics.accuracy_score(ytest,y_predict))
print(' Accuracy: %0.02f  Precision: %0.02f  Recall: %0.02f  ROC-AOC: %0.02F'
      %(np.mean(accuracy),np.mean(precision),np.mean(recall),np.mean(roc_auc)))

 Accuracy: 0.88  Precision: 0.75  Recall: 0.38  ROC-AOC: 0.68


In [80]:
# Storing results in a dataframe
tempResultsDf = pd.DataFrame({'Scaling':['Robust'],'Model':['Logistic Regression'], 
                          'Accuracy': [np.mean(accuracy)], 'Precision': [np.mean(precision)],
                          'Recall':[np.mean(recall)],'ROC-AUC':[np.mean(roc_auc)]})
resultsDf = pd.concat([resultsDf, tempResultsDf])
resultsDf = resultsDf[['Scaling','Model', 'Accuracy', 'Precision','Recall','ROC-AUC']]
resultsDf

Unnamed: 0,Scaling,Model,Accuracy,Precision,Recall,ROC-AUC
0,Standard,Logistic Regression,0.884509,0.753793,0.376252,0.676892
0,Standard,Decision Tree,0.890187,0.731083,0.460727,0.714861
0,Standard,Random Forest,0.894809,0.719285,0.5411,0.750618
0,MinMax,Logistic Regression,0.87502,0.768471,0.275069,0.629997
0,MinMax,Decision Tree,0.890187,0.731083,0.460727,0.714861
0,MinMax,Random Forest,0.894891,0.719843,0.5411,0.750666
0,MaxAbs,Logistic Regression,0.87502,0.768471,0.275069,0.629997
0,MaxABS,Decision Tree,0.890187,0.731083,0.460727,0.714861
0,MaxABS,Random Forest,0.894891,0.719843,0.5411,0.750666
0,Robust,Logistic Regression,0.884672,0.754878,0.37677,0.677199


## Decision Tree

In [81]:
DT = DecisionTreeClassifier()

In [82]:
parameter ={'max_depth':np.arange(1,10)}
GS=GridSearchCV(DT,parameter,cv=3)
GS.fit(X,Y)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_depth': array([1, 2, 3, 4, 5, 6, 7, 8, 9])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [83]:
GS.best_params_

{'max_depth': 2}

In [84]:
DT = DecisionTreeClassifier(max_depth=2)

###  K Fold Cross Validation

In [85]:
kf = KFold(n_splits=5,shuffle=True,random_state=0)

In [86]:
precision=[]
recall  =[]
roc_auc = []
accuracy=[]
for train,test in kf.split(X,Y):
    xtrain,xtest=X.iloc[train,:],X.iloc[test,:]
    ytrain,ytest=Y.iloc[train],Y.iloc[test]
    DT.fit(xtrain,ytrain)
    y_predict = DT.predict(xtest)
    precision.append(metrics.precision_score(ytest,y_predict))
    recall.append(metrics.recall_score(ytest,y_predict))
    fpr,tpr,_=metrics.roc_curve(ytest,y_predict)
    roc_auc.append(metrics.auc(fpr,tpr))
    accuracy.append(metrics.accuracy_score(ytest,y_predict))
print(' Accuracy: %0.02f  Precision: %0.02f  Recall: %0.02f  ROC-AOC: %0.02F'
      %(np.mean(accuracy),np.mean(precision),np.mean(recall),np.mean(roc_auc)))

 Accuracy: 0.89  Precision: 0.73  Recall: 0.46  ROC-AOC: 0.71


In [87]:
# Storing results in a dataframe
tempResultsDf = pd.DataFrame({'Scaling':['Robust'],'Model':['Decision Tree'], 
                          'Accuracy': [np.mean(accuracy)], 'Precision': [np.mean(precision)],
                          'Recall':[np.mean(recall)],'ROC-AUC':[np.mean(roc_auc)]})
resultsDf = pd.concat([resultsDf, tempResultsDf])
resultsDf = resultsDf[['Scaling','Model', 'Accuracy', 'Precision','Recall','ROC-AUC']]
resultsDf

Unnamed: 0,Scaling,Model,Accuracy,Precision,Recall,ROC-AUC
0,Standard,Logistic Regression,0.884509,0.753793,0.376252,0.676892
0,Standard,Decision Tree,0.890187,0.731083,0.460727,0.714861
0,Standard,Random Forest,0.894809,0.719285,0.5411,0.750618
0,MinMax,Logistic Regression,0.87502,0.768471,0.275069,0.629997
0,MinMax,Decision Tree,0.890187,0.731083,0.460727,0.714861
0,MinMax,Random Forest,0.894891,0.719843,0.5411,0.750666
0,MaxAbs,Logistic Regression,0.87502,0.768471,0.275069,0.629997
0,MaxABS,Decision Tree,0.890187,0.731083,0.460727,0.714861
0,MaxABS,Random Forest,0.894891,0.719843,0.5411,0.750666
0,Robust,Logistic Regression,0.884672,0.754878,0.37677,0.677199


## Random Forest

In [88]:
RT = RandomForestClassifier(random_state=0)

In [89]:
parameter ={'n_estimators':np.arange(1,10),'max_depth':np.arange(1,10)}
GS=GridSearchCV(RT,parameter,cv=3)
GS.fit(X,Y)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': array([1, 2, 3, 4, 5, 6, 7, 8, 9]), 'max_depth': array([1, 2, 3, 4, 5, 6, 7, 8, 9])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [90]:
GS.best_params_

{'max_depth': 5, 'n_estimators': 6}

In [91]:
RT = RandomForestClassifier(n_estimators=6,max_depth=5,random_state=0)

###  K Fold Cross Validation

In [92]:
kf = KFold(n_splits=5,shuffle=True,random_state=0)

In [93]:
precision=[]
recall  =[]
roc_auc = []
accuracy=[]
for train,test in kf.split(X,Y):
    xtrain,xtest=X.iloc[train,:],X.iloc[test,:]
    ytrain,ytest=Y.iloc[train],Y.iloc[test]
    RT.fit(xtrain,ytrain)
    y_predict = RT.predict(xtest)
    precision.append(metrics.precision_score(ytest,y_predict))
    recall.append(metrics.recall_score(ytest,y_predict))
    fpr,tpr,_=metrics.roc_curve(ytest,y_predict)
    roc_auc.append(metrics.auc(fpr,tpr))
    accuracy.append(metrics.accuracy_score(ytest,y_predict))
print(' Accuracy: %0.02f  Precision: %0.02f  Recall: %0.02f  ROC-AOC: %0.02F'
      %(np.mean(accuracy),np.mean(precision),np.mean(recall),np.mean(roc_auc)))

 Accuracy: 0.89  Precision: 0.76  Recall: 0.45  ROC-AOC: 0.71


In [94]:
# Storing results in a dataframe
tempResultsDf = pd.DataFrame({'Scaling':['Robust'],'Model':['Random Forest'], 
                          'Accuracy': [np.mean(accuracy)], 'Precision': [np.mean(precision)],
                          'Recall':[np.mean(recall)],'ROC-AUC':[np.mean(roc_auc)]})
resultsDf = pd.concat([resultsDf, tempResultsDf])
resultsDf = resultsDf[['Scaling','Model', 'Accuracy', 'Precision','Recall','ROC-AUC']]
resultsDf

Unnamed: 0,Scaling,Model,Accuracy,Precision,Recall,ROC-AUC
0,Standard,Logistic Regression,0.884509,0.753793,0.376252,0.676892
0,Standard,Decision Tree,0.890187,0.731083,0.460727,0.714861
0,Standard,Random Forest,0.894809,0.719285,0.5411,0.750618
0,MinMax,Logistic Regression,0.87502,0.768471,0.275069,0.629997
0,MinMax,Decision Tree,0.890187,0.731083,0.460727,0.714861
0,MinMax,Random Forest,0.894891,0.719843,0.5411,0.750666
0,MaxAbs,Logistic Regression,0.87502,0.768471,0.275069,0.629997
0,MaxABS,Decision Tree,0.890187,0.731083,0.460727,0.714861
0,MaxABS,Random Forest,0.894891,0.719843,0.5411,0.750666
0,Robust,Logistic Regression,0.884672,0.754878,0.37677,0.677199


# QuantileTransformer-Normal Scalar

## Logistic Regression

In [95]:
X = qt_data.drop('Revenue',axis=1)
Y = data1.Revenue

In [96]:
LR = LogisticRegression()

###  K Fold Cross Validation

In [97]:
kf = KFold(n_splits=5,shuffle=True,random_state=0)

In [98]:
precision=[]
recall  =[]
roc_auc = []
accuracy=[]
for train,test in kf.split(X,Y):
    xtrain,xtest=X.iloc[train,:],X.iloc[test,:]
    ytrain,ytest=Y.iloc[train],Y.iloc[test]
    LR.fit(xtrain,ytrain)
    y_predict = LR.predict(xtest)
    precision.append(metrics.precision_score(ytest,y_predict))
    recall.append(metrics.recall_score(ytest,y_predict))
    fpr,tpr,_=metrics.roc_curve(ytest,y_predict)
    roc_auc.append(metrics.auc(fpr,tpr))
    accuracy.append(metrics.accuracy_score(ytest,y_predict))
print(' Accuracy: %0.02f  Precision: %0.02f  Recall: %0.02f  ROC-AOC: %0.02F'
      %(np.mean(accuracy),np.mean(precision),np.mean(recall),np.mean(roc_auc)))

 Accuracy: 0.89  Precision: 0.67  Recall: 0.61  ROC-AOC: 0.78


In [99]:
# Storing results in a dataframe
tempResultsDf = pd.DataFrame({'Scaling':['Quantile-N'],'Model':['Logistic Regression'], 
                          'Accuracy': [np.mean(accuracy)], 'Precision': [np.mean(precision)],
                          'Recall':[np.mean(recall)],'ROC-AUC':[np.mean(roc_auc)]})
resultsDf = pd.concat([resultsDf, tempResultsDf])
resultsDf = resultsDf[['Scaling','Model', 'Accuracy', 'Precision','Recall','ROC-AUC']]
resultsDf

Unnamed: 0,Scaling,Model,Accuracy,Precision,Recall,ROC-AUC
0,Standard,Logistic Regression,0.884509,0.753793,0.376252,0.676892
0,Standard,Decision Tree,0.890187,0.731083,0.460727,0.714861
0,Standard,Random Forest,0.894809,0.719285,0.5411,0.750618
0,MinMax,Logistic Regression,0.87502,0.768471,0.275069,0.629997
0,MinMax,Decision Tree,0.890187,0.731083,0.460727,0.714861
0,MinMax,Random Forest,0.894891,0.719843,0.5411,0.750666
0,MaxAbs,Logistic Regression,0.87502,0.768471,0.275069,0.629997
0,MaxABS,Decision Tree,0.890187,0.731083,0.460727,0.714861
0,MaxABS,Random Forest,0.894891,0.719843,0.5411,0.750666
0,Robust,Logistic Regression,0.884672,0.754878,0.37677,0.677199


## Decision Tree

In [100]:
DT = DecisionTreeClassifier()

In [101]:
parameter ={'max_depth':np.arange(1,10)}
GS=GridSearchCV(DT,parameter,cv=3)
GS.fit(X,Y)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_depth': array([1, 2, 3, 4, 5, 6, 7, 8, 9])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [102]:
GS.best_params_

{'max_depth': 2}

In [103]:
DT = DecisionTreeClassifier(max_depth=2)

###  K Fold Cross Validation

In [104]:
kf = KFold(n_splits=5,shuffle=True,random_state=0)

In [105]:
precision=[]
recall  =[]
roc_auc = []
accuracy=[]
for train,test in kf.split(X,Y):
    xtrain,xtest=X.iloc[train,:],X.iloc[test,:]
    ytrain,ytest=Y.iloc[train],Y.iloc[test]
    DT.fit(xtrain,ytrain)
    y_predict = DT.predict(xtest)
    precision.append(metrics.precision_score(ytest,y_predict))
    recall.append(metrics.recall_score(ytest,y_predict))
    fpr,tpr,_=metrics.roc_curve(ytest,y_predict)
    roc_auc.append(metrics.auc(fpr,tpr))
    accuracy.append(metrics.accuracy_score(ytest,y_predict))
print(' Accuracy: %0.02f  Precision: %0.02f  Recall: %0.02f  ROC-AOC: %0.02F'
      %(np.mean(accuracy),np.mean(precision),np.mean(recall),np.mean(roc_auc)))

 Accuracy: 0.89  Precision: 0.73  Recall: 0.46  ROC-AOC: 0.71


In [106]:
# Storing results in a dataframe
tempResultsDf = pd.DataFrame({'Scaling':['Quantile-N'],'Model':['Decision Tree'], 
                          'Accuracy': [np.mean(accuracy)], 'Precision': [np.mean(precision)],
                          'Recall':[np.mean(recall)],'ROC-AUC':[np.mean(roc_auc)]})
resultsDf = pd.concat([resultsDf, tempResultsDf])
resultsDf = resultsDf[['Scaling','Model', 'Accuracy', 'Precision','Recall','ROC-AUC']]
resultsDf

Unnamed: 0,Scaling,Model,Accuracy,Precision,Recall,ROC-AUC
0,Standard,Logistic Regression,0.884509,0.753793,0.376252,0.676892
0,Standard,Decision Tree,0.890187,0.731083,0.460727,0.714861
0,Standard,Random Forest,0.894809,0.719285,0.5411,0.750618
0,MinMax,Logistic Regression,0.87502,0.768471,0.275069,0.629997
0,MinMax,Decision Tree,0.890187,0.731083,0.460727,0.714861
0,MinMax,Random Forest,0.894891,0.719843,0.5411,0.750666
0,MaxAbs,Logistic Regression,0.87502,0.768471,0.275069,0.629997
0,MaxABS,Decision Tree,0.890187,0.731083,0.460727,0.714861
0,MaxABS,Random Forest,0.894891,0.719843,0.5411,0.750666
0,Robust,Logistic Regression,0.884672,0.754878,0.37677,0.677199


## Random Forest

In [107]:
RT = RandomForestClassifier(random_state=0)

In [108]:
parameter ={'n_estimators':np.arange(1,10),'max_depth':np.arange(1,10)}
GS=GridSearchCV(RT,parameter,cv=3)
GS.fit(X,Y)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': array([1, 2, 3, 4, 5, 6, 7, 8, 9]), 'max_depth': array([1, 2, 3, 4, 5, 6, 7, 8, 9])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [109]:
GS.best_params_

{'max_depth': 5, 'n_estimators': 8}

In [110]:
RT = RandomForestClassifier(n_estimators=8,max_depth=5)

###  K Fold Cross Validation

In [111]:
kf = KFold(n_splits=5,shuffle=True,random_state=0)

In [112]:
precision=[]
recall  =[]
roc_auc = []
accuracy=[]
for train,test in kf.split(X,Y):
    xtrain,xtest=X.iloc[train,:],X.iloc[test,:]
    ytrain,ytest=Y.iloc[train],Y.iloc[test]
    RT.fit(xtrain,ytrain)
    y_predict = RT.predict(xtest)
    precision.append(metrics.precision_score(ytest,y_predict))
    recall.append(metrics.recall_score(ytest,y_predict))
    fpr,tpr,_=metrics.roc_curve(ytest,y_predict)
    roc_auc.append(metrics.auc(fpr,tpr))
    accuracy.append(metrics.accuracy_score(ytest,y_predict))
print(' Accuracy: %0.02f  Precision: %0.02f  Recall: %0.02f  ROC-AOC: %0.02F'
      %(np.mean(accuracy),np.mean(precision),np.mean(recall),np.mean(roc_auc)))

 Accuracy: 0.90  Precision: 0.78  Recall: 0.48  ROC-AOC: 0.73


In [113]:
# Storing results in a dataframe
tempResultsDf = pd.DataFrame({'Scaling':['Quantile-N'],'Model':['Random Forest'], 
                          'Accuracy': [np.mean(accuracy)], 'Precision': [np.mean(precision)],
                          'Recall':[np.mean(recall)],'ROC-AUC':[np.mean(roc_auc)]})
resultsDf = pd.concat([resultsDf, tempResultsDf])
resultsDf = resultsDf[['Scaling','Model', 'Accuracy', 'Precision','Recall','ROC-AUC']]
resultsDf

Unnamed: 0,Scaling,Model,Accuracy,Precision,Recall,ROC-AUC
0,Standard,Logistic Regression,0.884509,0.753793,0.376252,0.676892
0,Standard,Decision Tree,0.890187,0.731083,0.460727,0.714861
0,Standard,Random Forest,0.894809,0.719285,0.5411,0.750618
0,MinMax,Logistic Regression,0.87502,0.768471,0.275069,0.629997
0,MinMax,Decision Tree,0.890187,0.731083,0.460727,0.714861
0,MinMax,Random Forest,0.894891,0.719843,0.5411,0.750666
0,MaxAbs,Logistic Regression,0.87502,0.768471,0.275069,0.629997
0,MaxABS,Decision Tree,0.890187,0.731083,0.460727,0.714861
0,MaxABS,Random Forest,0.894891,0.719843,0.5411,0.750666
0,Robust,Logistic Regression,0.884672,0.754878,0.37677,0.677199


# QuantileTransformer-Uniform Scalar

## Logistic Regression

In [114]:
X = qtu_data.drop('Revenue',axis=1)
Y = data1.Revenue

In [115]:
LR = LogisticRegression()

###  K Fold Cross Validation

In [116]:
kf = KFold(n_splits=5,shuffle=True,random_state=0)

In [117]:
precision=[]
recall  =[]
roc_auc = []
accuracy=[]
for train,test in kf.split(X,Y):
    xtrain,xtest=X.iloc[train,:],X.iloc[test,:]
    ytrain,ytest=Y.iloc[train],Y.iloc[test]
    LR.fit(xtrain,ytrain)
    y_predict = LR.predict(xtest)
    precision.append(metrics.precision_score(ytest,y_predict))
    recall.append(metrics.recall_score(ytest,y_predict))
    fpr,tpr,_=metrics.roc_curve(ytest,y_predict)
    roc_auc.append(metrics.auc(fpr,tpr))
    accuracy.append(metrics.accuracy_score(ytest,y_predict))
print(' Accuracy: %0.02f  Precision: %0.02f  Recall: %0.02f  ROC-AOC: %0.02F'
      %(np.mean(accuracy),np.mean(precision),np.mean(recall),np.mean(roc_auc)))

 Accuracy: 0.89  Precision: 0.68  Recall: 0.61  ROC-AOC: 0.78


In [118]:
# Storing results in a dataframe
tempResultsDf = pd.DataFrame({'Scaling':['Quantile-U'],'Model':['Logistic Regression'], 
                          'Accuracy': [np.mean(accuracy)], 'Precision': [np.mean(precision)],
                          'Recall':[np.mean(recall)],'ROC-AUC':[np.mean(roc_auc)]})
resultsDf = pd.concat([resultsDf, tempResultsDf])
resultsDf = resultsDf[['Scaling','Model', 'Accuracy', 'Precision','Recall','ROC-AUC']]
resultsDf

Unnamed: 0,Scaling,Model,Accuracy,Precision,Recall,ROC-AUC
0,Standard,Logistic Regression,0.884509,0.753793,0.376252,0.676892
0,Standard,Decision Tree,0.890187,0.731083,0.460727,0.714861
0,Standard,Random Forest,0.894809,0.719285,0.5411,0.750618
0,MinMax,Logistic Regression,0.87502,0.768471,0.275069,0.629997
0,MinMax,Decision Tree,0.890187,0.731083,0.460727,0.714861
0,MinMax,Random Forest,0.894891,0.719843,0.5411,0.750666
0,MaxAbs,Logistic Regression,0.87502,0.768471,0.275069,0.629997
0,MaxABS,Decision Tree,0.890187,0.731083,0.460727,0.714861
0,MaxABS,Random Forest,0.894891,0.719843,0.5411,0.750666
0,Robust,Logistic Regression,0.884672,0.754878,0.37677,0.677199


## Decision Tree

In [119]:
DT = DecisionTreeClassifier()

In [120]:
parameter ={'max_depth':np.arange(1,10)}
GS=GridSearchCV(DT,parameter,cv=3)
GS.fit(X,Y)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_depth': array([1, 2, 3, 4, 5, 6, 7, 8, 9])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [121]:
GS.best_params_

{'max_depth': 2}

In [122]:
DT = DecisionTreeClassifier(max_depth=2)

###  K Fold Cross Validation

In [123]:
kf = KFold(n_splits=5,shuffle=True,random_state=0)

In [124]:
precision=[]
recall  =[]
roc_auc = []
accuracy=[]
for train,test in kf.split(X,Y):
    xtrain,xtest=X.iloc[train,:],X.iloc[test,:]
    ytrain,ytest=Y.iloc[train],Y.iloc[test]
    DT.fit(xtrain,ytrain)
    y_predict = DT.predict(xtest)
    precision.append(metrics.precision_score(ytest,y_predict))
    recall.append(metrics.recall_score(ytest,y_predict))
    fpr,tpr,_=metrics.roc_curve(ytest,y_predict)
    roc_auc.append(metrics.auc(fpr,tpr))
    accuracy.append(metrics.accuracy_score(ytest,y_predict))
print(' Accuracy: %0.02f  Precision: %0.02f  Recall: %0.02f  ROC-AOC: %0.02F'
      %(np.mean(accuracy),np.mean(precision),np.mean(recall),np.mean(roc_auc)))

 Accuracy: 0.89  Precision: 0.73  Recall: 0.46  ROC-AOC: 0.71


In [125]:
# Storing results in a dataframe
tempResultsDf = pd.DataFrame({'Scaling':['Quantile-U'],'Model':['Decision Tree'], 
                          'Accuracy': [np.mean(accuracy)], 'Precision': [np.mean(precision)],
                          'Recall':[np.mean(recall)],'ROC-AUC':[np.mean(roc_auc)]})
resultsDf = pd.concat([resultsDf, tempResultsDf])
resultsDf = resultsDf[['Scaling','Model', 'Accuracy', 'Precision','Recall','ROC-AUC']]
resultsDf

Unnamed: 0,Scaling,Model,Accuracy,Precision,Recall,ROC-AUC
0,Standard,Logistic Regression,0.884509,0.753793,0.376252,0.676892
0,Standard,Decision Tree,0.890187,0.731083,0.460727,0.714861
0,Standard,Random Forest,0.894809,0.719285,0.5411,0.750618
0,MinMax,Logistic Regression,0.87502,0.768471,0.275069,0.629997
0,MinMax,Decision Tree,0.890187,0.731083,0.460727,0.714861
0,MinMax,Random Forest,0.894891,0.719843,0.5411,0.750666
0,MaxAbs,Logistic Regression,0.87502,0.768471,0.275069,0.629997
0,MaxABS,Decision Tree,0.890187,0.731083,0.460727,0.714861
0,MaxABS,Random Forest,0.894891,0.719843,0.5411,0.750666
0,Robust,Logistic Regression,0.884672,0.754878,0.37677,0.677199


## Random Forest

In [126]:
RT = RandomForestClassifier(random_state=0)

In [127]:
parameter ={'n_estimators':np.arange(1,10),'max_depth':np.arange(1,10)}
GS=GridSearchCV(RT,parameter,cv=3)
GS.fit(X,Y)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': array([1, 2, 3, 4, 5, 6, 7, 8, 9]), 'max_depth': array([1, 2, 3, 4, 5, 6, 7, 8, 9])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [128]:
GS.best_params_

{'max_depth': 5, 'n_estimators': 8}

In [129]:
RT = RandomForestClassifier(n_estimators=8,max_depth=5)

###  K Fold Cross Validation

In [130]:
kf = KFold(n_splits=5,shuffle=True,random_state=0)

In [131]:
precision=[]
recall  =[]
roc_auc = []
accuracy=[]
for train,test in kf.split(X,Y):
    xtrain,xtest=X.iloc[train,:],X.iloc[test,:]
    ytrain,ytest=Y.iloc[train],Y.iloc[test]
    RT.fit(xtrain,ytrain)
    y_predict = RT.predict(xtest)
    precision.append(metrics.precision_score(ytest,y_predict))
    recall.append(metrics.recall_score(ytest,y_predict))
    fpr,tpr,_=metrics.roc_curve(ytest,y_predict)
    roc_auc.append(metrics.auc(fpr,tpr))
    accuracy.append(metrics.accuracy_score(ytest,y_predict))
print(' Accuracy: %0.02f  Precision: %0.02f  Recall: %0.02f  ROC-AOC: %0.02F'
      %(np.mean(accuracy),np.mean(precision),np.mean(recall),np.mean(roc_auc)))

 Accuracy: 0.90  Precision: 0.77  Recall: 0.47  ROC-AOC: 0.72


In [132]:
# Storing results in a dataframe
tempResultsDf = pd.DataFrame({'Scaling':['Quantile-U'],'Model':['Random Forest'], 
                          'Accuracy': [np.mean(accuracy)], 'Precision': [np.mean(precision)],
                          'Recall':[np.mean(recall)],'ROC-AUC':[np.mean(roc_auc)]})
resultsDf = pd.concat([resultsDf, tempResultsDf])
resultsDf = resultsDf[['Scaling','Model', 'Accuracy', 'Precision','Recall','ROC-AUC']]
resultsDf

Unnamed: 0,Scaling,Model,Accuracy,Precision,Recall,ROC-AUC
0,Standard,Logistic Regression,0.884509,0.753793,0.376252,0.676892
0,Standard,Decision Tree,0.890187,0.731083,0.460727,0.714861
0,Standard,Random Forest,0.894809,0.719285,0.5411,0.750618
0,MinMax,Logistic Regression,0.87502,0.768471,0.275069,0.629997
0,MinMax,Decision Tree,0.890187,0.731083,0.460727,0.714861
0,MinMax,Random Forest,0.894891,0.719843,0.5411,0.750666
0,MaxAbs,Logistic Regression,0.87502,0.768471,0.275069,0.629997
0,MaxABS,Decision Tree,0.890187,0.731083,0.460727,0.714861
0,MaxABS,Random Forest,0.894891,0.719843,0.5411,0.750666
0,Robust,Logistic Regression,0.884672,0.754878,0.37677,0.677199


# PowerTransformer Scalar

## Logistic Regression

In [133]:
X = pt_data.drop('Revenue',axis=1)
Y = data1.Revenue

In [134]:
LR = LogisticRegression()

###  K Fold Cross Validation

In [135]:
kf = KFold(n_splits=5,shuffle=True,random_state=0)

In [136]:
precision=[]
recall  =[]
roc_auc = []
accuracy=[]
for train,test in kf.split(X,Y):
    xtrain,xtest=X.iloc[train,:],X.iloc[test,:]
    ytrain,ytest=Y.iloc[train],Y.iloc[test]
    LR.fit(xtrain,ytrain)
    y_predict = LR.predict(xtest)
    precision.append(metrics.precision_score(ytest,y_predict))
    recall.append(metrics.recall_score(ytest,y_predict))
    fpr,tpr,_=metrics.roc_curve(ytest,y_predict)
    roc_auc.append(metrics.auc(fpr,tpr))
    accuracy.append(metrics.accuracy_score(ytest,y_predict))
print(' Accuracy: %0.02f  Precision: %0.02f  Recall: %0.02f  ROC-AOC: %0.02F'
      %(np.mean(accuracy),np.mean(precision),np.mean(recall),np.mean(roc_auc)))

 Accuracy: 0.89  Precision: 0.66  Recall: 0.63  ROC-AOC: 0.78


In [137]:
# Storing results in a dataframe
tempResultsDf = pd.DataFrame({'Scaling':['Power Transform'],'Model':['Logistic Regression'], 
                          'Accuracy': [np.mean(accuracy)], 'Precision': [np.mean(precision)],
                          'Recall':[np.mean(recall)],'ROC-AUC':[np.mean(roc_auc)]})
resultsDf = pd.concat([resultsDf, tempResultsDf])
resultsDf = resultsDf[['Scaling','Model', 'Accuracy', 'Precision','Recall','ROC-AUC']]
resultsDf

Unnamed: 0,Scaling,Model,Accuracy,Precision,Recall,ROC-AUC
0,Standard,Logistic Regression,0.884509,0.753793,0.376252,0.676892
0,Standard,Decision Tree,0.890187,0.731083,0.460727,0.714861
0,Standard,Random Forest,0.894809,0.719285,0.5411,0.750618
0,MinMax,Logistic Regression,0.87502,0.768471,0.275069,0.629997
0,MinMax,Decision Tree,0.890187,0.731083,0.460727,0.714861
0,MinMax,Random Forest,0.894891,0.719843,0.5411,0.750666
0,MaxAbs,Logistic Regression,0.87502,0.768471,0.275069,0.629997
0,MaxABS,Decision Tree,0.890187,0.731083,0.460727,0.714861
0,MaxABS,Random Forest,0.894891,0.719843,0.5411,0.750666
0,Robust,Logistic Regression,0.884672,0.754878,0.37677,0.677199


## Decision Tree

In [138]:
DT = DecisionTreeClassifier()

In [139]:
parameter ={'max_depth':np.arange(1,10)}
GS=GridSearchCV(DT,parameter,cv=3)
GS.fit(X,Y)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_depth': array([1, 2, 3, 4, 5, 6, 7, 8, 9])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [140]:
GS.best_params_

{'max_depth': 2}

In [141]:
DT = DecisionTreeClassifier(max_depth=2)

###  K Fold Cross Validation

In [142]:
kf = KFold(n_splits=5,shuffle=True,random_state=0)

In [143]:
precision=[]
recall  =[]
roc_auc = []
accuracy=[]
for train,test in kf.split(X,Y):
    xtrain,xtest=X.iloc[train,:],X.iloc[test,:]
    ytrain,ytest=Y.iloc[train],Y.iloc[test]
    DT.fit(xtrain,ytrain)
    y_predict = DT.predict(xtest)
    precision.append(metrics.precision_score(ytest,y_predict))
    recall.append(metrics.recall_score(ytest,y_predict))
    fpr,tpr,_=metrics.roc_curve(ytest,y_predict)
    roc_auc.append(metrics.auc(fpr,tpr))
    accuracy.append(metrics.accuracy_score(ytest,y_predict))
print(' Accuracy: %0.02f  Precision: %0.02f  Recall: %0.02f  ROC-AOC: %0.02F'
      %(np.mean(accuracy),np.mean(precision),np.mean(recall),np.mean(roc_auc)))

 Accuracy: 0.89  Precision: 0.73  Recall: 0.46  ROC-AOC: 0.71


In [144]:
# Storing results in a dataframe
tempResultsDf = pd.DataFrame({'Scaling':['Power Transform'],'Model':['Decision Tree'], 
                          'Accuracy': [np.mean(accuracy)], 'Precision': [np.mean(precision)],
                          'Recall':[np.mean(recall)],'ROC-AUC':[np.mean(roc_auc)]})
resultsDf = pd.concat([resultsDf, tempResultsDf])
resultsDf = resultsDf[['Scaling','Model', 'Accuracy', 'Precision','Recall','ROC-AUC']]
resultsDf

Unnamed: 0,Scaling,Model,Accuracy,Precision,Recall,ROC-AUC
0,Standard,Logistic Regression,0.884509,0.753793,0.376252,0.676892
0,Standard,Decision Tree,0.890187,0.731083,0.460727,0.714861
0,Standard,Random Forest,0.894809,0.719285,0.5411,0.750618
0,MinMax,Logistic Regression,0.87502,0.768471,0.275069,0.629997
0,MinMax,Decision Tree,0.890187,0.731083,0.460727,0.714861
0,MinMax,Random Forest,0.894891,0.719843,0.5411,0.750666
0,MaxAbs,Logistic Regression,0.87502,0.768471,0.275069,0.629997
0,MaxABS,Decision Tree,0.890187,0.731083,0.460727,0.714861
0,MaxABS,Random Forest,0.894891,0.719843,0.5411,0.750666
0,Robust,Logistic Regression,0.884672,0.754878,0.37677,0.677199


## Random Forest

In [145]:
RT = RandomForestClassifier(random_state=0)

In [146]:
parameter ={'n_estimators':np.arange(1,10),'max_depth':np.arange(1,10)}
GS=GridSearchCV(RT,parameter,cv=3)
GS.fit(X,Y)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': array([1, 2, 3, 4, 5, 6, 7, 8, 9]), 'max_depth': array([1, 2, 3, 4, 5, 6, 7, 8, 9])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [147]:
GS.best_params_

{'max_depth': 5, 'n_estimators': 8}

In [148]:
RT = RandomForestClassifier(n_estimators=8,max_depth=5)

###  K Fold Cross Validation

In [149]:
kf = KFold(n_splits=5,shuffle=True,random_state=0)

In [150]:
precision=[]
recall  =[]
roc_auc = []
accuracy=[]
for train,test in kf.split(X,Y):
    xtrain,xtest=X.iloc[train,:],X.iloc[test,:]
    ytrain,ytest=Y.iloc[train],Y.iloc[test]
    RT.fit(xtrain,ytrain)
    y_predict = RT.predict(xtest)
    precision.append(metrics.precision_score(ytest,y_predict))
    recall.append(metrics.recall_score(ytest,y_predict))
    fpr,tpr,_=metrics.roc_curve(ytest,y_predict)
    roc_auc.append(metrics.auc(fpr,tpr))
    accuracy.append(metrics.accuracy_score(ytest,y_predict))
print(' Accuracy: %0.02f  Precision: %0.02f  Recall: %0.02f  ROC-AOC: %0.02F'
      %(np.mean(accuracy),np.mean(precision),np.mean(recall),np.mean(roc_auc)))

 Accuracy: 0.89  Precision: 0.77  Recall: 0.44  ROC-AOC: 0.71


In [151]:
# Storing results in a dataframe
tempResultsDf = pd.DataFrame({'Scaling':['Power Transform'],'Model':['Random Forest'], 
                          'Accuracy': [np.mean(accuracy)], 'Precision': [np.mean(precision)],
                          'Recall':[np.mean(recall)],'ROC-AUC':[np.mean(roc_auc)]})
resultsDf = pd.concat([resultsDf, tempResultsDf])
resultsDf = resultsDf[['Scaling','Model', 'Accuracy', 'Precision','Recall','ROC-AUC']]
resultsDf

Unnamed: 0,Scaling,Model,Accuracy,Precision,Recall,ROC-AUC
0,Standard,Logistic Regression,0.884509,0.753793,0.376252,0.676892
0,Standard,Decision Tree,0.890187,0.731083,0.460727,0.714861
0,Standard,Random Forest,0.894809,0.719285,0.5411,0.750618
0,MinMax,Logistic Regression,0.87502,0.768471,0.275069,0.629997
0,MinMax,Decision Tree,0.890187,0.731083,0.460727,0.714861
0,MinMax,Random Forest,0.894891,0.719843,0.5411,0.750666
0,MaxAbs,Logistic Regression,0.87502,0.768471,0.275069,0.629997
0,MaxABS,Decision Tree,0.890187,0.731083,0.460727,0.714861
0,MaxABS,Random Forest,0.894891,0.719843,0.5411,0.750666
0,Robust,Logistic Regression,0.884672,0.754878,0.37677,0.677199


# Normalizer

## Logistic Regression

In [152]:
X = nm_data.drop('Revenue',axis=1)
Y = data1.Revenue

In [153]:
LR = LogisticRegression()

###  K Fold Cross Validation

In [154]:
kf = KFold(n_splits=5,shuffle=True,random_state=0)

In [155]:
precision=[]
recall  =[]
roc_auc = []
accuracy=[]
for train,test in kf.split(X,Y):
    xtrain,xtest=X.iloc[train,:],X.iloc[test,:]
    ytrain,ytest=Y.iloc[train],Y.iloc[test]
    LR.fit(xtrain,ytrain)
    y_predict = LR.predict(xtest)
    precision.append(metrics.precision_score(ytest,y_predict))
    recall.append(metrics.recall_score(ytest,y_predict))
    fpr,tpr,_=metrics.roc_curve(ytest,y_predict)
    roc_auc.append(metrics.auc(fpr,tpr))
    accuracy.append(metrics.accuracy_score(ytest,y_predict))
print(' Accuracy: %0.02f  Precision: %0.02f  Recall: %0.02f  ROC-AOC: %0.02F'
      %(np.mean(accuracy),np.mean(precision),np.mean(recall),np.mean(roc_auc)))

 Accuracy: 0.86  Precision: 0.90  Recall: 0.11  ROC-AOC: 0.55


In [156]:
# Storing results in a dataframe
tempResultsDf = pd.DataFrame({'Scaling':['Normalizer'],'Model':['Logistic Regression'], 
                          'Accuracy': [np.mean(accuracy)], 'Precision': [np.mean(precision)],
                          'Recall':[np.mean(recall)],'ROC-AUC':[np.mean(roc_auc)]})
resultsDf = pd.concat([resultsDf, tempResultsDf])
resultsDf = resultsDf[['Scaling','Model', 'Accuracy', 'Precision','Recall','ROC-AUC']]
resultsDf

Unnamed: 0,Scaling,Model,Accuracy,Precision,Recall,ROC-AUC
0,Standard,Logistic Regression,0.884509,0.753793,0.376252,0.676892
0,Standard,Decision Tree,0.890187,0.731083,0.460727,0.714861
0,Standard,Random Forest,0.894809,0.719285,0.5411,0.750618
0,MinMax,Logistic Regression,0.87502,0.768471,0.275069,0.629997
0,MinMax,Decision Tree,0.890187,0.731083,0.460727,0.714861
0,MinMax,Random Forest,0.894891,0.719843,0.5411,0.750666
0,MaxAbs,Logistic Regression,0.87502,0.768471,0.275069,0.629997
0,MaxABS,Decision Tree,0.890187,0.731083,0.460727,0.714861
0,MaxABS,Random Forest,0.894891,0.719843,0.5411,0.750666
0,Robust,Logistic Regression,0.884672,0.754878,0.37677,0.677199


## Decision Tree

In [157]:
DT = DecisionTreeClassifier()

In [158]:
parameter ={'max_depth':np.arange(1,10)}
GS=GridSearchCV(DT,parameter,cv=3)
GS.fit(X,Y)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_depth': array([1, 2, 3, 4, 5, 6, 7, 8, 9])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [159]:
GS.best_params_

{'max_depth': 1}

In [160]:
DT = DecisionTreeClassifier(max_depth=1)

###  K Fold Cross Validation

In [161]:
kf = KFold(n_splits=5,shuffle=True,random_state=0)

In [162]:
precision=[]
recall  =[]
accuracy=[]
roc_auc = []
for train,test in kf.split(X,Y):
    xtrain,xtest=X.iloc[train,:],X.iloc[test,:]
    ytrain,ytest=Y.iloc[train],Y.iloc[test]
    DT.fit(xtrain,ytrain)
    y_predict = DT.predict(xtest)
    precision.append(metrics.precision_score(ytest,y_predict))
    recall.append(metrics.recall_score(ytest,y_predict))
    fpr,tpr,_=metrics.roc_curve(ytest,y_predict)
    roc_auc.append(metrics.auc(fpr,tpr))
    accuracy.append(metrics.accuracy_score(ytest,y_predict))
print(' Accuracy: %0.02f Precision: %0.02f  Recall: %0.02f  ROC-AOC: %0.02F'
      %(np.mean(accuracy),np.mean(precision),np.mean(recall),np.mean(roc_auc)))

 Accuracy: 0.87 Precision: 0.56  Recall: 0.80  ROC-AOC: 0.84


In [163]:
# Storing results in a dataframe
tempResultsDf = pd.DataFrame({'Scaling':['Normalizer'],'Model':['Decision Tree'], 
                          'Accuracy': [np.mean(accuracy)], 'Precision': [np.mean(precision)],
                          'Recall':[np.mean(recall)],'ROC-AUC':[np.mean(roc_auc)]})
resultsDf = pd.concat([resultsDf, tempResultsDf])
resultsDf = resultsDf[['Scaling','Model', 'Accuracy', 'Precision','Recall','ROC-AUC']]
resultsDf

Unnamed: 0,Scaling,Model,Accuracy,Precision,Recall,ROC-AUC
0,Standard,Logistic Regression,0.884509,0.753793,0.376252,0.676892
0,Standard,Decision Tree,0.890187,0.731083,0.460727,0.714861
0,Standard,Random Forest,0.894809,0.719285,0.5411,0.750618
0,MinMax,Logistic Regression,0.87502,0.768471,0.275069,0.629997
0,MinMax,Decision Tree,0.890187,0.731083,0.460727,0.714861
0,MinMax,Random Forest,0.894891,0.719843,0.5411,0.750666
0,MaxAbs,Logistic Regression,0.87502,0.768471,0.275069,0.629997
0,MaxABS,Decision Tree,0.890187,0.731083,0.460727,0.714861
0,MaxABS,Random Forest,0.894891,0.719843,0.5411,0.750666
0,Robust,Logistic Regression,0.884672,0.754878,0.37677,0.677199


## Random Forest

In [164]:
RT = RandomForestClassifier(random_state=0)

In [165]:
parameter ={'n_estimators':np.arange(1,10),'max_depth':np.arange(1,10)}
GS=GridSearchCV(RT,parameter,cv=3)
GS.fit(X,Y)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': array([1, 2, 3, 4, 5, 6, 7, 8, 9]), 'max_depth': array([1, 2, 3, 4, 5, 6, 7, 8, 9])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [166]:
GS.best_params_

{'max_depth': 9, 'n_estimators': 9}

In [167]:
RT = RandomForestClassifier(n_estimators=9,max_depth=9)

###  K Fold Cross Validation

In [168]:
kf = KFold(n_splits=5,shuffle=True,random_state=0)

In [169]:
precision=[]
recall  =[]
roc_auc = []
accuracy=[]
for train,test in kf.split(X,Y):
    xtrain,xtest=X.iloc[train,:],X.iloc[test,:]
    ytrain,ytest=Y.iloc[train],Y.iloc[test]
    RT.fit(xtrain,ytrain)
    y_predict = RT.predict(xtest)
    precision.append(metrics.precision_score(ytest,y_predict))
    recall.append(metrics.recall_score(ytest,y_predict))
    fpr,tpr,_=metrics.roc_curve(ytest,y_predict)
    roc_auc.append(metrics.auc(fpr,tpr))
    accuracy.append(metrics.accuracy_score(ytest,y_predict))
print(' Accuracy: %0.02f  Precision: %0.02f  Recall: %0.02f  ROC-AOC: %0.02F'
      %(np.mean(accuracy),np.mean(precision),np.mean(recall),np.mean(roc_auc)))

 Accuracy: 0.90  Precision: 0.73  Recall: 0.53  ROC-AOC: 0.75


In [170]:
# Storing results in a dataframe
tempResultsDf = pd.DataFrame({'Scaling':['Normalizer'],'Model':['Random Forest'], 
                          'Accuracy': [np.mean(accuracy)], 'Precision': [np.mean(precision)],
                          'Recall':[np.mean(recall)],'ROC-AUC':[np.mean(roc_auc)]})
resultsDf = pd.concat([resultsDf, tempResultsDf])
resultsDf = resultsDf[['Scaling','Model', 'Accuracy', 'Precision','Recall','ROC-AUC']]
resultsDf.index = [i for i in range(len(resultsDf))]
resultsDf

Unnamed: 0,Scaling,Model,Accuracy,Precision,Recall,ROC-AUC
0,Standard,Logistic Regression,0.884509,0.753793,0.376252,0.676892
1,Standard,Decision Tree,0.890187,0.731083,0.460727,0.714861
2,Standard,Random Forest,0.894809,0.719285,0.5411,0.750618
3,MinMax,Logistic Regression,0.87502,0.768471,0.275069,0.629997
4,MinMax,Decision Tree,0.890187,0.731083,0.460727,0.714861
5,MinMax,Random Forest,0.894891,0.719843,0.5411,0.750666
6,MaxAbs,Logistic Regression,0.87502,0.768471,0.275069,0.629997
7,MaxABS,Decision Tree,0.890187,0.731083,0.460727,0.714861
8,MaxABS,Random Forest,0.894891,0.719843,0.5411,0.750666
9,Robust,Logistic Regression,0.884672,0.754878,0.37677,0.677199


In [171]:
resultsDf.max()

Scaling           Standard
Model        Random Forest
Accuracy          0.897405
Precision         0.897099
Recall            0.801836
ROC-AUC           0.844407
dtype: object

# Inference:
    From all the scaling methods Standard scaling has provided us the best scores
    From all the model scores of Random Forest is par 80 for all the metrics.
    It provided us a good Auc as well only .16 percent has chances of misclassification.