In [38]:
# importing libraries

import numpy as np
import pandas as pd
from datetime import datetime as dt
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
from scipy.stats import chi2
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel
from functools import reduce
from statsmodels.stats.outliers_influence import variance_inflation_factor
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn import metrics
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

%matplotlib inline

In [39]:
# reading datafile
data = pd.read_excel("Win_Prediction_Data.xlsx")

In [41]:
# looking into the dataset
data.head()

Unnamed: 0,Client Category,Solution Type,Deal Date,Sector,Location,VP Name,Manager Name,Deal Cost,Deal Status Code
0,Telecom,Solution 7,2012-03-27,Sector 24,L5,Ekta Zutshi,Gopa Trilochana,150000.0,Won
1,Telecom,Solution 7,2012-09-25,Sector 24,L5,Ekta Zutshi,Gopa Trilochana,744705.88,Won
2,Internal,Solution 59,2011-08-01,Sector 20,Others,Ekta Zutshi,Russell Dahlen,60000.0,Lost
3,Internal,Solution 59,2011-04-28,Sector 20,Others,Ekta Zutshi,Russell Dahlen,60000.0,Lost
4,Internal,Solution 32,2011-06-03,Sector 20,Others,Ekta Zutshi,Russell Dahlen,80882.35,Lost


In [4]:
# dataset info about features and rows
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10061 entries, 0 to 10060
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Client Category   9982 non-null   object        
 1   Solution Type     10061 non-null  object        
 2   Deal Date         10061 non-null  datetime64[ns]
 3   Sector            10061 non-null  object        
 4   Location          10061 non-null  object        
 5   VP Name           10061 non-null  object        
 6   Manager Name      10061 non-null  object        
 7   Deal Cost         10061 non-null  float64       
 8   Deal Status Code  10061 non-null  object        
dtypes: datetime64[ns](1), float64(1), object(7)
memory usage: 707.5+ KB


In [5]:
# describtion of numeric features
data.describe()

Unnamed: 0,Deal Cost
count,10061.0
mean,766896.9
std,1619894.0
min,0.0
25%,189705.9
50%,382352.9
75%,764705.9
max,36764710.0


In [40]:
# converting string columns into samecase, lowercase string
def same_case(*col):
    for i in col:
        data[i]=data[i].str.lower()
same_case('Client Category','Solution Type','Sector','Location','VP Name','Manager Name')    

In [7]:
# displaying the duplicate records in dataset
display("no. of duplicate records ",data.duplicated().sum(), "duplicate records ",data.loc[data.duplicated()])

'no. of duplicate records '

14

'duplicate records '

Unnamed: 0,Client Category,Solution Type,Deal Date,Sector,Location,VP Name,Manager Name,Deal Cost,Deal Status Code
3036,domestic public bank,solution 32,2015-03-20,sector 2,l7,rosanna maynez,earline langton,735294.12,Lost
3385,tech,solution 32,2015-07-31,sector 2,l10,long bergstrom,emil knudsen,235294.12,Lost
3880,telecom,solution 12,2013-08-30,sector 24,others,mangesh k. singh,sohil kumar,588235.29,Lost
5037,tech,solution 13,2012-05-08,sector 23,l5,sargar deep rao,karan dubey,0.0,Lost
7637,govt,solution 34,2018-02-21,sector 4,l10,mervin harwood,akshay sharma,602941.18,Lost
7705,internal,solution 9,2015-01-06,sector 20,others,sargar deep rao,brenton launius,2120294.12,Lost
7718,services_based,solution 26,2018-02-11,sector 17,l10,molly eakes,vansu dev,264705.88,Lost
8020,internal,solution 9,2015-07-02,sector 2,l9,sargar deep rao,rahul kocher,264705.88,Lost
8167,infrastructure,solution 9,2015-05-18,sector 24,l10,mervin harwood,nahar singh,352941.18,Lost
8177,services_based,solution 9,2015-10-03,sector 23,l1,ankita aggarwal,bhagwati prasad,147058.82,Lost


In [41]:
# removing duplicate records
def dup_remove(df):
    df.drop_duplicates(inplace = True)
    assert df.duplicated().sum() == 0
    
dup_remove(data)

In [42]:
# checking the missing values in all features
def check_missing(df):
    print(df.isna().sum())
check_missing(data)

Client Category     79
Solution Type        0
Deal Date            0
Sector               0
Location             0
VP Name              0
Manager Name         0
Deal Cost            0
Deal Status Code     0
dtype: int64


In [10]:
## mostly occured category in client feature
data["Client Category"].mode()[0]

'others'

In [43]:
# replacing the missing client category by its mode category as number of missing values is 0.007 percent of the total count
# and no other trend or pattern is observed.
def impute_missing_by_mode(df,col):
    df[col].fillna(df[col].mode()[0],inplace=True)
    
    
impute_missing_by_mode(data,"Client Category")

In [12]:
# counting the number of categories in each feature
for i in data.columns[:]:
    print(i,":",len(data[i].unique()),'labels')

Client Category : 41 labels
Solution Type : 67 labels
Deal Date : 2555 labels
Sector : 25 labels
Location : 13 labels
VP Name : 43 labels
Manager Name : 278 labels
Deal Cost : 1469 labels
Deal Status Code : 2 labels


In [13]:
# displaying all categories of features, featurewise.
def value_count(df):
    for i in df.columns:
        print(i,end="\n\n")
        print(data[i].value_counts())
        
value_count(data)

Client Category

others                   1841
internal                 1451
services_based           1200
tech                      911
domestic public bank      418
international bank        375
consulting                352
finance                   339
telecom                   326
power ind                 264
domestic private bank     261
insurance                 247
consumer good             185
automobiles               178
infrastructure            151
domestic bank             134
retail_market             126
govt                      120
hospitality               119
manufacturing             117
pharma                    110
healthcare                 99
electronics                81
media_journal              71
industries                 66
research development       63
energy                     57
knowledge                  50
management                 43
govt bank special          41
payment                    40
energy                     37
e-commerce             

In [7]:
## assosiation between catagorical features and catagorical target
def chisquare_test(data,target):
    from scipy.stats import chi2
    for i in data[data.columns.difference([target])]:
        crosstab = pd.crosstab(data[i], data[target])
        val = stats.chi2_contingency(crosstab)
        Observed_values = crosstab.values
        Expected_values = val[3]
        no_of_rows=len(crosstab.iloc[:,0])
        no_of_columns=len(crosstab.iloc[0,:])
        ddof=(no_of_rows-1)*(no_of_columns-1)
        print("Degree of Freedom:-",ddof)
        alpha = 0.05
        chi_square=sum([(o-e)**2./e for o,e in zip(Observed_values,Expected_values)])
        chi_square_statistic=chi_square[0]+chi_square[1]
        print("chi-square statistic:-",chi_square_statistic)

        critical_value=chi2.ppf(q=1-alpha,df=ddof)
        print('critical_value:',critical_value)

        if chi_square_statistic>=critical_value:
            print("Reject H0,There is a relationship between ",i, " and", target)
        else:
            print("Retain H0,There is no relationship between ",i, " and", target)
chisquare_test(data[data.columns.difference(['Deal Cost'])],"Deal Status Code")

Degree of Freedom:- 40
chi-square statistic:- 336.08990336417054
critical_value: 55.75847927888702
Reject H0,There is a relationship between  Client Category  and Deal Status Code
Degree of Freedom:- 2554
chi-square statistic:- 2745.2033065952082
critical_value: 2672.684138420052
Reject H0,There is a relationship between  Deal Date  and Deal Status Code
Degree of Freedom:- 12
chi-square statistic:- 135.13775740237162
critical_value: 21.02606981748307
Reject H0,There is a relationship between  Location  and Deal Status Code
Degree of Freedom:- 277
chi-square statistic:- 1203.8302583598988
critical_value: 316.8185115385132
Reject H0,There is a relationship between  Manager Name  and Deal Status Code
Degree of Freedom:- 24
chi-square statistic:- 167.99910626155884
critical_value: 36.41502850180731
Reject H0,There is a relationship between  Sector  and Deal Status Code
Degree of Freedom:- 66
chi-square statistic:- 649.7308294387966
critical_value: 85.96490744123096
Reject H0,There is a rel

In [44]:
# labeling the target : WIN :1 AND LOST 0
def encoding(df,*a):
    from sklearn.preprocessing import LabelEncoder
    encode = LabelEncoder()
    for i in a:
        df[i] = encode.fit_transform(df[i])
encoding(data,"Deal Status Code")

In [9]:
# checking the correlation between target and deal cost
display(data.corr())
## not much correlation between the features is observed.

Unnamed: 0,Deal Cost,Deal Status Code
Deal Cost,1.0,0.008474
Deal Status Code,0.008474,1.0


In [45]:
# encoding the manager and vp name features by mean encoding:
# aroung 257 categories in manager
# around 43 categories in vp name 
def mean_encoding(df,target,i):
    mean_nominal=df.groupby(i)[target].mean().to_dict()
    df[i]=df[i].map(mean_nominal)
    return df[i]
data["VP Name"] = mean_encoding(data,"Deal Status Code","VP Name")
data["Manager Name"] = mean_encoding(data,"Deal Status Code","Manager Name")

In [46]:
# one hot encoding in soltuion type,sector,location and client category
def dummy_encode(data,*a):
    df = pd.concat([pd.get_dummies(data[col],drop_first = True,prefix = col) for col in a],axis =1)
    return df
df = dummy_encode(data,"Solution Type","Sector","Location","Client Category")

In [47]:
# removing outliers in deal cost
def Outliers(df):
    for i in df.columns:
        if (df[i].dtype == 'float64'):
            Q1,Q3 = np.percentile(df[i],[25,75])
            IQR = Q3 - Q1
            lower = Q1- (1.5*IQR)
            upper = Q3 +(1.5*IQR)
            df[i] = df[i].apply(lambda x : lower if x < Q1 else upper if x > Q3 else x)

Outliers(data)

In [48]:
df[['Deal Cost','Deal Status Code','VP Name','Manager Name']]=data[['Deal Cost','Deal Status Code','VP Name','Manager Name']]

In [49]:
# feature engineering : generating the deal quaters from the new data having only unique dates
# and sum of win deals of unique dates and then order labelling them according to the win deals.
data['Month'] = data['Deal Date'].dt.month
bins = [0,3,6,9,12]
labels = ["quater 1","quater 2","quater 3","quater 4"]
data['quater'] = pd.cut(data['Month'], bins,labels=labels)
display((data.groupby(['quater'])['Deal Status Code'].sum()).sort_values(ascending = True))
a = {'quater 1':2,'quater 2':1,'quater 3':3,'quater 4':4}

data['Quater'] = data.quater.map(a)
df['Quater'] = data['Quater']

quater
quater 2     863
quater 1     908
quater 3     959
quater 4    1025
Name: Deal Status Code, dtype: int32

In [50]:
# scaling columns having count value greater than 1 , between 0 and 1.
def scaled_data(df,*a):
    to_scale = [col for col in a if df[col].max() > 1]
    mms = MinMaxScaler()
    scaled = mms.fit_transform(df[to_scale])
    scaled = pd.DataFrame(scaled, columns=to_scale)

    # Replace original columns with scaled ones
    for col in scaled:
        df[col] = scaled[col]
scaled_data(df,'Deal Cost','Quater')

In [16]:
# checking if there are any missing values after scaling.

check_missing(df)

Solution Type_solution 10     0
Solution Type_solution 11     0
Solution Type_solution 12     0
Solution Type_solution 13     0
Solution Type_solution 14     0
                             ..
Deal Cost                    14
Deal Status Code              0
VP Name                       0
Manager Name                  0
Quater                       14
Length: 147, dtype: int64


In [51]:
# removing the missing values as only 14 values are missing out of 10030 records
df.dropna(inplace = True)

In [53]:
# checking the skewness and kurtosis of numeric features, no high values observed.
df.agg(['skew', 'kurtosis']).transpose().loc[['Deal Cost']]

Unnamed: 0,skew,kurtosis
Deal Cost,0.117139,-1.058907


In [54]:
# splitting the features and target 
def feature_target_split(df,target):
    x = df[df.columns.difference([target])]
    y = df[target]
    return x,y
x,y = feature_target_split(df,"Deal Status Code")

In [20]:
# feature selection by extra tree classifier
# variable importance of features
def extratree_classifier(features, labels):
    model = ExtraTreesClassifier()
    model.fit(features, labels)
    Extra_Tree = pd.DataFrame(model.feature_importances_, columns = ["Extratrees"], index=features.columns)
    Extra_Tree = Extra_Tree.reset_index()
    display(Extra_Tree.sort_values(['Extratrees'],ascending=0))
    return Extra_Tree
Extra_Tree = extratree_classifier(x,y)

Unnamed: 0,index,Extratrees
40,Deal Cost,0.150501
53,Manager Name,0.130461
54,Quater,0.093287
145,VP Name,0.079456
103,Solution Type_solution 32,0.019104
...,...,...
129,Solution Type_solution 56,0.000044
124,Solution Type_solution 51,0.000043
126,Solution Type_solution 53,0.000026
91,Solution Type_solution 21,0.000021


In [21]:
#Recursive Feature Elimination
def rfe_selection(features, labels):
    model = LogisticRegression()
    rfe = RFE(model, 460)
    fit = rfe.fit(features, labels)
    recursive = pd.DataFrame(rfe.support_, columns = ["RFE"], index=features.columns)
    recursive = recursive.reset_index()
    display(recursive[recursive['RFE'] == True])
    return recursive
recursive = rfe_selection(x,y)

Unnamed: 0,index,RFE
0,Client Category_automobiles,True
1,Client Category_consulting,True
2,Client Category_consumer good,True
3,Client Category_domestic bank,True
4,Client Category_domestic private bank,True
...,...,...
141,Solution Type_solution 67,True
142,Solution Type_solution 7,True
143,Solution Type_solution 8,True
144,Solution Type_solution 9,True


In [22]:
# L1 feature Selection
def l_select(features, labels):
    lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(features, labels)
    model = SelectFromModel(lsvc,prefit=True)
    l1 = pd.DataFrame(model.get_support(), columns = ["L1"], index=features.columns)
    l1 = l1.reset_index()
    display(l1[l1['L1'] == True])
    return l1
l1 = l_select(x,y)

Unnamed: 0,index,L1
19,Client Category_internal,True
35,Client Category_services_based,True
38,Client Category_tech,True
40,Deal Cost,True
43,Location_l12,True
47,Location_l5,True
51,Location_l9,True
53,Manager Name,True
69,Sector_sector 23,True
81,Solution Type_solution 12,True


In [23]:
# combining all together
def combine_feature_select(fs):
    
    final_results = reduce(lambda left,right: pd.merge(left,right,on='index'), fs)
    return final_results


fs = [Extra_Tree, recursive, l1]
final_results  = combine_feature_select(fs)

In [24]:
# score table having final count of variable importance of the features
def score_table():
    columns = ['Extra_Tree', 'recursive', 'l1']
    score_table = pd.DataFrame({},[])
    score_table['index'] = final_results['index']
    score_table['Extratrees'] = final_results['Extratrees'].apply(lambda x: 0 if x <= 0.5 else 1)
    score_table['RFE'] = final_results['RFE'].astype(int)
    score_table['L1'] = final_results['L1'].astype(int)
    score_table['final_score'] = score_table.sum(axis=1)
    display(score_table.sort_values('final_score',ascending=0))
    return score_table
score_table = score_table()

Unnamed: 0,index,Extratrees,RFE,L1,final_score
145,VP Name,0,1,1,2
19,Client Category_internal,0,1,1,2
81,Solution Type_solution 12,0,1,1,2
69,Sector_sector 23,0,1,1,2
53,Manager Name,0,1,1,2
...,...,...,...,...,...
45,Location_l3,0,1,0,1
44,Location_l2,0,1,0,1
42,Location_l11,0,1,0,1
41,Location_l10,0,1,0,1


In [25]:
# printing important variables
print("Important variables likely to convert opportunity to win",x[list(score_table[score_table['final_score']>1]['index'])].columns)

Important variables likely to convert opportunity to win Index(['Client Category_internal', 'Client Category_services_based',
       'Client Category_tech', 'Deal Cost', 'Location_l12', 'Location_l5',
       'Location_l9', 'Manager Name', 'Sector_sector 23',
       'Solution Type_solution 12', 'Solution Type_solution 32',
       'Solution Type_solution 39', 'Solution Type_solution 67',
       'Solution Type_solution 8', 'Solution Type_solution 9', 'VP Name'],
      dtype='object')


In [26]:
# selecting features not having 0 score in variable importance score table
def selected_features(x,score_table):
    x = x[list(score_table[score_table['final_score'] >= 1]['index'])]
    return x
x = selected_features(x,score_table)
x.shape

(10033, 146)

In [27]:
## checking multicollinearity through vif

from statsmodels.stats.outliers_influence import variance_inflation_factor
def calculate_vif(x):
    vif = pd.DataFrame()
    vif["x"] = x.columns
    vif["VIF"] = [variance_inflation_factor(x.values, i) for i in range(x.shape[1])]    
    return(vif)


vif = calculate_vif(x)

while vif['VIF'][vif['VIF'] > 4].any():
    remove = vif.sort_values('VIF',ascending=0)['x'][:1]
    x.drop(remove,axis=1,inplace=True)
    vif = calculate_vif(x)
x.shape

(10033, 142)

In [55]:
# since there is imbalance between classes , treating using smote technique
def smote(data,target):
    print("before smote",data[target].value_counts(normalize = True)*100)
    sm = SMOTE(random_state=400)
    x_sm, y_sm = sm.fit_resample(x, y)
    print("after smote",y_sm.value_counts(normalize=True) * 100)
    return x_sm,y_sm
x_sm,y_sm = smote(data,"Deal Status Code")

before smote 0    62.625659
1    37.374341
Name: Deal Status Code, dtype: float64
after smote 1    50.0
0    50.0
Name: Deal Status Code, dtype: float64


In [56]:
## train-test split
x_train,x_test,y_train,y_test = train_test_split(x_sm,y_sm,test_size = 0.3,random_state = 400)

In [30]:
# logistic regression function
def logistic(x_train,y_train,x_test,y_test):
    p = {'penalty':['l1','none'],'C': [1.0,0.01,0.1,0.5],'solver':["lbfgs","sag","saga"],
         'max_iter':[50,100]}
    model = LogisticRegression()
    model_final = GridSearchCV(model,param_grid = p, cv = 4, verbose = 10)
    model_final.fit(x_train,y_train)
    print(model_final.best_params_)
    pred_train_prob = model_final.predict_proba(x_train)
    pred_test_prob = model_final.predict_proba(x_test)
    fpr, tpr, _ = metrics.roc_curve(np.array(y_train), pred_train_prob[:,1])
    auc_train = metrics.auc(fpr,tpr)
    fpr, tpr, _ = metrics.roc_curve(np.array(y_test), pred_test_prob[:,1])
    auc_test = metrics.auc(fpr,tpr)
    pred_train = np.where(pred_train_prob[:,1] <=0.49,0,1)
    pred_test = np.where(pred_test_prob[:,1] <=0.49,0,1)
    return pred_train,pred_test,auc_train,auc_test
logit_y_train_pred,logit_y_test_pred,auc_train,auc_test = logistic(x_train,y_train,x_test,y_test)

Fitting 4 folds for each of 48 candidates, totalling 192 fits
[CV] C=1.0, max_iter=50, penalty=l1, solver=lbfgs ....................
[CV]  C=1.0, max_iter=50, penalty=l1, solver=lbfgs, score=nan, total=   0.0s
[CV] C=1.0, max_iter=50, penalty=l1, solver=lbfgs ....................
[CV]  C=1.0, max_iter=50, penalty=l1, solver=lbfgs, score=nan, total=   0.0s
[CV] C=1.0, max_iter=50, penalty=l1, solver=lbfgs ....................
[CV]  C=1.0, max_iter=50, penalty=l1, solver=lbfgs, score=nan, total=   0.0s
[CV] C=1.0, max_iter=50, penalty=l1, solver=lbfgs ....................
[CV]  C=1.0, max_iter=50, penalty=l1, solver=lbfgs, score=nan, total=   0.0s
[CV] C=1.0, max_iter=50, penalty=l1, solver=sag ......................
[CV]  C=1.0, max_iter=50, penalty=l1, solver=sag, score=nan, total=   0.0s
[CV] C=1.0, max_iter=50, penalty=l1, solver=sag ......................
[CV]  C=1.0, max_iter=50, penalty=l1, solver=sag, score=nan, total=   0.0s
[CV] C=1.0, max_iter=50, penalty=l1, solver=sag ......

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.0s remaining:    0.0s


[CV]  C=1.0, max_iter=50, penalty=l1, solver=saga, score=0.687, total=   0.7s
[CV] C=1.0, max_iter=50, penalty=l1, solver=saga .....................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.6s remaining:    0.0s


[CV]  C=1.0, max_iter=50, penalty=l1, solver=saga, score=0.690, total=   0.7s
[CV] C=1.0, max_iter=50, penalty=l1, solver=saga .....................
[CV]  C=1.0, max_iter=50, penalty=l1, solver=saga, score=0.678, total=   0.7s
[CV] C=1.0, max_iter=50, penalty=l1, solver=saga .....................
[CV]  C=1.0, max_iter=50, penalty=l1, solver=saga, score=0.677, total=   0.7s
[CV] C=1.0, max_iter=50, penalty=none, solver=lbfgs ..................
[CV]  C=1.0, max_iter=50, penalty=none, solver=lbfgs, score=0.693, total=   0.1s
[CV] C=1.0, max_iter=50, penalty=none, solver=lbfgs ..................
[CV]  C=1.0, max_iter=50, penalty=none, solver=lbfgs, score=0.687, total=   0.1s
[CV] C=1.0, max_iter=50, penalty=none, solver=lbfgs ..................
[CV]  C=1.0, max_iter=50, penalty=none, solver=lbfgs, score=0.677, total=   0.1s
[CV] C=1.0, max_iter=50, penalty=none, solver=lbfgs ..................
[CV]  C=1.0, max_iter=50, penalty=none, solver=lbfgs, score=0.678, total=   0.1s
[CV] C=1.0, max_

[CV]  C=0.01, max_iter=50, penalty=none, solver=sag, score=0.694, total=   0.4s
[CV] C=0.01, max_iter=50, penalty=none, solver=sag ...................
[CV]  C=0.01, max_iter=50, penalty=none, solver=sag, score=0.692, total=   0.4s
[CV] C=0.01, max_iter=50, penalty=none, solver=sag ...................
[CV]  C=0.01, max_iter=50, penalty=none, solver=sag, score=0.681, total=   0.4s
[CV] C=0.01, max_iter=50, penalty=none, solver=sag ...................
[CV]  C=0.01, max_iter=50, penalty=none, solver=sag, score=0.678, total=   0.4s
[CV] C=0.01, max_iter=50, penalty=none, solver=saga ..................
[CV]  C=0.01, max_iter=50, penalty=none, solver=saga, score=0.693, total=   0.5s
[CV] C=0.01, max_iter=50, penalty=none, solver=saga ..................
[CV]  C=0.01, max_iter=50, penalty=none, solver=saga, score=0.692, total=   0.5s
[CV] C=0.01, max_iter=50, penalty=none, solver=saga ..................
[CV]  C=0.01, max_iter=50, penalty=none, solver=saga, score=0.681, total=   0.5s
[CV] C=0.01

[CV]  C=0.1, max_iter=50, penalty=none, solver=saga, score=0.678, total=   0.5s
[CV] C=0.1, max_iter=100, penalty=l1, solver=lbfgs ...................
[CV]  C=0.1, max_iter=100, penalty=l1, solver=lbfgs, score=nan, total=   0.0s
[CV] C=0.1, max_iter=100, penalty=l1, solver=lbfgs ...................
[CV]  C=0.1, max_iter=100, penalty=l1, solver=lbfgs, score=nan, total=   0.0s
[CV] C=0.1, max_iter=100, penalty=l1, solver=lbfgs ...................
[CV]  C=0.1, max_iter=100, penalty=l1, solver=lbfgs, score=nan, total=   0.0s
[CV] C=0.1, max_iter=100, penalty=l1, solver=lbfgs ...................
[CV]  C=0.1, max_iter=100, penalty=l1, solver=lbfgs, score=nan, total=   0.0s
[CV] C=0.1, max_iter=100, penalty=l1, solver=sag .....................
[CV]  C=0.1, max_iter=100, penalty=l1, solver=sag, score=nan, total=   0.0s
[CV] C=0.1, max_iter=100, penalty=l1, solver=sag .....................
[CV]  C=0.1, max_iter=100, penalty=l1, solver=sag, score=nan, total=   0.0s
[CV] C=0.1, max_iter=100, pena

[CV]  C=0.5, max_iter=100, penalty=l1, solver=saga, score=0.691, total=   0.4s
[CV] C=0.5, max_iter=100, penalty=l1, solver=saga ....................
[CV]  C=0.5, max_iter=100, penalty=l1, solver=saga, score=0.684, total=   0.4s
[CV] C=0.5, max_iter=100, penalty=l1, solver=saga ....................
[CV]  C=0.5, max_iter=100, penalty=l1, solver=saga, score=0.673, total=   0.4s
[CV] C=0.5, max_iter=100, penalty=l1, solver=saga ....................
[CV]  C=0.5, max_iter=100, penalty=l1, solver=saga, score=0.669, total=   0.4s
[CV] C=0.5, max_iter=100, penalty=none, solver=lbfgs .................
[CV]  C=0.5, max_iter=100, penalty=none, solver=lbfgs, score=0.693, total=   0.2s
[CV] C=0.5, max_iter=100, penalty=none, solver=lbfgs .................
[CV]  C=0.5, max_iter=100, penalty=none, solver=lbfgs, score=0.692, total=   0.2s
[CV] C=0.5, max_iter=100, penalty=none, solver=lbfgs .................
[CV]  C=0.5, max_iter=100, penalty=none, solver=lbfgs, score=0.682, total=   0.2s
[CV] C=0.5, 

[Parallel(n_jobs=1)]: Done 192 out of 192 | elapsed:  1.0min finished


{'C': 1.0, 'max_iter': 100, 'penalty': 'none', 'solver': 'lbfgs'}


In [31]:
# function to find precision ,recall ,f1-score ,auc of model
def accuracy_measure(y_actual,y_pred):
    print(metrics.classification_report(y_actual,y_pred))
    print("AUC Score train",auc_train)
    print("AUC Score test",auc_test)

In [140]:
accuracy_measure(y_train,logit_y_train_pred)
accuracy_measure(y_test,logit_y_test_pred)

              precision    recall  f1-score   support

           0       0.70      0.68      0.69      4371
           1       0.70      0.71      0.70      4429

    accuracy                           0.70      8800
   macro avg       0.70      0.70      0.70      8800
weighted avg       0.70      0.70      0.70      8800

AUC Score train 0.7734767300583667
AUC Score test 0.7684448512508595
              precision    recall  f1-score   support

           0       0.71      0.67      0.69      1915
           1       0.68      0.71      0.69      1857

    accuracy                           0.69      3772
   macro avg       0.69      0.69      0.69      3772
weighted avg       0.69      0.69      0.69      3772

AUC Score train 0.7734767300583667
AUC Score test 0.7684448512508595


In [133]:
# random forest function with hyperparameter tunning
def Random_forest(x_train,y_train,x_test,y_test):
    n_estimators = [int(x) for x in np.linspace(start = 10, stop = 1000, num = 20)]
    criterion = ['gini','entropy']
    max_features = ['auto', 'sqrt','log2']
    max_depth = [int(x) for x in np.linspace(3, 20, num = 5)]
    max_depth.append(None)
    min_samples_split = [2, 5, 7,10]
    min_samples_leaf = [1, 2, 4,5]
    bootstrap = [True, False]
    oob_score = [True,False]

    random_grid = {'n_estimators': n_estimators,
                   'criterion':criterion,
                   'max_features': max_features,
                   'max_depth': max_depth,
                   'min_samples_split': min_samples_split,
                   'min_samples_leaf': min_samples_leaf,
                   'bootstrap': bootstrap,
                   'oob_score':oob_score}

    rf = RandomForestClassifier()

    rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 4, verbose=10, random_state=400, n_jobs = -1)
    rf_random.fit(x_train, y_train)
    print(rf_random.best_params_)
    pred_train = rf_random.predict(x_train)
    pred_test = rf_random.predict(x_test)
    fpr, tpr, _ = metrics.roc_curve(np.array(y_train), rf_random.predict_proba(x_train)[:,1])
    auc_train = metrics.auc(fpr,tpr)
    fpr, tpr, _ = metrics.roc_curve(np.array(y_test), rf_random.predict_proba(x_test)[:,1])
    auc_test = metrics.auc(fpr,tpr)

    return pred_train,pred_test,auc_train,auc_test
rf_y_train_pred,rf_y_test_pred,,auc_train,auc_test = Random_forest(x_train,y_train,x_test,y_test)

Fitting 4 folds for each of 100 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    9.1s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   14.3s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   33.7s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   44.6s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   47.7s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   59.2s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed:  5

{'oob_score': False, 'n_estimators': 478, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': None, 'criterion': 'gini', 'bootstrap': True}


In [134]:
accuracy_measure(y_train,rf_y_train_pred)
accuracy_measure(y_test,rf_y_test_pred)

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4371
           1       0.99      0.99      0.99      4429

    accuracy                           0.99      8800
   macro avg       0.99      0.99      0.99      8800
weighted avg       0.99      0.99      0.99      8800

              precision    recall  f1-score   support

           0       0.79      0.79      0.79      1915
           1       0.78      0.78      0.78      1857

    accuracy                           0.78      3772
   macro avg       0.78      0.78      0.78      3772
weighted avg       0.78      0.78      0.78      3772



In [141]:
# gradient boosting algorithm with hyperparameter tunning
def gradient_boost(x_train,y_train,x_test,y_test):
    
    n_estimators = [int(x) for x in np.linspace(start = 10, stop = 1000, num = 20)]
    max_features = ['auto', 'sqrt']
    max_depth = [int(x) for x in np.linspace(3, 10, num = 1)]
    max_depth.append(None)
    min_samples_split = [2, 5, 10]
    min_samples_leaf = [1, 2, 4]
    loss=['deviance','exponential']
    learning_rate = [0.1,0.01,0.5]
    criterion =['friedman_mse','mse','mae']


    grid = {       'n_estimators': n_estimators,
                   'max_features': max_features,
                   'max_depth': max_depth,
                   'min_samples_split': min_samples_split,
                   'min_samples_leaf': min_samples_leaf,
                   'loss': loss,
                   'learning_rate': learning_rate,
                    'criterion' :criterion}
    gb = GradientBoostingClassifier()

    gf_tune = GridSearchCV(estimator = gb, param_grid = grid, cv = 2, verbose=2, n_jobs = -1)
    gf_tune.fit(x_train, y_train)

    print(gf_tune.best_params_)
    pred_train = gf_tune.predict(x_train)
    pred_test = gf_tune.predict(x_test)
    fpr, tpr, _ = metrics.roc_curve(np.array(y_train),gf_tune.predict_proba(x_train)[:,1])
    auc_train = metrics.auc(fpr,tpr)
    fpr, tpr, _ = metrics.roc_curve(np.array(y_test), gf_tune.predict_proba(x_test)[:,1])
    auc_test = metrics.auc(fpr,tpr)
    return pred_train,pred_test,auc_train,auc_test


gb_y_train_pred,gb_y_test_pred,auc_train,auc_test = Random_forest(x_train,y_train,x_test,y_test)
    

Fitting 4 folds for each of 100 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    9.9s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   14.1s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   28.0s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   35.4s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   38.4s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   47.0s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   57.5s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed:  4

{'oob_score': False, 'n_estimators': 478, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': None, 'criterion': 'gini', 'bootstrap': True}


ValueError: not enough values to unpack (expected 4, got 2)

In [36]:
#accuracy_measure(y_train,gb_y_train_pred)
#accuracy_measure(y_test,gb_y_test_pred)
x.columns

Index(['Client Category_automobiles', 'Client Category_consulting',
       'Client Category_consumer good', 'Client Category_domestic bank',
       'Client Category_domestic private bank',
       'Client Category_domestic public bank', 'Client Category_e-commerce',
       'Client Category_electronics', 'Client Category_energy',
       'Client Category_energy ',
       ...
       'Solution Type_solution 61', 'Solution Type_solution 62',
       'Solution Type_solution 63', 'Solution Type_solution 64',
       'Solution Type_solution 65', 'Solution Type_solution 66',
       'Solution Type_solution 67', 'Solution Type_solution 7',
       'Solution Type_solution 8', 'Solution Type_solution 9'],
      dtype='object', length=142)

In [33]:
import tensorflow as tf
from tensorflow import keras
import kerastuner as kt
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold

In [43]:
## sequential model with hyperparamter tunning
def model_builder(hp):
    
    model = keras.Sequential()
    model.add(keras.layers.Flatten(input_shape=(142, 1)))

  # Tune the number of units in the first Dense layer
  # Choose an optimal value between 32-512
    hp_units = hp.Int('units', min_value=32, max_value=512, step=32)
    model.add(keras.layers.Dense(units=hp_units, activation='relu'))
    model.add(keras.layers.Dense(10))

  # Tune the learning rate for the optimizer
  # Choose an optimal value from 0.01, 0.001, or 0.0001
    hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])

    model.compile(optimizer=keras.optimizers.Adam(learning_rate=hp_learning_rate),
                loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                metrics=['accuracy'])

    return model

In [44]:
tuner = kt.Hyperband(model_builder,
                     objective='val_accuracy',
                     max_epochs=10,
                     factor=3,
                     directory='my_dir',
                     project_name='intro_to_kt')

In [45]:
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

In [46]:
tuner.search(x_train, y_train, epochs=50, validation_split=0.2, callbacks=[stop_early])

# Get the optimal hyperparameters
best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]

print(f"""
The hyperparameter search is complete. The optimal number of units in the first densely-connected
layer is {best_hps.get('units')} and the optimal learning rate for the optimizer
is {best_hps.get('learning_rate')}.
""")

Trial 30 Complete [00h 00m 02s]
val_accuracy: 0.7136363387107849

Best val_accuracy So Far: 0.7136363387107849
Total elapsed time: 00h 00m 44s
INFO:tensorflow:Oracle triggered exit

The hyperparameter search is complete. The optimal number of units in the first densely-connected
layer is 288 and the optimal learning rate for the optimizer
is 0.01.



In [47]:
# Build the model with the optimal hyperparameters and train it on the data for 50 epochs
model = tuner.hypermodel.build(best_hps)
history = model.fit(x_train, y_train, epochs=50, validation_split=0.2)

val_acc_per_epoch = history.history['val_accuracy']
best_epoch = val_acc_per_epoch.index(max(val_acc_per_epoch)) + 1
print('Best epoch: %d' % (best_epoch,))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Best epoch: 42


In [48]:
hypermodel = tuner.hypermodel.build(best_hps)

# Retrain the model
hypermodel.fit(x_test, y_test, epochs=best_epoch)

Epoch 1/42
Epoch 2/42
Epoch 3/42
Epoch 4/42
Epoch 5/42
Epoch 6/42
Epoch 7/42
Epoch 8/42
Epoch 9/42
Epoch 10/42
Epoch 11/42
Epoch 12/42
Epoch 13/42
Epoch 14/42
Epoch 15/42
Epoch 16/42
Epoch 17/42
Epoch 18/42
Epoch 19/42
Epoch 20/42
Epoch 21/42
Epoch 22/42
Epoch 23/42
Epoch 24/42
Epoch 25/42
Epoch 26/42
Epoch 27/42
Epoch 28/42
Epoch 29/42
Epoch 30/42
Epoch 31/42
Epoch 32/42
Epoch 33/42
Epoch 34/42
Epoch 35/42
Epoch 36/42
Epoch 37/42
Epoch 38/42
Epoch 39/42
Epoch 40/42
Epoch 41/42
Epoch 42/42


<tensorflow.python.keras.callbacks.History at 0x2208ea52948>

In [49]:
eval_result = hypermodel.evaluate(x_test, y_test)
print("[test loss, test accuracy]:", eval_result)

[test loss, test accuracy]: [0.13930368423461914, 0.9427359700202942]


In [None]:
# calculating total loss by each model.
data_new  = pd.read_excel("Win_Prediction_Data.xlsx")

def loss(y_train,y_test,y_train_pred,y_test_pred,data):
    
    ind_test = y_test[y_test!= y_test_pred].index
    ind_train = y_train[y_train!= y_train_pred].index
    training_loss = data['Deal Cost'].iloc[ind_test[ind_test<len(data)]].sum()
    testing_loss = data['Deal Cost'].iloc[ind_train[ind_train<len(data)]].sum()
    total_loss = training_loss + testing_loss
    return total_loss
loss_by_logistic_model = loss(y_train,y_test,logit_y_train_pred,logit_y_test_pred,data_new)
loss_by_randomforest_model = loss(y_train,y_test,rf_y_train_pred,rf_y_test_pred,data_new)
loss_by_gradientboost_model = loss(y_train,y_test,gb_y_train_pred,gb_y_test_pred,data_new)