## Model Testing

In [15]:
def LogicResult_allcoef(X,y,Penalty='l2',c=1,Solver='lbfgs'):
    
    X_train,X_test,y_train,y_test= train_test_split(X,y,test_size=0.2, \
                                                        stratify=y, random_state=0)
    
    scaler_train=StandardScaler()
    scaler_train.fit(X_train)
    X_train_scaled = pd.DataFrame(scaler_train.transform(X_train),columns = X_train.columns)
    
    scaler_test=StandardScaler()
    scaler_test.fit(X_test)
    X_test_scaled = pd.DataFrame(scaler_test.transform(X_test),columns = X_test.columns)
    
    #Lasso
    classifier = LogisticRegression(random_state=0, penalty=Penalty,solver=Solver,C=c)
    classifier.fit(X_train_scaled, y_train)
    
    coef_table = pd.DataFrame(list(X_train.columns)).copy()
    coef_table.insert(len(coef_table.columns),"Coefs",classifier.coef_.transpose())
    coef_table.columns=['Variable','Coefs']
    coef_table['Coefs_ABS']=coef_table['Coefs'].abs()
    coef=coef_table.sort_values('Coefs_ABS',ascending=False).reset_index(drop=True)
    coef=coef.drop('Coefs_ABS',axis=1)

    
    y_pred = classifier.predict(X_test_scaled)
    acc = accuracy_score(y_test,y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test,y_pred)
    results = pd.DataFrame([['Logistic Regression', acc,prec,rec,f1]],\
                       columns=['Model', 'Accuracy', 'Precision', 'Recall','F1 Score'])
    return coef,results

In [16]:
def LogicResult_allcoef_10fold(X,y,Penalty,c,Solver):

    scoring = ['accuracy','precision', 'recall','f1']

    classifier = LogisticRegression(random_state=0, penalty=Penalty,solver=Solver,C=c)
    clf = make_pipeline(preprocessing.StandardScaler(), classifier)
    scores = cross_validate(clf, X, y, scoring=scoring,cv=10)

    return scores

In [17]:
pen=['l1','l2']
c = [0.01,0.1, 1, 10, 100, 1000]
solver = ['liblinear', 'saga']
def best_Model(X,y,penalty,C,Solver,hyper_param=False):
    rank = pd.DataFrame(columns = ['accuracy','precision','recall','f1'])
    for i in penalty:
        for j in C:
            for k in Solver:
                tmp_score = LogicResult_allcoef_10fold(X,y,i,j,k)
                tmp_name = 'Logistic {i1} C={j1} {k1}'.format(i1=i,j1=j,k1=k)
                rank.loc[tmp_name,'accuracy'] = tmp_score['test_accuracy'].mean()
                rank.loc[tmp_name,'precision'] = tmp_score['test_precision'].mean()
                rank.loc[tmp_name,'recall'] = tmp_score['test_recall'].mean()
                rank.loc[tmp_name,'f1'] = tmp_score['test_f1'].mean()
                rank.loc[tmp_name,'penalty']=i
                rank.loc[tmp_name,'c']=j
                rank.loc[tmp_name,'solver']=k
    #if all score are the same, prefer lasso to ridge            
    rank = rank.sort_values(['recall','accuracy','f1','penalty'],ascending = False)
    if hyper_param == False:
        return rank
    else:
        best_pen =rank.head(1)['penalty'].values[0]
        best_c =rank.head(1)['c'].values[0]
        best_sol =rank.head(1)['solver'].values[0]
        return best_pen, best_c, best_sol

In [18]:
def LogicResult_allcoef_next_year(X_cur,y_cur,X_next,y_next,Penalty,c,Solver):

    scaler_current=StandardScaler()
    scaler_current.fit(X_cur)
    X_cur_scaled = pd.DataFrame(scaler_current.transform(X_cur),columns = X_cur.columns)
    
    scaler_next=StandardScaler()
    scaler_next.fit(X_next)
    X_next_scaled = pd.DataFrame(scaler_next.transform(X_next),columns = X_next.columns)
    
    #Lasso
    classifier = LogisticRegression(random_state=0, penalty=Penalty,solver=Solver,C=c)
    classifier.fit(X_cur_scaled, y_cur)
    
    coef_table = pd.DataFrame(list(X_cur.columns)).copy()
    coef_table.insert(len(coef_table.columns),"Coefs",classifier.coef_.transpose())
    coef_table.columns=['Variable','Coefs']
    coef_table['Coefs_ABS']=coef_table['Coefs'].abs()
    coef=coef_table.sort_values('Coefs_ABS',ascending=False).reset_index(drop=True)
    coef=coef.drop('Coefs_ABS',axis=1)

    
    y_pred = classifier.predict(X_next_scaled)
    acc = accuracy_score(y_next,y_pred)
    prec = precision_score(y_next, y_pred)
    rec = recall_score(y_next, y_pred)
    f1 = f1_score(y_next,y_pred)
    results = pd.DataFrame([['Logistic Regression', acc,prec,rec,f1]],\
                       columns=['Model', 'Accuracy', 'Precision', 'Recall','F1 Score'])
    return coef,results

In [19]:
def SFM_RFE_featrue_selection(X,y,fit_all=False):
    if fit_all == False:
        X_train,X_test,y_train,y_test= train_test_split(X,y,test_size=0.2, \
                                                        stratify=y, random_state=0)
    
        scaler_train=StandardScaler()
        scaler_train.fit(X_train)
        X_train_scaled = pd.DataFrame(scaler_train.transform(X_train),columns = X_train.columns)

    
        #SFM
        selector_sfm = SelectFromModel(estimator=LogisticRegression()).fit(X_train_scaled, y_train)
        sfm_feature=X_train.columns[selector_sfm.get_support()]
    
        #RFE
        selector_rfe = RFE(estimator=LogisticRegression(), n_features_to_select=len(sfm_feature), step=1)\
        .fit(X_train_scaled, y_train)
        rfe_feature=X_train.columns[selector_rfe.get_support()]
    
        selected_feature = [x for x in sfm_feature if x in rfe_feature]
        if len(selected_feature)< 5:
            selected_feature_long=rfe_feature.union(sfm_feature)
            return selected_feature_long
        else:
            return selected_feature
    else:
        scaler_train=StandardScaler()
        scaler_train.fit(X)
        X_scaled = pd.DataFrame(scaler_train.transform(X),columns = X.columns)
        
        #SFM
        selector_sfm = SelectFromModel(estimator=LogisticRegression()).fit(X_scaled, y)
        sfm_feature=X.columns[selector_sfm.get_support()]
    
        #RFE
        selector_rfe = RFE(estimator=LogisticRegression(), n_features_to_select=len(sfm_feature), step=1)\
        .fit(X_scaled, y)
        rfe_feature=X.columns[selector_rfe.get_support()]
    
        selected_feature = [x for x in sfm_feature if x in rfe_feature]
        if len(selected_feature)< 5:
            selected_feature_long=rfe_feature.union(sfm_feature)
            return selected_feature_long
        else:
            return selected_feature

# Feature Reduction Result

## 3 types of Model
Type 1: <br>
Model based on 2015-2019 data/ 80% training and 20 testing <br>
Target period :2018/6 - 2019/6 <br>
Feature period: 2015/1- 2018/6 <br>
<br>
Type 2:<br>
Model based on 2015-2019, but test on 2015-2020 <br>
<br>
Type 3 <br>
Model based on 2015-2020 data/ 80% training and 20 testing <br>
Target period :2019/6 - 2020/6 <br>
Feature period: 2015/1- 2019/6 <br>

## Tier 1

### Type 1 Model

In [20]:
rank1_18_all= best_Model(X1,y1,pen,c,solver)
best_pen,best_c,best_sol = best_Model(X1,y1,pen,c,solver,hyper_param=True)
coef1_18_all,evaluation1_18_all=LogicResult_allcoef(X1,y1,best_pen,best_c,best_sol)
evaluation1_18_all

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Logistic Regression,0.965152,0.714286,0.192308,0.30303


In [21]:
rank1_18_all.head(1)

Unnamed: 0,accuracy,precision,recall,f1,penalty,c,solver
Logistic l2 C=1 liblinear,0.965152,0.660931,0.3,0.394294,l2,1.0,liblinear


In [22]:
sfm_rfe_18_1=SFM_RFE_featrue_selection(X1,y1)
rank1_18= best_Model(X1[sfm_rfe_18_1],y1,pen,c,solver)
best_pen,best_c,best_sol = best_Model(X1[sfm_rfe_18_1],y1,pen,c,solver,hyper_param=True)
coef1_18,evaluation1_18=LogicResult_allcoef(X1[sfm_rfe_18_1],y1,best_pen,best_c,best_sol)
evaluation1_18

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Logistic Regression,0.965152,0.666667,0.230769,0.342857


In [23]:
rank1_18.head(1)

Unnamed: 0,accuracy,precision,recall,f1,penalty,c,solver
Logistic l2 C=1 saga,0.964848,0.659524,0.276923,0.376112,l2,1.0,saga


### Type 2 Model

In [24]:
best_pen,best_c,best_sol = best_Model(X1,y1,pen,c,solver,hyper_param=True)
coef1_18_next_all,evaluation1_18_next_all=LogicResult_allcoef_next_year(X1,y1,X1_19\
                                                      ,y1_19,best_pen,best_c,best_sol)
evaluation1_18_next_all

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Logistic Regression,0.939701,0.866667,0.252918,0.391566


In [25]:
sfm_rfe_18_1_next=SFM_RFE_featrue_selection(X1,y1,fit_all=True)
best_pen,best_c,best_sol = best_Model(X1[sfm_rfe_18_1_next],y1,pen,c,solver,hyper_param=True)
coef1_18_next,evaluation1_18_next=LogicResult_allcoef_next_year(X1[sfm_rfe_18_1_next],y1,X1_19[sfm_rfe_18_1_next]\
                                                      ,y1_19,best_pen,best_c,best_sol)
evaluation1_18_next

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Logistic Regression,0.939403,0.885714,0.241245,0.379205


### Type 3 Model

In [26]:
rank1_19_all= best_Model(X1_19,y1_19,pen,c,solver)
best_pen,best_c,best_sol = best_Model(X1_19,y1_19,pen,c,solver,hyper_param=True)
coef1_19_all,evaluation1_19_all=LogicResult_allcoef(X1_19,y1_19,best_pen,best_c,best_sol)
evaluation1_19_all

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Logistic Regression,0.962687,0.825,0.647059,0.725275


In [27]:
rank1_19_all.head(1)

Unnamed: 0,accuracy,precision,recall,f1,penalty,c,solver
Logistic l1 C=1 liblinear,0.957612,0.836026,0.560308,0.667848,l1,1.0,liblinear


In [28]:
sfm_rfe_19_1=SFM_RFE_featrue_selection(X1_19,y1_19)
rank1_19= best_Model(X1_19[sfm_rfe_19_1],y1_19,pen,c,solver)
best_pen,best_c,best_sol = best_Model(X1_19[sfm_rfe_19_1],y1_19,pen,c,solver,hyper_param=True)
coef1_19,evaluation1_19=LogicResult_allcoef(X1_19[sfm_rfe_19_1],y1_19,best_pen,best_c,best_sol)
evaluation1_19

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Logistic Regression,0.965672,0.888889,0.627451,0.735632


In [29]:
rank1_19.head(1)

Unnamed: 0,accuracy,precision,recall,f1,penalty,c,solver
Logistic l2 C=1 liblinear,0.959403,0.862805,0.560154,0.677214,l2,1.0,liblinear


In [30]:
sfm_rfe_18_1

['Recency',
 'Aggregate_index',
 'PRODUCT_MODEL_mode_Multiple',
 'Main_Product_mode_CIJ',
 'Main_Product_mode_TTO',
 'Lastest_Amt_Ratio',
 'Visit_per_TRX']

In [31]:
sfm_rfe_18_1_next

['Recency',
 'Aggregate_index',
 'PRODUCT_MODEL_mode_Multiple',
 'Main_Product_mode_CIJ',
 'Main_Product_mode_TTO',
 'Contract',
 'Lastest_Amt_Ratio',
 'Visit_per_TRX']

In [32]:
sfm_rfe_19_1

Index(['CUSTOMER_CLASS_OEM', 'Contract', 'Lastest_Amt_Ratio',
       'Main_Product_mode_TTO', 'NO_OF_SVC_INCIDENTS', 'Recency',
       'SALES_CHANNEL_mode_Esker', 'Visit_per_TRX'],
      dtype='object')

## Tier 2

### Type 1 Model

In [33]:
rank2_18_all= best_Model(X2,y2,pen,c,solver)
best_pen,best_c,best_sol = best_Model(X2,y2,pen,c,solver,hyper_param=True)
coef2_18_all,evaluation2_18_all=LogicResult_allcoef(X2,y2,best_pen,best_c,best_sol)
evaluation2_18_all

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Logistic Regression,0.889485,0.77907,0.443709,0.565401


In [34]:
rank2_18_all.head(1)

Unnamed: 0,accuracy,precision,recall,f1,penalty,c,solver
Logistic l2 C=1 liblinear,0.893541,0.790747,0.465316,0.57708,l2,1.0,liblinear


In [35]:
sfm_rfe_18_2=SFM_RFE_featrue_selection(X2,y2)
rank2_18= best_Model(X2[sfm_rfe_18_2],y2,pen,c,solver)
best_pen,best_c,best_sol = best_Model(X2[sfm_rfe_18_2],y2,pen,c,solver,hyper_param=True)
coef2_18,evaluation2_18=LogicResult_allcoef(X2[sfm_rfe_18_2],y2,best_pen,best_c,best_sol)
evaluation2_18

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Logistic Regression,0.888412,0.783133,0.430464,0.555556


In [36]:
rank2_18.head(1)

Unnamed: 0,accuracy,precision,recall,f1,penalty,c,solver
Logistic l2 C=0.1 liblinear,0.896331,0.812294,0.465368,0.585322,l2,0.1,liblinear


### Type 2 Model

In [37]:
best_pen,best_c,best_sol = best_Model(X2,y2,pen,c,solver,hyper_param=True)
coef2_18_next_all,evaluation2_18_next_all=LogicResult_allcoef_next_year(X2,y2,X2_19\
                                                      ,y2_19,best_pen,best_c,best_sol)
evaluation2_18_next_all

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Logistic Regression,0.862921,0.848987,0.432052,0.572671


In [38]:
sfm_rfe_18_2_next=SFM_RFE_featrue_selection(X2,y2,fit_all=True)
best_pen,best_c,best_sol = best_Model(X2[sfm_rfe_18_2_next],y2,pen,c,solver,hyper_param=True)
coef2_18_next,evaluation2_18_next=LogicResult_allcoef_next_year(X2[sfm_rfe_18_2_next],y2,X2_19[sfm_rfe_18_2_next]\
                                                      ,y2_19,best_pen,best_c,best_sol)
evaluation2_18_next

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Logistic Regression,0.862323,0.848148,0.429241,0.570006


### Type 3 Model

In [39]:
rank2_19_all= best_Model(X2_19,y2_19,pen,c,solver)
best_pen,best_c,best_sol = best_Model(X2_19,y2_19,pen,c,solver,hyper_param=True)
coef2_19_all,evaluation2_19_all=LogicResult_allcoef(X2_19,y2_19,best_pen,best_c,best_sol)
evaluation2_19_all

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Logistic Regression,0.871514,0.823077,0.502347,0.623907


In [40]:
rank2_19_all.head(1)

Unnamed: 0,accuracy,precision,recall,f1,penalty,c,solver
Logistic l2 C=100 saga,0.878461,0.829577,0.551129,0.653221,l2,100.0,saga


In [41]:
sfm_rfe_19_2=SFM_RFE_featrue_selection(X2_19,y2_19)
rank2_19= best_Model(X2_19[sfm_rfe_19_2],y2_19,pen,c,solver)
best_pen,best_c,best_sol = best_Model(X2_19[sfm_rfe_19_2],y2_19,pen,c,solver,hyper_param=True)
coef2_19,evaluation2_19=LogicResult_allcoef(X2_19[sfm_rfe_19_2],y2_19,best_pen,best_c,best_sol)
evaluation2_19

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Logistic Regression,0.86753,0.822581,0.478873,0.605341


In [42]:
rank2_19.head(1)

Unnamed: 0,accuracy,precision,recall,f1,penalty,c,solver
Logistic l2 C=10 liblinear,0.880254,0.844537,0.546429,0.654731,l2,10.0,liblinear


In [43]:
sfm_rfe_18_2

['Frequency',
 'Recency',
 'PRODUCT_MODEL_mode_Multiple',
 'NO_OF_SVC_INCIDENTS',
 'Avg_AMT']

In [44]:
sfm_rfe_18_2_next

Index(['Avg_AMT', 'Frequency', 'Main_Product_mode_CIJ',
       'Main_Product_mode_TIJ', 'Main_Product_mode_TTO', 'NO_OF_SVC_INCIDENTS',
       'PRODUCT_MODEL_mode_Multiple', 'Recency'],
      dtype='object')

In [45]:
sfm_rfe_19_2

['Frequency',
 'Recency',
 'Main_Product_mode_CIJ',
 'Lastest_Amt_Ratio',
 'Visit_per_TRX',
 'Avg_AMT']

## Tier 3

### Type 1 Model

In [46]:
rank3_18_all= best_Model(X3,y3,pen,c,solver)
best_pen,best_c,best_sol = best_Model(X3,y3,pen,c,solver,hyper_param=True)
coef3_18_all,evaluation3_18_all=LogicResult_allcoef(X3,y3,best_pen,best_c,best_sol)
evaluation3_18_all

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Logistic Regression,0.692593,0.719072,0.830357,0.770718


In [47]:
rank3_18_all.head(1)

Unnamed: 0,accuracy,precision,recall,f1,penalty,c,solver
Logistic l2 C=0.01 saga,0.702063,0.751956,0.829819,0.764997,l2,0.01,saga


In [48]:
sfm_rfe_18_3=SFM_RFE_featrue_selection(X3,y3)
rank3_18= best_Model(X3[sfm_rfe_18_3],y3,pen,c,solver)
best_pen,best_c,best_sol = best_Model(X3[sfm_rfe_18_3],y3,pen,c,solver,hyper_param=True)
coef3_18,evaluation3_18=LogicResult_allcoef(X3[sfm_rfe_18_3],y3,best_pen,best_c,best_sol)
evaluation3_18

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Logistic Regression,0.690741,0.726542,0.806548,0.764457


In [49]:
rank3_18.head(1)

Unnamed: 0,accuracy,precision,recall,f1,penalty,c,solver
Logistic l2 C=0.01 saga,0.707255,0.753009,0.838772,0.771437,l2,0.01,saga


### Type 2 Model

In [50]:
best_pen,best_c,best_sol = best_Model(X3,y3,pen,c,solver,hyper_param=True)
coef3_18_next_all,evaluation3_18_next_all=LogicResult_allcoef_next_year(X3,y3,X3_19\
                                                      ,y3_19,best_pen,best_c,best_sol)
evaluation3_18_next_all

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Logistic Regression,0.757126,0.751295,0.885496,0.812894


In [51]:
sfm_rfe_18_3_next=SFM_RFE_featrue_selection(X3,y3,fit_all=True)
best_pen,best_c,best_sol = best_Model(X3[sfm_rfe_18_3_next],y3,pen,c,solver,hyper_param=True)
coef3_18_next,evaluation3_18_next=LogicResult_allcoef_next_year(X3[sfm_rfe_18_3_next],y3,X3_19[sfm_rfe_18_3_next]\
                                                      ,y3_19,best_pen,best_c,best_sol)
evaluation3_18_next

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Logistic Regression,0.755609,0.747755,0.890076,0.812732


### Type 3 Model

In [52]:
rank3_19_all= best_Model(X3_19,y3_19,pen,c,solver)
best_pen,best_c,best_sol = best_Model(X3_19,y3_19,pen,c,solver,hyper_param=True)
coef3_19_all,evaluation3_19_all=LogicResult_allcoef(X3_19,y3_19,best_pen,best_c,best_sol)
evaluation3_19_all

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Logistic Regression,0.745455,0.762238,0.832061,0.79562


In [53]:
rank3_19_all.head(1)

Unnamed: 0,accuracy,precision,recall,f1,penalty,c,solver
Logistic l2 C=0.01 saga,0.740162,0.793575,0.817735,0.779527,l2,0.01,saga


In [54]:
sfm_rfe_19_3=SFM_RFE_featrue_selection(X3_19,y3_19)
rank3_19= best_Model(X3_19[sfm_rfe_19_3],y3_19,pen,c,solver)
best_pen,best_c,best_sol = best_Model(X3_19[sfm_rfe_19_3],y3_19,pen,c,solver,hyper_param=True)
coef3_19,evaluation3_19=LogicResult_allcoef(X3_19[sfm_rfe_19_3],y3_19,best_pen,best_c,best_sol)
evaluation3_19

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Logistic Regression,0.75303,0.763761,0.847328,0.803378


In [55]:
rank3_19.head(1)

Unnamed: 0,accuracy,precision,recall,f1,penalty,c,solver
Logistic l2 C=0.01 saga,0.751387,0.799654,0.828962,0.792391,l2,0.01,saga


In [56]:
sfm_rfe_18_3

['Frequency',
 'Recency',
 'CUSTOMER_CLASS_DISTRIBUTOR',
 'CUSTOMER_CLASS_OEM',
 'Avg_AMT',
 'Avg_Duration']

In [57]:
sfm_rfe_18_3_next

Index(['Avg_AMT', 'Avg_Duration', 'CUSTOMER_CLASS_OEM', 'Frequency',
       'Main_Product_mode_CIJ', 'Recency'],
      dtype='object')

In [58]:
sfm_rfe_19_3

['Frequency',
 'Recency',
 'CUSTOMER_CLASS_OEM',
 'Visit_per_TRX',
 'Avg_AMT',
 'Avg_Duration']