
# Import Python libraries.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score,f1_score
from sklearn.ensemble import GradientBoostingClassifier

import random
import os

In [2]:
seed = 232323#2021
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
set_seed(seed)

# Import Datasets

In [3]:
train_data = pd.read_csv("train_encoded_folds.csv").drop("ID",axis=1)
test_data = pd.read_csv("test_encoded.csv").drop("ID",axis=1)


In [4]:
train_data.head()

Unnamed: 0,sex,Aged,Married,TotalDependents,ServiceSpan,MobileService,4GService,CyberProtection,HardwareSupport,TechnicalAssistance,FilmSubscription,QuarterlyPayment,GrandPayment,CustomerAttrition,SettlementProcess_Bank,SettlementProcess_Card,SettlementProcess_Check,SettlementProcess_Electronic,kfold
0,0,0,0,1,48,1,2,0,1,1,0,96.259802,4628.129119,1,0,0,1,0,0
1,1,0,1,0,45,1,2,0,0,0,1,18.87647,922.085998,0,1,0,0,0,6
2,1,1,1,0,59,1,2,1,0,1,0,106.016057,6205.891129,1,0,1,0,0,2
3,1,0,0,0,14,1,2,0,0,0,0,80.108839,1248.655391,0,0,0,1,0,5
4,0,0,1,1,25,1,2,0,1,0,1,51.529773,1285.321277,0,0,0,1,0,6


In [5]:
test_data.head(3)

Unnamed: 0,sex,Aged,Married,TotalDependents,ServiceSpan,MobileService,4GService,CyberProtection,HardwareSupport,TechnicalAssistance,FilmSubscription,QuarterlyPayment,GrandPayment,SettlementProcess_Bank,SettlementProcess_Card,SettlementProcess_Check,SettlementProcess_Electronic
0,1,0,0,0,8,1,2,0,0,0,1,21.299937,151.092519,0,0,1,0
1,0,0,0,0,26,1,1,0,1,0,1,56.609623,1553.276979,0,0,1,0
2,1,0,0,0,66,1,1,0,0,0,1,25.16769,1684.262416,0,1,0,0


# Convert datatype of selected fields.

# Drop the dependent variable from the train/test dataset. 

In [6]:
 ytrain = train_data[['CustomerAttrition','kfold']]
 xtrain = train_data.drop(['kfold','CustomerAttrition'],axis=1)

 concat = pd.concat([xtrain,test_data],axis = 0)
 mini = concat.min(axis=0)
 maxi = concat.max(axis=0)
 concat = (concat - mini)/maxi
 xtrain = concat[:len(xtrain)]
 test_data = concat[len(xtrain):]
 train_data = pd.concat([xtrain,ytrain],axis=1)

# Train your model

In [7]:
def train(model,xtrain,ytrain,xval,yval):
    model.fit(xtrain,ytrain)
#     y_pred = model.predict(xtrain)
#     acc = accuracy_score(ytrain, y_pred)
#     f1  = f1_score(ytrain,y_pred)
#     print(f'Train acc: {acc} f1: {f1}')
    y_pred = model.predict(xval)
    acc = accuracy_score(yval, y_pred)
    f1  = f1_score(yval,y_pred)
#     print(f'Val acc: {acc} f1: {f1}')
    return model,y_pred,acc

def get_pred(model,xtest):
    return model.predict_proba(xtest)[:,1:].reshape(-1)

In [8]:
folds = 9

y_test_oof = []
y_pred_oof = []

all_preds = {}

for itr,fold in enumerate(range(folds)):
    X_train = train_data[train_data.kfold != fold].reset_index(drop=True).drop(['kfold'],axis=1)
    X_test  = train_data[train_data.kfold == fold].reset_index(drop=True).drop(['kfold'],axis=1)
    
    y_train = X_train["CustomerAttrition"]
    X_train = X_train.drop("CustomerAttrition",axis=1)
    
    y_test = X_test["CustomerAttrition"]
    X_test = X_test.drop("CustomerAttrition",axis=1)
    
    
    
    
    print(f"Train size {len(X_train)} | Val size {len(X_test)}")
    print()
    
    
    model=GradientBoostingClassifier(learning_rate=0.1, min_samples_split=500,min_samples_leaf=50,max_depth=9,max_features='log2',subsample=0.9,random_state=42)
#     model=GradientBoostingClassifier()
    model,y_pred, acc = train(model,X_train,y_train,X_test,y_test)
    
    
    print(f"Doing Prediction for acc {acc}")
    all_preds[str(acc) + str(itr)] = get_pred(model,test_data)
    
    y_test_oof.extend(y_test)
    y_pred_oof.extend(y_pred)
   
    
    print()
    


Train size 5634 | Val size 703

Doing Prediction for acc 0.786628733997155

Train size 5635 | Val size 702

Doing Prediction for acc 0.7692307692307693

Train size 5634 | Val size 703

Doing Prediction for acc 0.7837837837837838

Train size 5635 | Val size 702

Doing Prediction for acc 0.8062678062678063

Train size 5632 | Val size 705

Doing Prediction for acc 0.7971631205673759

Train size 5634 | Val size 703

Doing Prediction for acc 0.7880512091038406

Train size 5635 | Val size 702

Doing Prediction for acc 0.811965811965812

Train size 5622 | Val size 715

Doing Prediction for acc 0.8167832167832167

Train size 5635 | Val size 702

Doing Prediction for acc 0.7948717948717948



In [9]:
oof = accuracy_score(y_test_oof, y_pred_oof)
print('OOF acc: %f' % oof)

# sample_sub = pd.read_csv("Sample Submission.csv")
base = np.zeros(len(test_data))
for val in all_preds.values():
    base += np.array(val)

base/=folds

dummy_t = test_data.copy()
dummy_t['CustomerAttrition'] = 1*(base >=0.5)



# sample_sub['CustomerAttrition'] = 1*(base > (folds//2))

# sample_sub.replace({1:'Yes', 0:'No'}, inplace=True)
# sample_sub.to_csv("submission_gbc_0.795013.csv",index=False)

OOF acc: 0.795013


In [10]:
folds = 9

y_test_oof = []
y_pred_oof = []

all_preds = {}

for itr,fold in enumerate(range(folds)):
    X_train = train_data[train_data.kfold != fold].reset_index(drop=True).drop(['kfold'],axis=1)
    X_train = pd.concat([X_train,dummy_t],axis=0).reset_index(drop=True)
    X_train = X_train.sample(frac=1.0).reset_index(drop=True)
    X_test  = train_data[train_data.kfold == fold].reset_index(drop=True).drop(['kfold'],axis=1)
    
    y_train = X_train["CustomerAttrition"]
    X_train = X_train.drop("CustomerAttrition",axis=1)
    
    y_test = X_test["CustomerAttrition"]
    X_test = X_test.drop("CustomerAttrition",axis=1)
    
    
    
    
    print(f"Train size {len(X_train)} | Val size {len(X_test)}")
    print()
    
    
    model=GradientBoostingClassifier(learning_rate=0.1, min_samples_split=500,min_samples_leaf=50,max_depth=9,max_features='log2',subsample=0.9,random_state=42)
#     model=GradientBoostingClassifier()
    model,y_pred, acc = train(model,X_train,y_train,X_test,y_test)
    
    
    print(f"Doing Prediction for acc {acc}")
    all_preds[str(acc) + str(itr)] = get_pred(model,test_data)
    
    y_test_oof.extend(y_test)
    y_pred_oof.extend(y_pred)
   
    
    print()
    


Train size 6339 | Val size 703

Doing Prediction for acc 0.7908961593172119

Train size 6340 | Val size 702

Doing Prediction for acc 0.7692307692307693

Train size 6339 | Val size 703

Doing Prediction for acc 0.7923186344238976

Train size 6340 | Val size 702

Doing Prediction for acc 0.8062678062678063

Train size 6337 | Val size 705

Doing Prediction for acc 0.8056737588652483

Train size 6339 | Val size 703

Doing Prediction for acc 0.7965860597439545

Train size 6340 | Val size 702

Doing Prediction for acc 0.811965811965812

Train size 6327 | Val size 715

Doing Prediction for acc 0.8181818181818182

Train size 6340 | Val size 702

Doing Prediction for acc 0.7962962962962963



In [11]:
oof = accuracy_score(y_test_oof, y_pred_oof)
print('OOF acc: %f' % oof)

sample_sub = pd.read_csv("Sample Submission.csv")
base = np.zeros(len(test_data))
for val in all_preds.values():
    base += np.array(val)

base/=folds

sample_sub['CustomerAttrition'] = 1*(base >=0.5)

sample_sub.replace({1:'Yes', 0:'No'}, inplace=True)
sample_sub.to_csv("submission_gbc1_retraintest232323.csv",index=False)

OOF acc: 0.798643


In [None]:
def trainseed(seed):
    set_seed(seed)
    print(f"Seed : {seed}")
    
    folds = 9

    y_test_oof = []
    y_pred_oof = []

    all_preds = {}

    for itr,fold in enumerate(range(folds)):
        X_train = train_data[train_data.kfold != fold].reset_index(drop=True).drop(['kfold'],axis=1)
        X_test  = train_data[train_data.kfold == fold].reset_index(drop=True).drop(['kfold'],axis=1)

        y_train = X_train["CustomerAttrition"]
        X_train = X_train.drop("CustomerAttrition",axis=1)

        y_test = X_test["CustomerAttrition"]
        X_test = X_test.drop("CustomerAttrition",axis=1)




#         print(f"Train size {len(X_train)} | Val size {len(X_test)}")
#         print()


        model=GradientBoostingClassifier(learning_rate=0.1, min_samples_split=500,min_samples_leaf=50,max_depth=9,max_features='log2',subsample=0.9,random_state=42)
    #     model=GradientBoostingClassifier()
        model,y_pred, acc = train(model,X_train,y_train,X_test,y_test)


#         print(f"Doing Prediction for acc {acc}")
        all_preds[str(acc) + str(itr)] = get_pred(model,test_data)

        y_test_oof.extend(y_test)
        y_pred_oof.extend(y_pred)


#         print()
    
    oof = accuracy_score(y_test_oof, y_pred_oof)
    print('OOF acc: %f' % oof)

    # sample_sub = pd.read_csv("Sample Submission.csv")
    base = np.zeros(len(test_data))
    for val in all_preds.values():
        base += np.array(val)

    base/=folds

    dummy_t = test_data.copy()
    dummy_t['CustomerAttrition'] = 1*(base >=0.5)

    folds = 9

    y_test_oof = []
    y_pred_oof = []

    all_preds = {}

    for itr,fold in enumerate(range(folds)):
        X_train = train_data[train_data.kfold != fold].reset_index(drop=True).drop(['kfold'],axis=1)
        X_train = pd.concat([X_train,dummy_t],axis=0).reset_index(drop=True)
        X_train = X_train.sample(frac=1.0).reset_index(drop=True)
        X_test  = train_data[train_data.kfold == fold].reset_index(drop=True).drop(['kfold'],axis=1)

        y_train = X_train["CustomerAttrition"]
        X_train = X_train.drop("CustomerAttrition",axis=1)

        y_test = X_test["CustomerAttrition"]
        X_test = X_test.drop("CustomerAttrition",axis=1)




#         print(f"Train size {len(X_train)} | Val size {len(X_test)}")
#         print()


        model=GradientBoostingClassifier(learning_rate=0.1, min_samples_split=500,min_samples_leaf=50,max_depth=9,max_features='log2',subsample=0.9,random_state=42)
    #     model=GradientBoostingClassifier()
        model,y_pred, acc = train(model,X_train,y_train,X_test,y_test)


#         print(f"Doing Prediction for acc {acc}")
        all_preds[str(acc) + str(itr)] = get_pred(model,test_data)

        y_test_oof.extend(y_test)
        y_pred_oof.extend(y_pred)


#         print()
    
    oof = accuracy_score(y_test_oof, y_pred_oof)
    print('OOF acc: %f' % oof)

    sample_sub = pd.read_csv("Sample Submission.csv")
    base = np.zeros(len(test_data))
    for val in all_preds.values():
        base += np.array(val)

    base/=folds


    

In [None]:
seeds = [232323,2020,486,21,26,309]
for se in seeds:
    trainseed(se)

In [None]:
seeds = [232323,20,486,21,26,309]
for se in seeds:
    trainseed(se)

In [None]:
trainseed(24242424)

In [None]:
sample_sub

In [None]:
feat_importances = pd.Series(model.feature_importances_, index=X_train.columns)
feat_importances.nlargest(10).plot(kind='barh')
plt.show()