In [13]:
import numpy as np
import pandas as pd

In [14]:
# Encodings with -1 and without one hot encoding
train_1 = pd.read_csv("train_1encoded.csv")
train_1['SettlementProcess'].replace({'Card':0, 'Bank':1, 'Check':2, 'Electronic':3}, inplace=True)
test_1  = pd.read_csv("test_1encoded.csv")
test_1['SettlementProcess'].replace({'Card':0, 'Bank':1, 'Check':2, 'Electronic':3}, inplace=True)

# Encodings without -1 and with one hot encoding
train_2 = pd.read_csv("train_encoded_noh.csv")
train_2['SettlementProcess'].replace({'Card':0, 'Bank':1, 'Check':2, 'Electronic':3}, inplace=True)
test_2  = pd.read_csv("test_encoded_noh.csv")
test_2['SettlementProcess'].replace({'Card':0, 'Bank':1, 'Check':2, 'Electronic':3}, inplace=True)

# Trivial encoding based on prev without 1 and one hot encoding is done
train_3 = pd.read_csv("train_encoded.csv")
test_3  = pd.read_csv("test_encoded.csv")

# Introduce few new features train_1  ----->

train_4 = train_1.copy()
train_4['G/Q'] = train_4['GrandPayment']/train_4['QuarterlyPayment']
train_4['G/S'] = train_4['GrandPayment']/train_4['ServiceSpan']
train_4['Q/S'] = train_4['QuarterlyPayment']/train_4['ServiceSpan']
train_4['Services_add'] = train_4['MobileService']+train_4['4GService']+ train_4['CyberProtection']+ train_4['HardwareSupport']+train_4['TechnicalAssistance'] + train_4['FilmSubscription']
train_4['SAMT'] = train_4['sex']*(1-train_4['Aged'])*(1-train_4['Married']) +train_4['TotalDependents']

test_4 = test_1.copy()
test_4['G/Q'] = test_4['GrandPayment']/test_4['QuarterlyPayment']
test_4['G/S'] = test_4['GrandPayment']/test_4['ServiceSpan']
test_4['Q/S'] = test_4['QuarterlyPayment']/test_4['ServiceSpan']
test_4['Services_add'] = test_4['MobileService']+test_4['4GService']+ test_4['CyberProtection']+ test_4['HardwareSupport']+test_4['TechnicalAssistance'] + test_4['FilmSubscription']
test_4['SAMT'] = test_4['sex']*(1-test_4['Aged'])*(1-test_4['Married']) +test_4['TotalDependents']

# Introduce few new features train_2  ----->

train_5 = train_2.copy()
train_5['G/Q'] = train_5['GrandPayment']/train_5['QuarterlyPayment']
train_5['G/S'] = train_5['GrandPayment']/train_5['ServiceSpan']
train_5['Q/S'] = train_5['QuarterlyPayment']/train_5['ServiceSpan']
train_5['Services_add']= train_5['MobileService']+train_5['4GService']+ train_5['CyberProtection']+ train_5['HardwareSupport']+train_5['TechnicalAssistance'] + train_5['FilmSubscription']
train_5['SAMT'] = train_5['sex']*(1-train_5['Aged'])*(1-train_5['Married']) +train_5['TotalDependents']

test_5 = test_2.copy()
test_5['G/Q'] = test_5['GrandPayment']/test_5['QuarterlyPayment']
test_5['G/S'] = test_5['GrandPayment']/test_5['ServiceSpan']
test_5['Q/S'] = test_5['QuarterlyPayment']/test_5['ServiceSpan']
test_5['Services_add'] = test_5['MobileService']+test_5['4GService']+ test_5['CyberProtection']+ test_5['HardwareSupport']+test_5['TechnicalAssistance'] + test_5['FilmSubscription']
test_5['SAMT'] = test_5['sex']*(1-test_5['Aged'])*(1-test_5['Married']) +test_5['TotalDependents']

# Introduce few new features train_3  ----->

train_6 = train_3.copy()
train_6['G/Q'] = train_6['GrandPayment']/train_6['QuarterlyPayment']
train_6['G/S'] = train_6['GrandPayment']/train_6['ServiceSpan']
train_6['Q/S'] = train_6['QuarterlyPayment']/train_6['ServiceSpan']
train_6['Services_add'] = train_6['MobileService']+train_6['4GService']+ train_6['CyberProtection']+ train_6['HardwareSupport']+train_6['TechnicalAssistance'] + train_6['FilmSubscription']
train_6['SAMT'] = train_6['sex']*(1-train_6['Aged'])*(1-train_6['Married']) +train_6['TotalDependents']

test_6 = test_3.copy()
test_6['G/Q'] = test_6['GrandPayment']/test_6['QuarterlyPayment']
test_6['G/S'] = test_6['GrandPayment']/test_6['ServiceSpan']
test_6['Q/S'] = test_6['QuarterlyPayment']/test_6['ServiceSpan']
test_6['Services_add'] = test_6['MobileService']+test_6['4GService']+ test_6['CyberProtection']+ test_6['HardwareSupport']+test_6['TechnicalAssistance'] + test_6['FilmSubscription']
test_6['SAMT'] = test_6['sex']*(1-test_6['Aged'])*(1-test_6['Married']) +test_6['TotalDependents']


In [15]:
### General ###
import os
import sys
import copy
import pickle
import random
import warnings
warnings.filterwarnings("ignore")
sys.path.append("./tfms")

### Data Wrangling ###
import numpy as np
import pandas as pd
from scipy import stats
from gauss_rank_scaler import GaussRankScaler

### Data Visualization ###
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use("fivethirtyeight")

### Machine Learning ###
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import QuantileTransformer
from sklearn.feature_selection import VarianceThreshold
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

### Make prettier the prints ###
from colorama import Fore
c_ = Fore.CYAN
m_ = Fore.MAGENTA
r_ = Fore.RED
b_ = Fore.BLUE
y_ = Fore.YELLOW
g_ = Fore.GREEN

In [103]:
from sklearn.tree import DecisionTreeClassifier 
from sklearn.metrics import accuracy_score, f1_score
from sklearn.ensemble import GradientBoostingClassifier

In [104]:
seed = 42

def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
set_seed(seed)

In [117]:
scale = "none"
variance_threshould = 0.0
decompo = "no" #"PCA"
ncompo = 11

In [118]:
def get_dummies(df,dum_cols,val = 0):
    if val == 0:
        return df
    return pd.get_dummies(df, prefix=None, prefix_sep='_', dummy_na=False, columns=dum_cols)

def normalize(df):
    return (df - df.mean(0) )/df.std(0)

In [119]:
train_df = train_6.copy()
test_df  = test_6.copy()

In [120]:
train_features = train_df.drop(['CustomerAttrition','ID'],axis=1)
train_targets_scored = train_df['CustomerAttrition']

test_features = test_df.drop(['ID'],axis=1)
sample_submission = pd.read_csv('Sample Submission.csv')

concat = pd.concat([train_features,test_features],axis=0)

create_dummy = 0                #0 if dont want dummies
dum_cols = ['age', 'experience', 'married', 'house_ownership',
       'car_ownership', 'profession', 'city', 'state', 'current_job_years',
       'current_house_years']

concat = get_dummies(concat,dum_cols,create_dummy)
# norm_concat = normalize(concat)

train_features = concat[:len(train_features)]
test_features  = concat[len(train_features):]

In [121]:
data_all = pd.concat([train_features, test_features], ignore_index = True)
cols_numeric = [feat for feat in list(data_all.columns)]
mask = (data_all[cols_numeric].var() >= variance_threshould).values
data_all = data_all[cols_numeric].loc[:, mask]
cols_numeric = [feat for feat in list(data_all.columns)]
data_all

Unnamed: 0,sex,Aged,Married,TotalDependents,ServiceSpan,MobileService,4GService,CyberProtection,HardwareSupport,TechnicalAssistance,FilmSubscription,QuarterlyPayment,GrandPayment,SettlementProcess_Bank,SettlementProcess_Card,SettlementProcess_Check,SettlementProcess_Electronic,G/Q,Services_add,SAMT
0,0,0,0,1,48,1,2,0,1,1,0,96.259802,4628.129119,0,0,1,0,48.079562,5,1
1,1,0,1,0,45,1,2,0,0,0,1,18.876470,922.085998,1,0,0,0,48.848434,4,0
2,1,1,1,0,59,1,2,1,0,1,0,106.016057,6205.891129,0,1,0,0,58.537275,5,0
3,1,0,0,0,14,1,2,0,0,0,0,80.108839,1248.655391,0,0,1,0,15.586987,3,1
4,0,0,1,1,25,1,2,0,1,0,1,51.529773,1285.321277,0,0,1,0,24.943274,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7037,0,0,1,0,67,1,1,0,1,1,0,109.033026,7345.910050,0,1,0,0,67.373257,4,0
7038,1,0,0,0,2,1,2,0,0,0,0,71.468072,130.565363,0,1,0,0,1.826905,3,1
7039,0,0,1,0,22,1,2,0,1,0,1,102.930468,2392.842382,0,0,0,1,23.247173,5,0
7040,0,0,0,0,4,1,2,0,0,0,0,51.862869,263.231362,0,0,0,1,5.075526,3,0


In [122]:
def scale_minmax(col):
    return (col - col.min()) / (col.max() - col.min())

def scale_norm(col):
    return (col - col.mean()) / col.std()

if scale == "boxcox":
    print(b_, "boxcox")
    data_all[cols_numeric] = data_all[cols_numeric].apply(scale_minmax, axis = 0)
    trans = []
    for feat in cols_numeric:
        trans_var, lambda_var = stats.boxcox(data_all[feat].dropna() + 1)
        trans.append(scale_minmax(trans_var))
    data_all[cols_numeric] = np.asarray(trans).T
    
elif scale == "norm":
    print(b_, "norm")
    data_all[cols_numeric] = data_all[cols_numeric].apply(scale_norm, axis = 0)
    
elif scale == "minmax":
    print(b_, "minmax")
    data_all[cols_numeric] = data_all[cols_numeric].apply(scale_minmax, axis = 0)
    
elif scale == "rankgauss":
    ### Rank Gauss ###
    print(b_, "Rank Gauss")
    scaler = GaussRankScaler()
    data_all[cols_numeric] = scaler.fit_transform(data_all[cols_numeric])
    
else:
    pass

In [123]:
# PCA
if decompo == "PCA":
    print(b_, "PCA")
    
    pca_genes = PCA(n_components = ncompo,
                    random_state = seed).fit_transform(data_all)
    
    pca_genes = pd.DataFrame(pca_genes, columns = [f"pca_g-{i}" for i in range(ncompo)])
    data_all = pd.concat((data_all, pca_genes), axis=1)
else:
    pass

In [124]:
train_df = data_all[: train_df.shape[0]]
train_df = pd.concat([train_df,train_targets_scored],axis=1)
train_df.reset_index(drop = True, inplace = True)
test_df = data_all[train_df.shape[0]: ]
test_df.reset_index(drop = True, inplace = True)

print(f"{b_}train_df.shape: {r_}{train_df.shape}")
print(f"{b_}test_df.shape: {r_}{test_df.shape}")

X_test = test_df.values
print(f"{b_}X_test.shape: {r_}{X_test.shape}")

[34mtrain_df.shape: [31m(6337, 21)
[34mtest_df.shape: [31m(705, 20)
[34mX_test.shape: [31m(705, 20)


In [143]:
scores_auc_all = []
test_cv_preds = []

NB_SPLITS = 5

mskf = MultilabelStratifiedKFold(n_splits = NB_SPLITS, random_state = 0, shuffle = True)

oof_preds = []
oof_targets = []
scores = []
scores_auc = []

# for mskf
ms_tar = np.array(train_df)
targets = train_df.CustomerAttrition.values
train_it = train_df.drop(['CustomerAttrition'],axis=1)
#####

for fold_nb, (train_idx, val_idx) in enumerate(mskf.split(train_it, ms_tar)):
    print(b_,"FOLDS: ", r_, fold_nb + 1)
    print(g_, '*' * 60, c_)
    
    X_train, y_train = train_it.values[train_idx, :], np.array(targets).reshape(-1,1)[train_idx]
    X_val, y_val = train_it.values[val_idx, :], np.array(targets).reshape(-1,1)[val_idx]
    ### Model ###
    
    
#     model = DecisionTreeClassifier(max_depth = 4,random_state=42)
    model = GradientBoostingClassifier(learning_rate=0.1, min_samples_split=500,min_samples_leaf=50,max_depth=9,max_features='log2',subsample=0.9,random_state=42)

    model.fit(X_train,y_train)
    
    y_pred = model.predict(X_train)
    acc = accuracy_score(y_train, y_pred)
    f1  = f1_score(y_train,y_pred)
    print(f'Train acc: {acc} f1: {f1}')
    
    y_pred = model.predict(X_val)
    acc = accuracy_score(y_val, y_pred)
    f1  = f1_score(y_val,y_pred)
    print(f'Val acc: {acc} f1: {f1}')
    
    
    ### Save OOF for CV ###
    oof_preds.append(y_pred)
    oof_targets.append(y_val.reshape(-1,))
    scores.append(f1)
    
    ### Predict on test ###
    preds_test = model.predict_proba(X_test)[:,1:].reshape(-1)
    test_cv_preds.append(preds_test)

oof_preds_all = np.concatenate(oof_preds)
oof_targets_all = np.concatenate(oof_targets)
test_preds_all = np.stack(test_cv_preds)

[34m FOLDS:  [31m 1
[32m ************************************************************ [36m
Train acc: 0.8383858267716535 f1: 0.6557651991614256
Val acc: 0.7780429594272077 f1: 0.5342237061769616
[34m FOLDS:  [31m 2
[32m ************************************************************ [36m
Train acc: 0.8276679841897233 f1: 0.6351464435146443
Val acc: 0.8081440877055599 f1: 0.5797598627787307
[34m FOLDS:  [31m 3
[32m ************************************************************ [36m
Train acc: 0.8339257503949447 f1: 0.6407518154634771
Val acc: 0.7965435978004713 f1: 0.5647058823529412
[34m FOLDS:  [31m 4
[32m ************************************************************ [36m
Train acc: 0.8325782942682687 f1: 0.6395250212044106
Val acc: 0.7992063492063493 f1: 0.5537918871252205
[34m FOLDS:  [31m 5
[32m ************************************************************ [36m
Train acc: 0.8320505229919084 f1: 0.637718177948063
Val acc: 0.7913385826771654 f1: 0.5516074450084602


In [144]:
oof = accuracy_score(oof_targets_all,oof_preds_all)
print('OOF acc: %f' % oof)

sample_sub = pd.read_csv("Sample Submission.csv")

# Lets Take Weighted ensemble
base = np.zeros((len(sample_sub)))
for itr,prs in enumerate(test_preds_all):
    base += (scores[itr]/np.sum(scores))*prs

sample_sub['CustomerAttrition'] = 1*(base >= 0.5)

sample_sub.replace({1:'Yes', 0:'No'}, inplace=True)
sample_sub.to_csv("submission_try.csv",index=False)

OOF acc: 0.794698
