In [1]:
import pandas as pd
import numpy as np
import datetime as dt
from tqdm.notebook import tqdm_notebook
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
from catboost import CatBoostClassifier
import xgboost as xgb

In [2]:
train = pd.read_csv('TrainMobi.csv')
test = pd.read_csv('TestMobi.csv')
sub = pd.read_csv('SampleSubmission.csv')

In [3]:
train.head()

Unnamed: 0,ID,country_code,region,age,FQ1,FQ2,FQ3,FQ4,FQ5,FQ6,...,FQ27,FQ28,FQ29,FQ30,FQ31,FQ32,FQ33,FQ34,FQ37,Target
0,ID_000J8GTZ,1,6,35.0,2,,,2,,,...,,,1.0,,,,1.0,1.0,0,0
1,ID_000QLXZM,32,7,70.0,2,,,2,,,...,,,2.0,,,,1.0,2.0,0,0
2,ID_001728I2,71,7,22.0,2,1.0,,2,,,...,,,2.0,,,,2.0,1.0,1,0
3,ID_001R7IDN,48,3,27.0,1,,,2,,2.0,...,,,,,,2.0,1.0,1.0,1,0
4,ID_0029QKF8,25,0,79.0,2,,,2,,,...,,,2.0,,,2.0,1.0,1.0,1,0


In [4]:
test.shape

(46477, 41)

In [None]:
train.isnull().sum()/len(train) *100

In [6]:
train = train.drop(['FQ5','FQ17','FQ24','FQ35','FQ36','FQ27','FQ28','FQ29','FQ30','FQ31'], axis=1)
test = test.drop(['FQ5','FQ17','FQ24','FQ35','FQ36','FQ27','FQ28','FQ29','FQ30','FQ31'], axis=1)

In [None]:
train.isnull().sum()

In [None]:
test.isnull().sum()

In [None]:
train.isnull().sum()/len(train) *100

In [10]:
ntrain = train.shape[0] # to be used to split train and test set from the combined dataframe

full_data = pd.concat((train, test)).reset_index(drop=True)
print(f'The shape of the combined dataframe is: {full_data.shape}')

The shape of the combined dataframe is: (154923, 32)


In [None]:
full_data.isnull().sum()


In [12]:
def ProcessDataForModelling(data):
        #Fill up missing non-numeric data with categorical data
        for label, content in data.items():
            if not pd.api.types.is_float_dtype(content):
                if pd.isnull(content).sum():
                    data[col] = data[col].fillna(data[col].mode()[0])

        #Fill up missing numeric data with median
        for label, content in data.items():
            if pd.api.types.is_float_dtype(content):
                if pd.isnull(content).sum():
                    data[label] = content.fillna(content.median())

In [13]:
ProcessDataForModelling(full_data)

In [14]:
# Separate train and test data from the combined dataframe
train_df = full_data[:ntrain]
test_df = full_data[ntrain:]


# Check the shapes of the split dataset
train_df.shape, test_df.shape

((108446, 32), (46477, 32))

In [15]:
main_cols = train.columns.difference(['ID', 'Target'])
X = train_df[main_cols]
y = train_df.Target

In [16]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.27, random_state=42)

In [17]:
X_train.shape , X_test.shape , y_train.shape , y_test.shape

((79165, 30), (29281, 30), (79165,), (29281,))

In [18]:
data_dmatrix = xgb.DMatrix(data=X_train,label=y_train)

In [19]:
MobiBankModel = xgb.XGBClassifier(learning_rate=0.3,
                                 n_estimators=1350,
                                 max_depth=14,
                                 min_child_weight=5,
                                 gamma=0,
                                 subsample=0.9,
                                 colsample_bytree=0.9,
                                 objective= 'binary:logistic',
                                 nthread=4,
                                 scale_pos_weight=2,
                                 seed=42)

In [None]:
MobiBankModel = MobiBankModel.fit(X, y)

In [21]:
X_TestPreds = MobiBankModel.predict_proba(X_test)

In [22]:
from sklearn.metrics import accuracy_score
score = accuracy_score(y_test, X_TestPreds )
print(score*100)

100.0


In [334]:
#Measure the F1 Scores
from sklearn.metrics import f1_score
F1_macro = f1_score(y_test, X_TestPreds, average='macro')
F1_micro= f1_score(y_test, X_TestPreds, average='micro')
F1_weighted = f1_score(y_test, X_TestPreds, average='weighted')
F1_None = f1_score(y_test, X_TestPreds, average=None)

In [335]:
print(F1_macro*100)
print(F1_micro*100)
print(F1_weighted*100)
print(F1_None*100)

100.0
100.0
100.0
[100. 100.]


In [336]:
from sklearn.metrics import recall_score
# calculate recall
recall = recall_score(y_test, X_TestPreds, average='binary')
print('Recall: %.3f' % recall)

Recall: 1.000


In [337]:
from sklearn.metrics import precision_score
# calculate prediction
precision = precision_score(y_test, X_TestPreds, average='binary')
print('Precision: %.3f' % precision)

Precision: 1.000


In [338]:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, recall_score

In [339]:
pd.DataFrame(confusion_matrix(y_test, X_TestPreds))

Unnamed: 0,0,1
0,21039,0
1,0,8242


In [340]:
test_df = test_df[main_cols]

In [341]:
#test_df = test_df.loc[:, test_df.columns != ['ID','Target']]
#df.loc[:, df.columns != 'b']

In [342]:
test_df.shape,test.shape

((46477, 30), (46477, 31))

In [343]:
MobiBankPreds = MobiBankModel.predict(test_df)


In [344]:
test.shape

(46477, 31)

In [345]:
len(MobiBankPreds)

46477

In [346]:
MobiBankPreds = pd.DataFrame({'ID':test['ID'],'target': MobiBankPreds})

In [347]:
MobiBankPreds.to_csv("MobiBankPreds13.csv", index=False)