## Working on the Train data

### 1. import the packages

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier,RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.model_selection import cross_val_score, GridSearchCV
from xgboost import XGBClassifier 
import multiprocessing

### 2. Read the train data

In [2]:
# Load the training CSV and use the serial number as the row index.
# NOTE(review): absolute, machine-specific path — the notebook only runs where this file exists.
train_csv_path = 'C:/Users/Administrator/Desktop/Datathon_2/Yes_Bank_Training.csv'
data = pd.read_csv(train_csv_path).set_index('serial_number')
data.head()

Unnamed: 0_level_0,age_in_years,job_description,marital_status,education_details,has_default,balance_in_account,housing_status,previous_loan,phone_type,date,month_of_year,call_duration,campaign_contacts,days_passed,previous_contact,poutcome_of_campaign,outcome
serial_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
2,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
3,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
4,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
5,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


#### The test data contains a `month_of_year` value, 'sep', that is missing from the train data, so the 'sep' rows have to be added to the train data. Now the process continues.

In [6]:
# data=data.append(data_to_train,ignore_index=True) #run only once

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [3]:
# Preview the first rows of the training frame.
data.head()

Unnamed: 0_level_0,age_in_years,job_description,marital_status,education_details,has_default,balance_in_account,housing_status,previous_loan,phone_type,date,month_of_year,call_duration,campaign_contacts,days_passed,previous_contact,poutcome_of_campaign,outcome
serial_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
2,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
3,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
4,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
5,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [4]:
# Count the rows in each target class to see how balanced the 'outcome' labels are.
data.outcome.value_counts()

no     30388
yes     1840
Name: outcome, dtype: int64

In [4]:
# One-hot encode the predictors only; the target column is excluded so that
# it is not expanded into dummy columns.
predictors = data.drop(columns='outcome')
data_dummies = pd.get_dummies(predictors)
data_dummies.shape

(32228, 51)

In [5]:
# Build the modelling frame as a new object: the encoded predictors plus the target.
# pd.DataFrame(data_dummies) does not copy its input by default, so adding
# 'outcome' through that wrapper can also alter data_dummies and make this cell
# behave differently when re-run. assign() returns a fresh frame with the extra
# column appended last and leaves data_dummies untouched.
data_train=data_dummies.assign(outcome=data['outcome'])
data_train.shape

(32228, 52)

In [None]:
# data_train.dtypes
# print(data_train.columns)
# #1,2,3,8,10,15
# print('this columns have more than 2 values ',list(['job_description','marital_status','education_details',
#                                                   'phone_type','month_of_year','poutcome_of_campaign'])) # to get the columns of the categorical variables

In [None]:
# columns=['job_description','marital_status','education_details','has_default','housing_status','previous_loan',
#          'phone_type','month_of_year','poutcome_of_campaign']
# for column in columns:
#     data_train[column]=data_train[column].astype('category')
# data_train.dtypes

### 3. Apply label encoder

In [None]:
# columns=['job_description','marital_status','education_details','has_default','housing_status','previous_loan',
#          'phone_type','month_of_year','poutcome_of_campaign']
# le=LabelEncoder()
# for column in columns:
#     data_train[column]=le.fit_transform(data_train[column])
# data_train.head()

In [7]:
# List the dtype of every column in the encoded training frame.
data_train.dtypes

age_in_years                      int64
balance_in_account                int64
date                              int64
call_duration                     int64
campaign_contacts                 int64
days_passed                       int64
previous_contact                  int64
job_description_admin.            uint8
job_description_blue-collar       uint8
job_description_entrepreneur      uint8
job_description_housemaid         uint8
job_description_management        uint8
job_description_retired           uint8
job_description_self-employed     uint8
job_description_services          uint8
job_description_student           uint8
job_description_technician        uint8
job_description_unemployed        uint8
job_description_unknown           uint8
marital_status_divorced           uint8
marital_status_married            uint8
marital_status_single             uint8
education_details_primary         uint8
education_details_secondary       uint8
education_details_tertiary        uint8


### 4. Split the train data into train and validate sets

In [6]:
# Separate predictors from the target, then hold out 30% of the rows for
# validation. The fixed random_state keeps the split repeatable.
X = data_train.drop(columns='outcome')
Y = data_train['outcome']
train_x, validate_x, train_y, validate_y = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=100,
)

In [7]:
# Preview the first rows of the training split.
train_x.head()

Unnamed: 0_level_0,age_in_years,balance_in_account,date,call_duration,campaign_contacts,days_passed,previous_contact,job_description_admin.,job_description_blue-collar,job_description_entrepreneur,...,month_of_year_jun,month_of_year_mar,month_of_year_may,month_of_year_nov,month_of_year_oct,month_of_year_sep,poutcome_of_campaign_failure,poutcome_of_campaign_other,poutcome_of_campaign_success,poutcome_of_campaign_unknown
serial_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12739,45,1805,7,758,1,-1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
24195,41,7028,17,98,1,-1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
3951,40,192,16,71,11,-1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
1771,28,229,9,322,1,-1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
27279,33,1817,21,55,1,-1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1


### 5.Applying one hot encoder

In [None]:
# import warnings
# warnings.filterwarnings('ignore')
# hot=OneHotEncoder(categorical_features=[6,9,10,11,12,13])
# train_x=hot.fit_transform(train_x)
# validate_x=hot.fit_transform(validate_x)

In [10]:
# (rows, columns) of the training split.
train_x.shape

(22559, 51)

In [11]:
# (rows, columns) of the validation split.
validate_x.shape

(9669, 51)

### 6. Create a function which prints the accuracy of different models

In [8]:
def get_accuracy(model,x1=train_x,y1=train_y,x2=validate_x,y2=validate_y):
    """Fit a default-configured estimator and report its accuracy on held-out data.

    Parameters
    ----------
    model : class
        Estimator class (not an instance). It is called with no arguments to
        build the model, so only default hyperparameters are evaluated.
    x1, y1 : training features and labels.
    x2, y2 : evaluation features and labels.
        The defaults for all four are bound once, when this ``def`` runs, to the
        objects that ``train_x``/``train_y``/``validate_x``/``validate_y`` refer to
        at that moment; rebinding those names later does not change the defaults.

    Returns
    -------
    The value returned by ``accuracy_score(y2, pred)``; it is also printed.
    """
    import warnings

    model_name=model
    estimator=model()
    # Silence library warnings only while fitting and predicting. A bare
    # warnings.filterwarnings('ignore') here would stay active for the rest of
    # the kernel session and hide warnings raised by unrelated cells.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        estimator.fit(x1,y1)
        pred=estimator.predict(x2)
    # accuracy_score expects (y_true, y_pred).
    acc=accuracy_score(y2,pred)
    print('accuracy for ',model_name,' is: ',acc)
    return acc

In [9]:
# Score each candidate classifier with its default settings on the validation split.
for candidate in (
    DecisionTreeClassifier,
    RandomForestClassifier,
    AdaBoostClassifier,
    KNeighborsClassifier,
    XGBClassifier,
):
    get_accuracy(candidate)

accuracy for  <class 'sklearn.tree.tree.DecisionTreeClassifier'>  is:  0.9289481849208812
accuracy for  <class 'sklearn.ensemble.forest.RandomForestClassifier'>  is:  0.9445651049746613
accuracy for  <class 'sklearn.ensemble.weight_boosting.AdaBoostClassifier'>  is:  0.9428069086772158
accuracy for  <class 'sklearn.neighbors.classification.KNeighborsClassifier'>  is:  0.9403247491984693
accuracy for  <class 'xgboost.sklearn.XGBClassifier'>  is:  0.9500465404902265


In [14]:
# (rows, columns) of the full encoded training frame, target column included.
data_train.shape

(32228, 52)

In [10]:
# From here on train_x / train_y refer to the FULL training data rather than the
# 70% split created earlier: all columns except the target, and the target itself.
train_y = data_train['outcome']
train_x = data_train.drop(columns='outcome')

In [20]:
# Hyperparameter values to try in the XGBoost grid search (4 * 3 * 3 * 3 combinations).
param_grid = dict(
    learning_rate=[0.04, 0.05, 0.06, 0.07],
    n_estimators=[300, 400, 500],
    max_depth=[2, 3, 4],
    min_child_weight=[2, 3, 4],
)

In [21]:
# Exhaustive 3-fold grid search over param_grid, ranking candidates by ROC AUC,
# then report the best combination found.
model_xg = XGBClassifier(n_jobs=4)
gscv = GridSearchCV(
    estimator=model_xg,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=3,
)
gscv.fit(train_x, train_y)
print('for xgboost ',gscv.best_params_)

for xgboost  {'learning_rate': 0.04, 'max_depth': 2, 'min_child_weight': 2, 'n_estimators': 300}


#### Applying the above models with default settings, the best validation accuracy is obtained by XGBoost (0.9500), with Random Forest second (0.9446)

## Working on the Test Data

In [22]:
# Load the test CSV and use the serial number as the row index.
# NOTE(review): absolute, machine-specific path — the notebook only runs where this file exists.
test_csv_path = 'C:/Users/Administrator/Desktop/Datathon_2/Yes_Bank_Test.csv'
data_test = pd.read_csv(test_csv_path).set_index('serial_number')
# data_to_train=data_test.query('month_of_year=="sep"')
data_test.head()

Unnamed: 0_level_0,age_in_years,job_description,marital_status,education_details,has_default,balance_in_account,housing_status,previous_loan,phone_type,date,month_of_year,call_duration,campaign_contacts,days_passed,previous_contact,poutcome_of_campaign
serial_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,33,admin.,married,secondary,no,796,yes,no,telephone,6,apr,216,1,-1,0,unknown
2,40,management,married,secondary,no,754,no,yes,cellular,6,apr,635,1,-1,0,unknown
3,30,technician,single,secondary,no,10152,yes,no,cellular,6,apr,1108,2,270,1,failure
4,44,blue-collar,married,secondary,no,545,yes,no,cellular,6,apr,122,3,136,1,failure
5,35,technician,married,secondary,no,5553,yes,no,cellular,6,apr,371,3,136,7,failure


In [23]:
# One-hot encode the test predictors, then force exactly the column set and
# column order of train_x, the frame the final model is fitted on.
# get_dummies only creates columns for categories present in the frame it is
# given, so encoding train and test separately does not guarantee matching
# features. Categories that appear only in train become all-zero columns here;
# categories that appear only in test are dropped, since the model has no
# feature for them.
data_test=pd.get_dummies(data_test).reindex(columns=train_x.columns,fill_value=0)
data_test.shape

(13562, 51)

#### Since the training data lacks one category in the `month_of_year` column ('sep') that is present in the test data, the 'sep' rows need to be fed into the training data so that the model sees the same features and the results can be precise

In [None]:
# columns=['job_description','marital_status','education_details','has_default','housing_status','previous_loan',
#          'phone_type','month_of_year','poutcome_of_campaign']
# le=LabelEncoder()
# for column in columns:
#     data_test[column]=le.fit_transform(data_test[column])
# data_test.head()

In [None]:
# print(data_test.columns)
# print(data_train.month_of_year.unique())
# data_test.month_of_year.unique()

In [None]:
# import warnings
# warnings.filterwarnings('ignore')
# hot=OneHotEncoder(categorical_features=[1,2,3,8,10,15])
# data_test=hot.fit_transform(data_test).toarray()

## Testing on TestDataset

In [51]:
# Fit the submission model on the full training data and label every test row.
# NOTE(review): learning_rate=0.05, n_estimators=1000 and gamma=0.001 are not the
# values printed by the grid search (learning_rate 0.04, n_estimators 300) —
# confirm that this manual override is intentional.
# n_jobs replaces the deprecated nthread argument and matches the grid-search cell.
model_xg=XGBClassifier(learning_rate=0.05,max_depth=2,n_estimators=1000,min_child_weight=2,n_jobs=4,gamma=0.001)
model_xg.fit(train_x,train_y)
pred=model_xg.predict(data_test)
# Pin the column order explicitly so the submission layout does not depend on
# how the dict is ordered.
results=pd.DataFrame({'serial_number':data_test.index,'outcome':pred},columns=['serial_number','outcome'])

In [52]:
# Preview the first predictions before writing them out.
results.head()

Unnamed: 0,serial_number,outcome
0,1,no
1,2,no
2,3,no
3,4,no
4,5,no


In [53]:
# Save the predictions to task1.csv; index=False leaves the row index out of the file.
results.to_csv('task1.csv',index=False)