In [26]:
# import libraries
import numpy as np
import pandas as pd

In [12]:
# load train data
dataset = pd.read_csv('dataset/Train_v2.csv')
dataset.head()

Unnamed: 0,country,year,uniqueid,bank_account,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,Kenya,2018,uniqueid_1,Yes,Rural,Yes,3,24,Female,Spouse,Married/Living together,Secondary education,Self employed
1,Kenya,2018,uniqueid_2,No,Rural,No,5,70,Female,Head of Household,Widowed,No formal education,Government Dependent
2,Kenya,2018,uniqueid_3,Yes,Urban,Yes,5,26,Male,Other relative,Single/Never Married,Vocational/Specialised training,Self employed
3,Kenya,2018,uniqueid_4,No,Rural,Yes,5,34,Female,Head of Household,Married/Living together,Primary education,Formally employed Private
4,Kenya,2018,uniqueid_5,No,Urban,No,8,26,Male,Child,Single/Never Married,Primary education,Informally employed


In [13]:
# show information about data
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23524 entries, 0 to 23523
Data columns (total 13 columns):
country                   23524 non-null object
year                      23524 non-null int64
uniqueid                  23524 non-null object
bank_account              23524 non-null object
location_type             23524 non-null object
cellphone_access          23524 non-null object
household_size            23524 non-null int64
age_of_respondent         23524 non-null int64
gender_of_respondent      23524 non-null object
relationship_with_head    23524 non-null object
marital_status            23524 non-null object
education_level           23524 non-null object
job_type                  23524 non-null object
dtypes: int64(3), object(10)
memory usage: 2.3+ MB


In [14]:
# describe data (abstract look)
dataset.describe()

Unnamed: 0,year,household_size,age_of_respondent
count,23524.0,23524.0,23524.0
mean,2016.975939,3.797483,38.80522
std,0.847371,2.227613,16.520569
min,2016.0,1.0,16.0
25%,2016.0,2.0,26.0
50%,2017.0,3.0,35.0
75%,2018.0,5.0,49.0
max,2018.0,21.0,100.0


In [15]:
dataset.shape

(23524, 13)

In [16]:
# check if the is a null value in examples (rows)
dataset.isnull().sum()

country                   0
year                      0
uniqueid                  0
bank_account              0
location_type             0
cellphone_access          0
household_size            0
age_of_respondent         0
gender_of_respondent      0
relationship_with_head    0
marital_status            0
education_level           0
job_type                  0
dtype: int64

In [17]:
# show columns name
dataset.columns

Index(['country', 'year', 'uniqueid', 'bank_account', 'location_type',
       'cellphone_access', 'household_size', 'age_of_respondent',
       'gender_of_respondent', 'relationship_with_head', 'marital_status',
       'education_level', 'job_type'],
      dtype='object')

In [18]:
# overview dataset
for i in dataset.columns:
    print(i, ":", dataset[i].unique())

country : ['Kenya' 'Rwanda' 'Tanzania' 'Uganda']
year : [2018 2016 2017]
uniqueid : ['uniqueid_1' 'uniqueid_2' 'uniqueid_3' ... 'uniqueid_8757'
 'uniqueid_8758' 'uniqueid_8759']
bank_account : ['Yes' 'No']
location_type : ['Rural' 'Urban']
cellphone_access : ['Yes' 'No']
household_size : [ 3  5  8  7  1  6  4 10  2 11  9 12 16 15 13 14 21 18 17 20]
age_of_respondent : [ 24  70  26  34  32  42  54  76  40  69  64  31  38  47  27  48  25  21
  18  22  58  55  62  29  35  45  67  19  80  66  50  33  28  51  16  17
  30  37  59  65  46  56  52  23  43  49  44  72  53  63  39  81  78  36
  20  60  95  71  57  85  68  41  61  75  86  73  93  74  88  90  77  84
  82  89  79  83  94  87  92  91  98  97  96  99 100]
gender_of_respondent : ['Female' 'Male']
relationship_with_head : ['Spouse' 'Head of Household' 'Other relative' 'Child' 'Parent'
 'Other non-relatives']
marital_status : ['Married/Living together' 'Widowed' 'Single/Never Married'
 'Divorced/Seperated' 'Dont know']
education_level :

In [19]:
# drop "uniqueid" label
dataset = dataset.drop(['uniqueid'] , axis = 1)

In [31]:
y = dataset['bank_account'].values
X = dataset.drop(['bank_account'], axis = 1)

## Data Preprocessing

In [32]:
# transform boolean value [Tes, No] to binary value [0, 1]
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
y = encoder.fit_transform(y)

In [33]:
# categorize
X = pd.get_dummies(X)
X.head()

Unnamed: 0,year,household_size,age_of_respondent,country_Kenya,country_Rwanda,country_Tanzania,country_Uganda,location_type_Rural,location_type_Urban,cellphone_access_No,...,job_type_Dont Know/Refuse to answer,job_type_Farming and Fishing,job_type_Formally employed Government,job_type_Formally employed Private,job_type_Government Dependent,job_type_Informally employed,job_type_No Income,job_type_Other Income,job_type_Remittance Dependent,job_type_Self employed
0,2018,3,24,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
1,2018,5,70,1,0,0,0,1,0,1,...,0,0,0,0,1,0,0,0,0,0
2,2018,5,26,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
3,2018,5,34,1,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
4,2018,8,26,1,0,0,0,0,1,1,...,0,0,0,0,0,1,0,0,0,0


In [34]:
# standarise the dataset
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)

## Training Part

In [35]:
# split dataset to train and test
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 19)

In [36]:
# evaluate the model (using custom loss func)
from sklearn import metrics
def evaluate(clf):
    y_pred = clf.predict(x_test)
    print('Score of {} are: Accuracy = {}'.format(type(clf).__name__, metrics.accuracy_score(y_test, y_pred)))

In [37]:
# model 1 (SVM)
from sklearn.svm import SVC
svm_svc = SVC(kernel = 'rbf')
svm_svc.fit(x_train,y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [38]:
evaluate(svm_svc)

Score of SVC are: Accuracy = 0.8899121564182488


In [39]:
# model 2 (KNN)
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(5)
knn.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [40]:
evaluate(knn)

Score of KNeighborsClassifier are: Accuracy = 0.88169453102862


In [41]:
# model 3
from sklearn.ensemble import RandomForestClassifier
random_forest = RandomForestClassifier()
random_forest.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [42]:
evaluate(random_forest)

Score of RandomForestClassifier are: Accuracy = 0.8676678945877019


In [43]:
from sklearn.tree import DecisionTreeClassifier
desicion_tree = DecisionTreeClassifier()
desicion_tree.fit(x_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [44]:
evaluate(desicion_tree)

Score of DecisionTreeClassifier are: Accuracy = 0.8362142249929159


In [45]:
from sklearn.naive_bayes import GaussianNB
bn_gaussian = GaussianNB()
bn_gaussian.fit(x_train, y_train)

GaussianNB(priors=None)

In [46]:
evaluate(bn_gaussian)

Score of GaussianNB are: Accuracy = 0.8403230376877302


## Testing Part

In [47]:
# load test dataset file
test_dataset = pd.read_csv("dataset/Test_v2.csv")
test_dataset.head()

Unnamed: 0,country,year,uniqueid,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,Kenya,2018,uniqueid_6056,Urban,Yes,3,30,Male,Head of Household,Married/Living together,Secondary education,Formally employed Government
1,Kenya,2018,uniqueid_6060,Urban,Yes,7,51,Male,Head of Household,Married/Living together,Vocational/Specialised training,Formally employed Private
2,Kenya,2018,uniqueid_6065,Rural,No,3,77,Female,Parent,Married/Living together,No formal education,Remittance Dependent
3,Kenya,2018,uniqueid_6072,Rural,No,6,39,Female,Head of Household,Married/Living together,Primary education,Remittance Dependent
4,Kenya,2018,uniqueid_6073,Urban,No,3,16,Male,Child,Single/Never Married,Secondary education,Remittance Dependent


In [48]:
# get IDs & countries for output result
uniqueids = test_dataset.uniqueid
countries = test_dataset.country

In [49]:
# drop unique id column
test_dataset = test_dataset.drop(['uniqueid'], axis = 1)

In [50]:
test_dataset = pd.get_dummies(test_dataset)
test_dataset.head()

Unnamed: 0,year,household_size,age_of_respondent,country_Kenya,country_Rwanda,country_Tanzania,country_Uganda,location_type_Rural,location_type_Urban,cellphone_access_No,...,job_type_Dont Know/Refuse to answer,job_type_Farming and Fishing,job_type_Formally employed Government,job_type_Formally employed Private,job_type_Government Dependent,job_type_Informally employed,job_type_No Income,job_type_Other Income,job_type_Remittance Dependent,job_type_Self employed
0,2018,3,30,1,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
1,2018,7,51,1,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
2,2018,3,77,1,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,1,0
3,2018,6,39,1,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,1,0
4,2018,3,16,1,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,1,0


In [51]:
# standarise the test dataset
test_dataset = scaler.fit_transform(test_dataset)

In [52]:
prediction = svm_svc.predict(test_dataset)

In [53]:
# result columns ()
new_unique_id = uniqueids + " x " + countries
result_columns_list = list(zip(new_unique_id, prediction))

In [54]:
result = pd.DataFrame(result_columns_list, columns = ['unique_id', 'bank_account'])

In [55]:
result.describe()

Unnamed: 0,bank_account
count,10086.0
mean,0.059687
std,0.236917
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


In [56]:
# export result
result.to_csv('SubmissionFile.csv', index=False)