In [3]:
import numpy as np
import pandas as pd
import scipy
import random
import time
import os

# Part 1

In [2]:
raw_train_data = pd.read_csv("bank-additional-train.csv",delimiter=",", encoding="utf-8-sig")
raw_train_data.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,32,management,married,professional.course,unknown,no,no,cellular,jul,mon,...,4,999,0,nonexistent,1.4,93.918,-42.7,4.962,5228.1,no
1,41,admin.,married,high.school,no,yes,yes,cellular,apr,mon,...,1,999,0,nonexistent,-1.8,93.075,-47.1,1.405,5099.1,no
2,32,admin.,married,university.degree,no,yes,no,cellular,may,mon,...,2,999,0,nonexistent,-1.8,92.893,-46.2,1.299,5099.1,no
3,37,admin.,married,high.school,no,unknown,unknown,cellular,jul,thu,...,3,999,0,nonexistent,1.4,93.918,-42.7,4.968,5228.1,no
4,38,admin.,divorced,university.degree,no,no,no,cellular,jul,tue,...,8,999,0,nonexistent,1.4,93.918,-42.7,4.961,5228.1,no


In [3]:
#See the details in different attributes 
for key in raw_train_data.keys():
    print(raw_train_data[key].value_counts().head(10))

31    1744
32    1663
33    1629
36    1605
35    1570
34    1558
30    1550
37    1335
29    1301
39    1285
Name: age, dtype: int64
admin.           9380
blue-collar      8331
technician       6075
services         3553
management       2646
retired          1536
entrepreneur     1325
self-employed    1276
housemaid         967
unemployed        906
Name: job, dtype: int64
married     22426
single      10410
divorced     4159
unknown        74
Name: marital, dtype: int64
university.degree      10931
high.school             8555
basic.9y                5449
professional.course     4709
basic.4y                3790
basic.6y                2060
unknown                 1559
illiterate                16
Name: education, dtype: int64
no         29301
unknown     7767
yes            1
Name: default, dtype: int64
yes        19447
no         16753
unknown      869
Name: housing, dtype: int64
no         30595
yes         5605
unknown      869
Name: loan, dtype: int64
cellular     23506
telepho

In [6]:
missinglistName = []#For later using
for key in raw_train_data.keys():
    missing=0
    keyword=''
    value = raw_train_data[key].value_counts()
    if 'unknown' in list(value.keys()):
        missing = value['unknown']
        proportion = missing / len(raw_train_data[key])
        missinglistName.append(key)
        print("The number of missing data in %s is %d and the proportion is %f with the keyword unknown" %(key,missing,proportion))

The number of missing data in job is 296 and the proportion is 0.007985 with the keyword unknown
The number of missing data in marital is 74 and the proportion is 0.001996 with the keyword unknown
The number of missing data in education is 1559 and the proportion is 0.042057 with the keyword unknown
The number of missing data in default is 7767 and the proportion is 0.209528 with the keyword unknown
The number of missing data in housing is 869 and the proportion is 0.023443 with the keyword unknown
The number of missing data in loan is 869 and the proportion is 0.023443 with the keyword unknown


In [23]:
missinglistName

['job', 'marital', 'education', 'default', 'housing', 'loan']

In [10]:
#These percentage of missing data in these columns less that 1%, then we will fill them up later

In [7]:
#Translate non-numerical data to numerical data
def trans_non_numerical_data(df):
    columns = df.columns.values
    for column in columns:
        text_digit_vals ={}
        def convert_to_int(val):
            return text_digit_vals[val]
        
        if df[column].dtype!= np.int64 and df[column].dtype!= np.float64:
            column_contents = df[column].values.tolist()
            unique_elements = set(column_contents)
            x = 1
            for unique in unique_elements:
                if unique not in text_digit_vals:
                    if unique != 'unknown':
                        text_digit_vals[unique] = x
                        x+=1
                    else:
                        text_digit_vals[unique] = 'unknown'
            df[column] = list(map(convert_to_int,df[column]))
            
    return df

In [8]:
raw_train_data = trans_non_numerical_data(raw_train_data)

In [9]:
for key in raw_train_data.keys():
    print(raw_train_data[key].value_counts().head(10))

31    1744
32    1663
33    1629
36    1605
35    1570
34    1558
30    1550
37    1335
29    1301
39    1285
Name: age, dtype: int64
2     9380
8     8331
7     6075
6     3553
5     2646
11    1536
10    1325
4     1276
9      967
1      906
Name: job, dtype: int64
1          22426
3          10410
2           4159
unknown       74
Name: marital, dtype: int64
3          10931
5           8555
2           5449
7           4709
4           3790
1           2060
unknown     1559
6             16
Name: education, dtype: int64
1          29301
unknown     7767
2              1
Name: default, dtype: int64
2          19447
1          16753
unknown      869
Name: housing, dtype: int64
1          30595
2           5605
unknown      869
Name: loan, dtype: int64
1    23506
2    13563
Name: contact, dtype: int64
6     12351
2      6474
3      5576
7      4787
4      3701
5      2379
8       632
10      509
1       499
9       161
Name: month, dtype: int64
2    7757
5    7673
4    7340
3    7269


In [None]:
# Now we need to deal with the missing data, we use K-Nearest Neighbors (KNN) Imputation.

In [10]:
def dropMissing(dataset,missinglistName):
    #Find the missing line index and put them in to a list
    missingLine = dataset[(dataset['job']=='unknown')|(dataset['marital']=='unknown')|\
                          (dataset['education']=='unknown')|(dataset['default']=='unknown')|\
                          (dataset['housing']=='unknown')|(dataset['loan']=='unknown')]
    missingIndex = list(missingLine.index)
    #Create a new database without three columns with missing data for distance calculation using
    datawithoutMissing = dataset.drop(missinglistName, axis=1)
    return missingIndex,datawithoutMissing   

In [11]:
missingIndex,datawithoutMissing = dropMissing(raw_train_data,missinglistName)

In [12]:
from scipy.spatial import distance_matrix
"""
We calculate  for the missing line with all other complete line in this function and then find the minima distance, 
Copy the value in complete line to missing data line (in the next function) to decreace error. 
"""
#missingIndex is all row number with missing data
def findMinEntropy(data,missingIndex):
    a = data.drop(missingIndex).reset_index()
    completeIndexlist =np.asarray(a.iloc[:,0])
    missingMatrix = np.asarray(data.loc[missingIndex]).astype(np.int64)
    bigMatrix = np.asarray(data.drop(missingIndex)).astype(np.int64) #np.asarray(data).astype(np.int64)
    #matrix = cosine_similarity(missingMatrix,bigMatrix.transpose())
    matrix = scipy.spatial.distance_matrix(missingMatrix,bigMatrix)
    indexList = []
    print("finish calculate the matrix")
    for i in range(matrix.shape[0]):
        index_min = np.argmin(matrix[i,:])
        indexList.append(completeIndexlist[index_min])
    return indexList

In [13]:
completeIndexList = findMinEntropy(datawithoutMissing,missingIndex)

finish calculate the matrix


In [14]:
"""
As last explanation said, we copy the value in completed line for the missing value in missng data line. 
"""
def changeValue(raw_train_data,missingColList,missingIndexList,completeIndexList):
    for i in range(len(missingIndexList)):
        missingLineIndex = missingIndexList[i]
        completeLineIndex = completeIndexList[i]
        for missingCol in missingColList:
            if raw_train_data.at[missingLineIndex,missingCol] == 'unknown':
                value = raw_train_data.at[completeLineIndex, missingCol]#Value is used to fill
                if value == 'unknown':
                    print('This position updata failure. The imformation for the completed Line index is:')
                    print('Line index is: '+ str(completeLineIndex)+ ' Column number is: ' + str(missingCol))
                raw_train_data.at[missingLineIndex, missingCol] = int(value)
    return raw_train_data

In [15]:
#get the clean data
raw_train_data =changeValue(raw_train_data,missinglistName,missingIndex,completeIndexList)

In [16]:
#We save this data for later using whcih means we don't need to do the data Preprocess again at next time.
raw_train_data.to_csv('clean_data.csv',sep=",", index = False, encoding="utf-8-sig")

# Part2

In [4]:
whole_data=pd.read_csv("clean_data.csv",delimiter=",", encoding="utf-8-sig")

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.cross_validation import KFold
from sklearn.model_selection import cross_validate
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.neural_network import MLPClassifier



In [8]:
#Split the data to training data, validition data and test data
X=whole_data.drop('y', axis=1)
y=whole_data['y']
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.6, random_state=520)

In [16]:
#X_train, X_valid, y_train, y_valid = train_test_split( X, y, test_size=0.33, random_state=521)

In [20]:
#SVC
#https://scikit-learn.org/stable/modules/svm.html

In [109]:
clf = SVC(gamma='auto')

In [110]:
model_svm=clf.fit(X_train, y_train) 

In [111]:
model_svm.score(X_test, y_test) 

0.8898480352486288

In [None]:
#neural network
#https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html#sklearn.neural_network.MLPClassifier

In [None]:
#logistic & 'lbfgs'

In [44]:
nn_log=MLPClassifier(activation = 'logistic',solver ='lbfgs', alpha =  0.1, hidden_layer_sizes=(5, 2), random_state=1)

In [45]:
model_nn_log=nn_log.fit(X_train, y_train)

In [46]:
model_nn_log.score(X_test, y_test) 

0.9033360309324701

In [None]:
#relu&'adam'

In [105]:
nn_log=MLPClassifier(solver ='adam', alpha =  100, hidden_layer_sizes=(3, 2), random_state=1)

In [106]:
model_nn_log=nn_log.fit(X_train, y_train)

In [107]:
model_nn_log.score(X_test, y_test) 

0.9073374696520097

# Part 3

In [None]:
#https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html

In [None]:
from sklearn.datasets import make_moons, make_circles, make_classification
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

In [None]:
h = .02  # step size in the mesh

names = ["RBF SVM", "Neural Net 1", "Neural Net 1"]

classifiers = [
    SVC(gamma='auto'),
    MLPClassifier(activation = 'logistic',solver ='lbfgs', alpha =  0.1, hidden_layer_sizes=(5, 2), random_state=1),
    MLPClassifier(solver ='adam', alpha =  100, hidden_layer_sizes=(3, 2), random_state=1)
   ]

X, y = make_classification(n_features=2, n_redundant=0, n_informative=2,
                           random_state=1, n_clusters_per_class=1)
rng = np.random.RandomState(2)
X += 2 * rng.uniform(size=X.shape)
linearly_separable = (X, y)

datasets = [make_moons(noise=0.3, random_state=0),
            make_circles(noise=0.2, factor=0.5, random_state=1),
            linearly_separable
            ]

figure = plt.figure(figsize=(27, 9))
i = 1
# iterate over datasets
for ds_cnt, ds in enumerate(datasets):
    # preprocess dataset, split into training and test part
    X, y = ds
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=.4, random_state=42)

    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    # just plot the dataset first
    cm = plt.cm.RdBu
    cm_bright = ListedColormap(['#FF0000', '#0000FF'])
    ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
    if ds_cnt == 0:
        ax.set_title("Input data")
    # Plot the training points
    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright,
               edgecolors='k')
    # Plot the testing points
    ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6,
               edgecolors='k')
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_xticks(())
    ax.set_yticks(())
    i += 1

    # iterate over classifiers
    for name, clf in zip(names, classifiers):
        ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)

        # Plot the decision boundary. For that, we will assign a color to each
        # point in the mesh [x_min, x_max]x[y_min, y_max].
        if hasattr(clf, "decision_function"):
            Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
        else:
            Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]

        # Put the result into a color plot
        Z = Z.reshape(xx.shape)
        ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)

        # Plot the training points
        ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright,
                   edgecolors='k')
        # Plot the testing points
        ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright,
                   edgecolors='k', alpha=0.6)

        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
        ax.set_xticks(())
        ax.set_yticks(())
        if ds_cnt == 0:
            ax.set_title(name)
        ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'),
                size=15, horizontalalignment='right')
        i += 1

plt.tight_lay