In [None]:
##################################
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.preprocessing import StandardScaler
import pandas as pd 
import numpy as np
import random
##################################

In [None]:
# Read bank.csv (path is relative to the notebook's working directory) into
# `df` and show the frame as the cell output.
bank_csv = "bank.csv"
df = pd.read_csv(bank_csv)
df

In [None]:
"""
This cell re-encodes every categorical column as integers, giving each distinct category its own code, so that the data can be clustered properly
"""
# Ordered levels of each text column. A level's position in its list is the
# integer code that replaces it in the data frame.
job = ["admin.", "blue-collar", "entrepreneur", "housemaid", "management", "retired",
       "self-employed", "services", "student", "technician", "unemployed", "unknown"]
marital = ["divorced", "married", "single", "unknown"]
education = ["primary", "secondary", "tertiary", "unknown"]
contact = ["telephone", "cellular", "unknown"]
month = ["jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec"]
poutcome = ["other", "failure", "success", "unknown"]
no_yes = ["no", "yes"]  # shared by the binary columns: "no" -> 0, "yes" -> 1

category_levels = {
    "job": job,
    "marital": marital,
    "education": education,
    "default": no_yes,
    "housing": no_yes,
    "loan": no_yes,
    "contact": contact,
    "month": month,
    "poutcome": poutcome,
    "y": no_yes,
}


def encode_levels(column, levels):
    """Return a copy of `column` with each value listed in `levels` replaced by its index.

    Parameters
    ----------
    column : pandas.Series
        The column to encode.
    levels : list
        Category values; the position of a value in this list is its code.

    Returns
    -------
    pandas.Series
        Encoded column. Values that do not appear in `levels` are passed
        through unchanged, so re-running the cell on already-encoded data is
        harmless.
    """
    codes = {level: code for code, level in enumerate(levels)}
    return column.map(lambda value: codes.get(value, value))


# Assign whole columns back to the frame. The previous `df.col[mask] = code`
# form is chained indexing: pandas may apply the write to a temporary copy
# (SettingWithCopyWarning), and with Copy-on-Write enabled it never reaches `df`.
for column_name, levels in category_levels.items():
    df[column_name] = encode_levels(df[column_name], levels)

# The target must be a plain integer column for the classifiers used later.
df["y"] = df["y"].astype("int")

In [None]:
"""
Grabs a random sample of 500 rows from the data set for analysis
"""
# Draw the working sample. The fixed seed makes Restart & Run All select the
# same rows every time, and the size is capped at the number of available rows
# because DataFrame.sample raises when asked for more rows than exist
# (sampling is without replacement).
sample_size = min(500, len(df))
sampled = df.sample(n=sample_size, random_state=1)

sampled.head()

In [None]:
# Discard every sampled row that contains NaN or an infinity in any column.
# `axis` is passed by keyword: the positional form `.any(1)` was deprecated in
# pandas 1.5 and is rejected with a TypeError from pandas 2.0 onwards.
invalid_values = [np.nan, np.inf, -np.inf]
has_invalid_value = sampled.isin(invalid_values).any(axis=1)
sampled = sampled[~has_invalid_value]
sampled

In [None]:
"""
Sets the input attributes as the first 16, with the 17th attribute acting as the output.
The attributes most relevant to the output are then plotted using the model's feature importances.
"""
# Inputs are the first 16 columns of the full data set; the target is the
# last column, cast to int so it is treated as class labels.
x = df.iloc[:, 0:16]
y = df.iloc[:, -1].astype('int')

# Extra-trees are randomised, so an unseeded model can rank features
# differently on every run. The seed keeps the ranking (and therefore the
# choice of the most influential attributes) reproducible.
model = ExtraTreesClassifier(random_state=1)
model.fit(x, y)
print(model.feature_importances_)

# Label each importance with its column name and plot the ten largest.
feat_importances = pd.Series(model.feature_importances_, index=x.columns)
feat_importances.nlargest(10).plot(kind='barh')
plt.show()

In [None]:
"""
Filters the sample array set down to the 4 most influential attributes
"""
# Columns kept for the clustering and classification steps.
selected_columns = ['duration', 'month', 'day', 'age']

# Extract the columns as a numpy array, pass it through np.nan_to_num and
# cast the result to integers.
new_df = np.nan_to_num(np.array(sampled[selected_columns])).astype('int')

In [None]:
"""
Generates the elbow graph to help identify the number of clusters
"""
# Candidate cluster counts; used for both the fits and the x-axis so the two
# cannot drift apart.
cluster_counts = range(1, 11)

# Inertia (within-cluster sum of squared distances) for each candidate k.
# random_state makes the curve reproducible; n_init is stated explicitly
# because its default differs between scikit-learn releases.
distortions = []
for i in cluster_counts:
    km = KMeans(n_clusters=i, n_init=10, random_state=1)
    km.fit(new_df)
    distortions.append(km.inertia_)

plt.plot(cluster_counts, distortions, 'bx-')
plt.xlabel('Number of clusters')
plt.ylabel('Distortion')
plt.tight_layout()

plt.show()

In [None]:
"""
Fits the data to the KMeans algorithm for clustering based on the most 'balanced' K value found in the elbow graph
"""

# Three clusters, seeded so the labels (and the plot built from them) are the
# same on every run; n_init is explicit because its default differs between
# scikit-learn releases.
# NOTE(review): the model is fitted on the full `df` (every column, including
# the target `y`), whereas the elbow curve was computed on `new_df` - confirm
# that this is intended.
km = KMeans(n_clusters=3, n_init=10, random_state=1)

km.fit(df)

In [None]:
"""
Preps and fits the pca algorithm to be able to easily plot the kmeans
"""
from sklearn.decomposition import PCA

# Project the full data set onto its first two principal components so the
# clusters can be drawn on a plane.
pca = PCA(n_components=2)
pca_components = pca.fit_transform(df)

# Cluster index assigned to each row by the fitted KMeans model.
labels = km.labels_

In [None]:
"""
Check to see the pca columns
"""
# Wrap the two principal components in a frame with named columns.
pca_columns = ['pca1', 'pca2']
pca_df = pd.DataFrame(pca_components, columns=pca_columns)
pca_df.head()

In [None]:
"""
Sets the data for plotting
"""
# Add the KMeans label of every row as a 'cluster' column.
# `assign` overwrites an existing 'cluster' column, so re-running this cell
# does not append a duplicate column the way concatenation did, and it raises
# if the number of labels does not match the number of rows instead of
# padding the frame with NaN.
pca_df = pca_df.assign(cluster=labels)
pca_df.head()

In [None]:
"""
Graphs the clusters for the total dataset
"""
import seaborn as sns

# One colour per cluster label.
cluster_palette = ['red', 'blue', 'orange']

fig, ax = plt.subplots(figsize=(10, 10))
sns.scatterplot(data=pca_df, x='pca1', y='pca2', hue='cluster',
                palette=cluster_palette, ax=ax)

# Project the cluster centres into the same PCA plane and mark them with
# large black stars.
centers_on_PCs = pca.transform(km.cluster_centers_)
ax.scatter(centers_on_PCs[:, 0], centers_on_PCs[:, 1], s=250, c="k", marker="*")

In [None]:
"""
Calculates the fraction of the sample that falls into each outcome class
"""
print("Percentage of positives")
# Number of sampled rows per outcome, divided by the sample size.
outcome_counts = sampled['y'].value_counts()
outcome_counts / len(sampled)

In [None]:
"""
Defines the x and y for the KFold cross validation
"""
# Features: the selected columns of the sample. Labels: the sample's outcome
# column as a numpy array.
x, y = new_df, np.array(sampled['y'])

In [None]:
"""
Computes the score of a linear SVM on each fold of a 10-fold cross validation
"""
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn import svm

# Ten shuffled folds with a fixed seed, shared with the later cells via `cv`.
cv = KFold(n_splits=10, shuffle=True, random_state=1)

# Linear-kernel support vector classifier, scored once per fold.
clf = svm.SVC(kernel='linear', C=1)
predicted_score = cross_val_score(estimator=clf, X=x, y=y, cv=cv)


In [None]:
"""
Prints the mean and standard deviation of the cross-validation scores
"""
from numpy import mean, std

# Summarise the per-fold scores as mean and standard deviation.
mean_accuracy = mean(predicted_score)
accuracy_spread = std(predicted_score)
print('Accuracy: %.3f ( +/- %.3f)' % (mean_accuracy, accuracy_spread))

In [None]:
"""
Prints the predicted values from the cross validation
"""
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import cross_val_predict

# 80/20 split of the sample. The four resulting arrays are not read again in
# this cell.
X_train, X_test, Y_train, Y_test = train_test_split(
    new_df, y, train_size=0.8, random_state=1
)

# Linear discriminant analysis fitted on the whole sample.
clf = LinearDiscriminantAnalysis().fit(new_df, y)

# Out-of-fold prediction for every sample, using the folds defined by `cv`.
predicted = cross_val_predict(clf, new_df, y, cv=cv)

predicted

In [None]:
"""
Displays a dataframe with the actual and predicted values as columns
"""
# Actual outcome next to the cross-validated prediction, one row per sample.
data = pd.DataFrame(
    {'y_Actual': sampled['y'], 'y_Predicted': predicted},
    columns=['y_Actual', 'y_Predicted'],
)
data

In [None]:
"""
Prints a classification summary with more metrics to determine accuracy
"""
from sklearn.metrics import classification_report

# Text report comparing the sample's actual outcomes with the predictions.
report = classification_report(y_true=sampled['y'], y_pred=predicted)
print(report)

In [None]:
"""
Plots the confusion matrix as a heatmap
"""
# Cross-tabulate actual against predicted outcomes; margins=True appends the
# 'All' row and column holding the totals.
confusion_matrix = pd.crosstab(
    data['y_Actual'],
    data['y_Predicted'],
    rownames=['Actual'],
    colnames=['Predicted'],
    margins=True,
)

# fmt='d' prints the counts as whole numbers; the default annotation format
# ('.2g') renders counts of 100 or more in scientific notation.
sns.heatmap(confusion_matrix, annot=True, fmt='d')
plt.show()