# Bank customers retirement predictions

**Task:** We are working as analysts at a bank and have been tasked with developing a model that predicts whether a customer is able to retire, based on the customer's features. The features are the customer's age and net 401K savings (a 401K is a retirement savings plan in the U.S.).

Data Source: https://www.kaggle.com/adarshkumarjha/bank-customer-retirement

# Importing the libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Importing the dataset

In [None]:
# Load the customer records from the CSV file (path is relative to the
# working directory).
ds = pd.read_csv(filepath_or_buffer='Bank_Customer_retirement.csv')

In [None]:
# Show the loaded DataFrame as this cell's output.
ds

# Visualising the dataset

In [None]:
# Pairwise plots of the two features, coloured by the retirement label.
sns.pairplot(
    data=ds,
    vars=['Age', '401K Savings'],
    hue='Retire',
)

In [None]:
# Bar chart of how many rows fall into each value of the Retire column.
sns.countplot(x=ds['Retire'])

# Missing data

In [None]:
# Draw the frame's null-mask as a heatmap: any missing value would appear as a
# contrasting cell. We observe no missing data.
sns.heatmap(
    ds.isna(),
    cmap='Blues',
    cbar=False,
    yticklabels=False,
)

In [None]:
# Feature matrix: the two predictors the task is defined on. They are selected
# by name rather than by column position so that X[:, 0] is always Age and
# X[:, 1] is always 401K Savings, whatever the column order in the CSV; the
# 2-D decision-region plots later in the notebook rely on exactly that layout.
X = ds[['Age', '401K Savings']].values
# Target vector: the retirement label.
y = ds['Retire'].values

In [None]:
# Show the feature array as this cell's output.
X

In [None]:
# Show the target array as this cell's output.
y

# Encoding categorical variables

In [None]:
# There are no categorical variables

# Splitting the dataset into the training set and test set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.2,
)

# Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits.
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

# Fitting the SVC to the dataset

In [None]:
from sklearn.svm import SVC

# RBF-kernel support vector classifier trained on the scaled training split.
svc = SVC(random_state=0, kernel='rbf')
svc.fit(X_train, y_train)

In [None]:
# Predicting the test set results with the fitted classifier; y_pred is
# compared against y_test in the evaluation cells.

y_pred = svc.predict(X_test)

# Visualising the Dataset

In [None]:
# Age against 401K savings for every customer, coloured by the Retire label.
sns.scatterplot(
    data=ds,
    x='Age',
    y='401K Savings',
    hue='Retire',
)

# Visualising the Training set

In [None]:
from matplotlib.colors import ListedColormap

# Draw the fitted SVC's decision regions and overlay the training points.
# X_train was standardised earlier, so both axes are in scaled units.
X_set, y_set = X_train, y_train
class_colours = ListedColormap(('red', 'green'))

# Dense grid covering the data range (padded by 1 on every side); the
# classifier is evaluated on each grid point to colour the background.
X1, X2 = np.meshgrid(
    np.arange(start=X_set[:, 0].min() - 1, stop=X_set[:, 0].max() + 1, step=0.01),
    np.arange(start=X_set[:, 1].min() - 1, stop=X_set[:, 1].max() + 1, step=0.01),
)
grid_pred = svc.predict(np.column_stack((X1.ravel(), X2.ravel())))
plt.contourf(X1, X2, grid_pred.reshape(X1.shape), alpha=0.75, cmap=class_colours)
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())

for i, j in enumerate(np.unique(y_set)):
    # Use `color=` rather than `c=`: a single RGBA tuple passed as `c` is
    # ambiguous and is treated as four scalar values to be colour-mapped when
    # a class happens to contain exactly four points.
    plt.scatter(
        X_set[y_set == j, 0],
        X_set[y_set == j, 1],
        color=class_colours(i),
        label=j,
    )
plt.title('Training set')
plt.xlabel('Age')
plt.ylabel('401K Savings')
plt.legend()
plt.show()

In [None]:
# Visualising the Test set

In [None]:
# Draw the fitted SVC's decision regions and overlay the held-out test points.
# X_test was standardised earlier, so both axes are in scaled units.
X_set, y_set = X_test, y_test
class_colours = ListedColormap(('red', 'green'))

# Dense grid covering the data range (padded by 1 on every side); the
# classifier is evaluated on each grid point to colour the background.
X1, X2 = np.meshgrid(
    np.arange(start=X_set[:, 0].min() - 1, stop=X_set[:, 0].max() + 1, step=0.01),
    np.arange(start=X_set[:, 1].min() - 1, stop=X_set[:, 1].max() + 1, step=0.01),
)
grid_pred = svc.predict(np.column_stack((X1.ravel(), X2.ravel())))
plt.contourf(X1, X2, grid_pred.reshape(X1.shape), alpha=0.75, cmap=class_colours)
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())

for i, j in enumerate(np.unique(y_set)):
    # Use `color=` rather than `c=`: a single RGBA tuple passed as `c` is
    # ambiguous and is treated as four scalar values to be colour-mapped when
    # a class happens to contain exactly four points.
    plt.scatter(
        X_set[y_set == j, 0],
        X_set[y_set == j, 1],
        color=class_colours(i),
        label=j,
    )
# This figure shows the test split, so it must not be titled 'Training set'.
plt.title('Test set')
plt.xlabel('Age')
plt.ylabel('401K Savings')
plt.legend()
plt.show()

# Model Evaluation - Confusion Matrix and K-Fold Cross Validation

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Confusion matrix of the true test labels against the SVC's predictions.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Show the confusion matrix as this cell's output.
cm

In [None]:
# Per-class precision / recall / F1 summary for the test-set predictions.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

In [None]:
import warnings

from sklearn.model_selection import cross_val_score

# Silence all warnings from this point on to keep the notebook output tidy.
warnings.filterwarnings('ignore')

# 10-fold cross-validation of the baseline SVC on the training split; report
# the mean and the standard deviation of the per-fold accuracies.
accuracy = cross_val_score(svc, X_train, y_train, cv=10)
print("Mean accuracy = ", round(100 * accuracy.mean(), 1), '%')
print("Mean std is = ", accuracy.std())

# Model Improvements - Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate hyper-parameter grids, one dictionary per kernel family.
parameters = [
    {
        'C': [1, 10, 100, 1000],
        'kernel': ['linear'],
    },
    {
        'C': [1, 10, 100, 1000],
        'kernel': ['rbf'],
        'gamma': [0.5, 0.1, 0.01, 0.001],
    },
    {
        'C': [1, 10, 100, 1000],
        'kernel': ['poly'],
        'degree': [2, 3, 4],
        'gamma': [0.5, 0.1, 0.01, 0.001],
    },
]

# Exhaustive search scored by 10-fold cross-validated accuracy, using all
# available cores.
grid_search = GridSearchCV(
    estimator=svc,
    param_grid=parameters,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
).fit(X_train, y_train)

best_parameters = grid_search.best_params_
best_accuracy = grid_search.best_score_

print('Optimal accuracy is:', round(100 * best_accuracy, 1), '%')
print(best_parameters)

# Using the improved model

In [None]:
# We are using the SVC with the optimal parameters obtained from the Grid
# Search CV. They are read from `best_parameters` instead of being typed in by
# hand, so the refitted model always matches what the search actually selected
# (including kernels that take no `gamma`).

svc_new = SVC(**best_parameters, random_state=0)
svc_new.fit(X_train, y_train)

# Replace the baseline predictions with those of the tuned model.
y_pred = svc_new.predict(X_test)

In [None]:
# Confusion matrix for the tuned model's test-set predictions.
tuned_cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(tuned_cm)

In [None]:
# Per-class precision / recall / F1 summary for the tuned model.
tuned_report = classification_report(y_true=y_test, y_pred=y_pred)
print(tuned_report)

In [None]:
# 10-fold cross-validation of the tuned SVC on the training split; report the
# mean and the standard deviation of the per-fold accuracies.
accuracy = cross_val_score(svc_new, X_train, y_train, cv=10)
print("Mean accuracy = ", round(100 * accuracy.mean(), 1), '%')
print("Mean std is = ", accuracy.std())