In [1]:
import time
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

from sklearn.manifold import TSNE 
from sklearn.decomposition import PCA
from scipy.cluster.hierarchy import dendrogram

from Agnes import Agnes
from utils import report
from DBscan import DBscan 

import warnings
# Suppress every warning for the rest of the session. This hides all warning
# categories, including deprecation notices raised by the libraries imported above.
warnings.filterwarnings("ignore") 
# Colormap name stored as a notebook-wide setting; it is not referenced by any
# of the cells visible in this part of the notebook.
cmap = 'Spectral'

In [2]:
# Columns holding categorical values; each is replaced below by integer category codes.
cat = ['OverTime', 'MaritalStatus', 'JobRole', 'Gender', 'EducationField', 'Department', 'BusinessTravel', 'Attrition']

# Read the raw table and discard four columns that are not used as features.
data = pd.read_csv('datset.csv').drop(
    columns=['EmployeeCount', 'Over18', 'StandardHours', 'EmployeeNumber']
)

# Encode every categorical column in place as int64 category codes.
for column in cat:
    codes = data[column].astype('category').cat.codes
    data[column] = codes.apply(np.int64)

# Split into the feature table (everything except Attrition) and the target column.
X = data.drop(columns=['Attrition'])
y = data['Attrition']

In [3]:
# Feature table without the target column; used for the column-wise statistics below.
data_x = data.drop(columns=['Attrition'])

# Plain containers for the clustering code: ndarray of features, list of labels.
X = np.array(X)
y = list(y)

# Column-wise statistics (pandas defaults, i.e. the sample standard deviation).
col_mean, col_std = data_x.mean(), data_x.std()
col_min, col_max = data_x.min(), data_x.max()

# Z-score standardisation: (value - column mean) / column std.
X_standardized = np.array((data_x - col_mean) / col_std)
# Min-max normalisation: (value - column min) / (column max - column min).
X_normalized = np.array((data_x - col_min) / (col_max - col_min))

In [4]:
# Run AGNES with Ward linkage once per precomputed distance matrix.
# Each entry is (label printed in the log line, metric name passed to Agnes,
# path of the pickled distance matrix, feature matrix handed to cluster()).
agnes_runs = [
    ('Hamming', 'hamming', 'Agnes//Distances_Hamming.pkl', X),
    ('Manhattan', 'manhattan', 'Agnes//Distances_Manhattan.pkl', X),
    ('Manhattan Norm', 'manhattan', 'Agnes//Distances_Manhattan_Norm.pkl', X_normalized),
    ('Manhattan Std', 'manhattan', 'Agnes//Distances_Manhattan_Std.pkl', X_standardized),
]

agnes_results = {}
for run_label, metric, dist_path, features in agnes_runs:
    agnes = Agnes(metric, 'ward')

    # SECURITY NOTE: pickle.load can execute arbitrary code embedded in the file;
    # only load distance matrices that were produced locally / come from a trusted source.
    with open(dist_path, 'rb') as f:
        dist_matrix = pickle.load(f)

    # The recorded run of this cell failed with
    # "AttributeError: 'list' object has no attribute 'shape'". The feature matrices
    # are ndarrays, so the pickled matrix is the likely list being accessed
    # (assumption: Agnes.cluster is not visible here -- TODO confirm). Converting a
    # list to an ndarray is a no-op for matrices that already are ndarrays.
    if isinstance(dist_matrix, list):
        dist_matrix = np.asarray(dist_matrix)

    start = time.time()
    clusters, distances, nb_clusters = agnes.cluster(features, dist_matrix=dist_matrix)
    end = time.time() - start  # elapsed wall-clock seconds for this run
    print(run_label + ' done', end)

    agnes_results[run_label] = (clusters, distances, nb_clusters)

# Expose the per-run results under the names used by the cells below.
clusters_H, distances_H, nb_clusters_H = agnes_results['Hamming']
clusters_M, distances_M, nb_clusters_M = agnes_results['Manhattan']
clusters_N, distances_N, nb_clusters_N = agnes_results['Manhattan Norm']
clusters_S, distances_S, nb_clusters_S = agnes_results['Manhattan Std']

AttributeError: 'list' object has no attribute 'shape'

In [None]:
# Evolution of the Hamming / Ward AGNES run: the distance series returned by
# cluster() on the left axis and the cluster-count series on the right axis.
# (Presumably one value per merge iteration -- confirm against Agnes.cluster.)
fig, ax = plt.subplots()
ax.plot(distances_H, color='orange', label='Min Distance')
ax.tick_params(axis='y', labelcolor='orange')
ax.set_xlabel('#Iterations')
ax.set_ylabel('Min distance')

# Second y-axis sharing the same x-axis, drawn on a log scale.
ax2 = ax.twinx()
ax2.plot(nb_clusters_H, color='purple', label='#Clusters')
ax2.set_yscale('log')
ax2.tick_params(axis='y', labelcolor='purple')
ax2.set_ylabel('#Clusters')

# Name the run in the title so the four evolution figures can be told apart.
ax.set_title('Evolution (Hamming)')
plt.show()

In [None]:
# Evolution of the Manhattan / Ward AGNES run on the raw features: the distance
# series returned by cluster() on the left axis and the cluster-count series on
# the right axis. (Presumably one value per merge iteration -- confirm against
# Agnes.cluster.)
fig, ax = plt.subplots()
ax.plot(distances_M, color='orange', label='Min Distance')
ax.tick_params(axis='y', labelcolor='orange')
ax.set_xlabel('#Iterations')
ax.set_ylabel('Min distance')

# Second y-axis sharing the same x-axis, drawn on a log scale.
ax2 = ax.twinx()
ax2.plot(nb_clusters_M, color='purple', label='#Clusters')
ax2.set_yscale('log')
ax2.tick_params(axis='y', labelcolor='purple')
ax2.set_ylabel('#Clusters')

# Name the run in the title so the four evolution figures can be told apart.
ax.set_title('Evolution (Manhattan)')
plt.show()

In [None]:
# Evolution of the Manhattan / Ward AGNES run on the min-max normalised
# features: the distance series returned by cluster() on the left axis and the
# cluster-count series on the right axis. (Presumably one value per merge
# iteration -- confirm against Agnes.cluster.)
fig, ax = plt.subplots()
ax.plot(distances_N, color='orange', label='Min Distance')
ax.tick_params(axis='y', labelcolor='orange')
ax.set_xlabel('#Iterations')
ax.set_ylabel('Min distance')

# Second y-axis sharing the same x-axis, drawn on a log scale.
ax2 = ax.twinx()
ax2.plot(nb_clusters_N, color='purple', label='#Clusters')
ax2.set_yscale('log')
ax2.tick_params(axis='y', labelcolor='purple')
ax2.set_ylabel('#Clusters')

# Name the run in the title so the four evolution figures can be told apart.
ax.set_title('Evolution (Manhattan, normalized)')
plt.show()

In [None]:
# Evolution of the Manhattan / Ward AGNES run on the standardised features:
# the distance series returned by cluster() on the left axis and the
# cluster-count series on the right axis. (Presumably one value per merge
# iteration -- confirm against Agnes.cluster.)
fig, ax = plt.subplots()
ax.plot(distances_S, color='orange', label='Min Distance')
ax.tick_params(axis='y', labelcolor='orange')
ax.set_xlabel('#Iterations')
ax.set_ylabel('Min distance')

# Second y-axis sharing the same x-axis, drawn on a log scale.
ax2 = ax.twinx()
ax2.plot(nb_clusters_S, color='purple', label='#Clusters')
ax2.set_yscale('log')
ax2.tick_params(axis='y', labelcolor='purple')
ax2.set_ylabel('#Clusters')

# Name the run in the title so the four evolution figures can be told apart.
ax.set_title('Evolution (Manhattan, standardized)')
plt.show()

In [None]:
# Display the cluster identifiers (dictionary keys) of the four runs, in the
# order Hamming, Manhattan, Manhattan normalised, Manhattan standardised.
tuple(result.keys() for result in (clusters_H, clusters_M, clusters_N, clusters_S))

In [None]:
def _binary_labels(clusters, n_samples):
    """Turn a two-cluster result into per-sample 0/1 labels.

    The first value of ``clusters`` is treated as the "no" group and the second
    as the "yes" group; sample ``i`` is labelled 1 when ``i`` is a member of the
    "yes" group and 0 otherwise.

    NOTE(review): which cluster ends up second depends on the dictionary's
    insertion order, so the "yes"/"no" assignment is a convention of this
    notebook -- confirm it matches the Attrition encoding before trusting the
    scores.

    Parameters
    ----------
    clusters : dict
        Mapping whose values are collections of sample indices (assumed from
        the membership test below -- confirm against Agnes.cluster).
    n_samples : int
        Number of samples to label.

    Returns
    -------
    tuple
        ``(yes_members, no_members, labels)`` where ``labels`` is a list of
        ``n_samples`` ints.

    Raises
    ------
    IndexError
        If ``clusters`` holds fewer than two groups.
    """
    groups = list(clusters.values())  # materialise once instead of twice per run
    no_members, yes_members = groups[0], groups[1]
    try:
        # Constant-time membership instead of a linear scan per sample.
        yes_lookup = set(yes_members)
    except TypeError:
        # Members are not hashable: fall back to the original container semantics.
        yes_lookup = yes_members
    labels = [1 if i in yes_lookup else 0 for i in range(n_samples)]
    return yes_members, no_members, labels


yes_H, no_H, predicted_H = _binary_labels(clusters_H, len(X))
yes_M, no_M, predicted_M = _binary_labels(clusters_M, len(X))
yes_N, no_N, predicted_N = _binary_labels(clusters_N, len(X))
yes_S, no_S, predicted_S = _binary_labels(clusters_S, len(X))

In [None]:
# Compare the encoded Attrition labels with the 0/1 labels derived from the
# Hamming run. `report` comes from the project's utils module; what it prints
# or returns is not visible here.
report(y, predicted_H)

In [None]:
# Compare the encoded Attrition labels with the 0/1 labels derived from the
# Manhattan run on the raw features. `report` comes from the project's utils
# module; what it prints or returns is not visible here.
report(y, predicted_M)

In [None]:
# Compare the encoded Attrition labels with the 0/1 labels derived from the
# Manhattan run on the min-max normalised features. `report` comes from the
# project's utils module; what it prints or returns is not visible here.
report(y, predicted_N)

In [None]:
# Compare the encoded Attrition labels with the 0/1 labels derived from the
# Manhattan run on the standardised features. `report` comes from the project's
# utils module; what it prints or returns is not visible here.
report(y, predicted_S)