In [1]:
%matplotlib notebook
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA as PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import confusion_matrix

In [2]:
data = pd.read_csv('../data_and_visualisation/HTRU_2.csv')
data.columns = ['Mean IP', 'Standard deviation IP', 'Excess kurtosis IP', 'Skewness IP',
                'Mean DS', 'Standard deviation DS', 'Excess kurtosis DS', 'Skewness DS',
                'Class']

In [3]:
x = data.drop('Class', axis = 'columns')
y = data['Class']

In [4]:
ssc = StandardScaler()
ssc.fit(x)
x_skal = ssc.transform(x)

In [5]:
x_skal = pd.DataFrame(x_skal)
x_skal.columns = ['Mean IP', 'Standard deviation IP', 'Excess kurtosis IP', 'Skewness IP',
                'Mean DS', 'Standard deviation DS', 'Excess kurtosis DS', 'Skewness DS']

In [6]:
x_skal

Unnamed: 0,Mean IP,Standard deviation IP,Excess kurtosis IP,Skewness IP,Mean DS,Standard deviation DS,Excess kurtosis DS,Skewness DS
0,-0.334107,1.802379,-0.011822,-0.370549,-0.371110,-0.588931,0.504409,0.211560
1,-0.314311,-1.053271,-0.145268,-0.116613,-0.322117,-0.235343,-0.125997,-0.391379
2,1.000768,1.553363,-0.513438,-0.390191,-0.304414,-0.275681,-0.312261,-0.481304
3,-0.871346,-0.858823,0.115570,-0.104886,-0.388018,-0.763113,1.323985,1.386742
4,-0.682521,0.021788,0.050758,-0.219475,-0.372501,-0.605113,0.514453,0.249118
...,...,...,...,...,...,...,...,...
17892,0.988281,1.943401,-0.625682,-0.406710,-0.384018,-0.727298,1.586006,1.699975
17893,0.447387,0.429146,-0.328863,-0.234660,0.128755,0.939882,-1.189130,-0.906567
17894,0.321909,1.956337,-0.299367,-0.407505,0.299112,1.671507,-1.288047,-0.941322
17895,0.133693,1.074608,-0.260083,-0.291057,-0.361976,-0.664862,0.378242,0.275827


In [7]:
pca = PCA()
x_pca = pca.fit_transform(x_skal)

In [8]:
x_pca = pd.DataFrame(x_pca)

In [9]:
x_pca_y = pd.DataFrame.copy(x_pca)
x_pca_y['Class'] = y

In [10]:
from mpl_toolkits.mplot3d import Axes3D

def boja(xs):
    ret = []
    for x in xs:
        if x == 1:
            ret.append('SteelBlue')
        else:
            ret.append('DarkRed')
    return ret

fig = plt.figure(figsize=(8, 8))
ax = Axes3D(fig)
ax.scatter(x_pca_y[0], x_pca_y[1], x_pca_y[2], c = boja(x_pca_y['Class']))
plt.show()

<IPython.core.display.Javascript object>

In [13]:
def matConf(model, x, y):
    y_pred = model.predict(x)
    return confusion_matrix(y, y_pred)


def klasteruj_i_prikazi(k, x, x_pca, sses, siluete):
    fig = plt.figure(figsize=(4, 4))
    ax = Axes3D(fig)
    ax.view_init(27,91)
    
    ksredina = KMeans(n_clusters=k)
    ksredina.fit(x)
    
    print("SSE: %f" % ksredina.inertia_)
    sses.append(ksredina.inertia_)
    print("Silueta: %f" % silhouette_score(x, ksredina.labels_))
    siluete.append(silhouette_score(x, ksredina.labels_))
    print("\"Matrica konfuzije\": ")
    print(matConf(ksredina, x, y)[:2])

    ax.scatter(x_pca[0], x_pca[1], x_pca[2], c=ksredina.labels_)

In [14]:
sses = []
siluete = []

In [15]:
klasteruj_i_prikazi(2, x, x_pca, sses, siluete)

<IPython.core.display.Javascript object>

SSE: 122773426.768590
Silueta: 0.594949
"Matrica konfuzije": 
[[13697  2561]
 [ 1609    30]]


In [126]:
klasteruj_i_prikazi(3, x, x_pca, sses, siluete)

<IPython.core.display.Javascript object>

SSE: 79317644.375348
Silueta: 0.427238
"Matrica konfuzije": 
[[8498  910 6850]
 [1568    9   62]]


In [16]:
klasteruj_i_prikazi(4, x, x_pca, sses, siluete)

<IPython.core.display.Javascript object>

SSE: 57108858.708802
Silueta: 0.421143
"Matrica konfuzije": 
[[3059 9534  456 3209]
 [  33  124    5 1477]]


In [17]:
klasteruj_i_prikazi(5, x, x_pca, sses, siluete)

<IPython.core.display.Javascript object>

SSE: 41792683.749478
Silueta: 0.428321
"Matrica konfuzije": 
[[5972 1739  292  920 7335]
 [  58   18    4 1125  434]]


In [18]:
klasteruj_i_prikazi(6, x, x_pca, sses, siluete)

<IPython.core.display.Javascript object>

SSE: 32871593.628342
Silueta: 0.409469
"Matrica konfuzije": 
[[ 819 5503  167 2635 6311  823]
 [   8  519    1   29   65 1017]]


In [19]:
klasteruj_i_prikazi(6, x, x_pca, sses, siluete)

<IPython.core.display.Javascript object>

SSE: 32871683.257160
Silueta: 0.409718
"Matrica konfuzije": 
[[ 823 6327  167  815 5502 2624]
 [1008   65    1    8  528   29]]


In [20]:
klasteruj_i_prikazi(7, x, x_pca, sses, siluete)

<IPython.core.display.Javascript object>

SSE: 28000389.347448
Silueta: 0.387800
"Matrica konfuzije": 
[[3277 4447 5976  411 1278   90  779]
 [  27  571   77    6   11    1  946]]


In [21]:
klasteruj_i_prikazi(8, x, x_pca, sses, siluete)

<IPython.core.display.Javascript object>

SSE: 23631762.105944
Silueta: 0.401428
"Matrica konfuzije": 
[[5895  410 4582 1271   91  731   54 3224]
 [  74    6  406   11    1   84 1030   27]]


In [35]:
fig,ax = plt.subplots()
ax.plot(xs, sses)
ax.set_xlabel("K #")
ax.set_ylabel("sse")

ax2=ax.twinx()
ax2.plot(xs, siluete, c = 'orange')
ax2.set_ylabel("silueta")
plt.show()


<IPython.core.display.Javascript object>

In [38]:
#sve ukazuje na to da treba prestati sa povecanjem broja kalstera iako smo videli da nam "matrica konfuzije" govori da se tek nakon osam klastera smanji entropija rasporeda klasa po klasteirma
#zakljucak je da podaci nisu lepi za klasterovanje