# PCA on Jet and Sharks

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import networkx as nx
from pyvis.network import Network
sns.set()

from scipy.spatial.distance import euclidean
from sklearn.decomposition import PCA

In [2]:
# Load the gang membership table; the unnamed first column holds the member
# names and becomes the index. A single method chain avoids in-place mutation.
df = (
    pd.read_excel("DBgang27x14.xlsx")
    .rename(columns={"Unnamed: 0": "Name"})
    .set_index("Name")
)
df

Unnamed: 0_level_0,Jet,Sharks,20s,30s,40s,JH,COL,HS,Single,Married,Divorced,Pusher,Bookie,Burglar
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
ART,1,0,0,0,1,1,0,0,1,0,0,1,0,0
AL,1,0,0,1,0,1,0,0,0,1,0,0,0,1
SAM,1,0,1,0,0,0,1,0,1,0,0,0,1,0
CLYDE,1,0,0,0,1,1,0,0,1,0,0,0,1,0
MIKE,1,0,0,1,0,1,0,0,1,0,0,0,1,0
JIM,1,0,1,0,0,1,0,0,0,0,1,0,0,1
GREG,1,0,1,0,0,0,0,1,0,1,0,1,0,0
JOHN,1,0,1,0,0,1,0,0,0,1,0,0,0,1
DOUG,1,0,0,1,0,0,0,1,1,0,0,0,1,0
LANCE,1,0,1,0,0,1,0,0,0,1,0,0,0,1


In [3]:
# Skip the first two columns (the gang membership flags) and keep the
# remaining indicator columns as PCA input.
df_features = df.iloc[:, 2:]
df_features.head()

Unnamed: 0_level_0,20s,30s,40s,JH,COL,HS,Single,Married,Divorced,Pusher,Bookie,Burglar
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
ART,0,0,1,1,0,0,1,0,0,1,0,0
AL,0,1,0,1,0,0,0,1,0,0,0,1
SAM,1,0,0,0,1,0,1,0,0,0,1,0
CLYDE,0,0,1,1,0,0,1,0,0,0,1,0
MIKE,0,1,0,1,0,0,1,0,0,0,1,0


## PCA

### PCA scelta ed analisi componenti

In [4]:
# Fit a PCA that keeps every component, then show the percentage of variance
# explained by each one.
pca = PCA().fit(df_features.values)
pca.explained_variance_ratio_ * 100

array([2.36455169e+01, 1.92744693e+01, 1.55842001e+01, 1.29943273e+01,
       1.15713146e+01, 8.75363490e+00, 4.53033929e+00, 3.64619752e+00,
       1.11116627e-30, 4.98377362e-31, 2.01313618e-31, 6.12125114e-32])

#### Component 1 ~ 23.65%
#### Component 2 ~ 19.27%
#### Component 3 ~ 15.58%
#### Component 4 ~ 12.99%
#### Component 5 ~ 11.57%
#### Component 6 ~ 8.75%
#### Component 7 ~ 4.53%
#### Component 8 ~ 3.65%
####  ...
#### Sum 8 ~ 100.00%
#### Sum 6 ~ 91.82%
#### Decido di usare 6 componenti

In [5]:
# Refit keeping only the first six components and tabulate their loadings:
# one row per component, one column per original feature.
pca = PCA(n_components=6).fit(df_features.values)
component_labels = [f"Component {k}" for k in range(1, pca.n_components_ + 1)]
df_pca_comp = pd.DataFrame(
    pca.components_,
    index=component_labels,
    columns=df_features.columns.values,
)
df_pca_comp

Unnamed: 0,20s,30s,40s,JH,COL,HS,Single,Married,Divorced,Pusher,Bookie,Burglar
Component 1,0.224932,-0.207233,-0.017699,0.19736,0.007593,-0.204953,-0.509372,0.35459,0.154782,-0.145186,-0.365792,0.510978
Component 2,-0.441155,0.508036,-0.066881,-0.277506,0.341222,-0.063717,-0.330921,0.406264,-0.075343,0.20754,-0.088585,-0.118954
Component 3,0.302642,-0.242462,-0.06018,-0.548845,0.006209,0.542636,-0.07495,0.010245,0.064705,0.36343,-0.327049,-0.036381
Component 4,0.243601,-0.091153,-0.152448,0.217165,0.296551,-0.513716,0.191982,-0.104176,-0.087806,0.547928,-0.320575,-0.227353
Component 5,-0.243754,0.411836,-0.168082,0.230319,-0.368646,0.138327,-0.011125,-0.393429,0.404554,0.210277,-0.378766,0.168489
Component 6,-0.34196,-0.267421,0.609381,0.253843,-0.347035,0.093192,-0.03561,0.233986,-0.198376,0.317554,-0.161828,-0.155726


In [6]:
# Heatmap of the loadings; the title doubles as the name of the exported HTML file.
title = "Heatmap componenti pca"
fig = px.imshow(
    df_pca_comp,
    color_continuous_scale='RdBu_r',
    aspect="auto",
    title=title,
)
fig.write_html(f"{title}.html")
fig.show()

In [7]:
# Human-readable labels for the retained components, filled in by the cells below.
cod_componenti = list()

In [8]:
# Features whose loading on component 1 exceeds 0.2 in absolute value.
descrittori_componente1 = df_pca_comp.iloc[0].loc[lambda loadings: loadings.abs() > 0.2]
descrittori_componente1

20s         0.224932
30s        -0.207233
HS         -0.204953
Single     -0.509372
Married     0.354590
Bookie     -0.365792
Burglar     0.510978
Name: Component 1, dtype: float64

In [9]:
# Component 1 profile: in their twenties, low education, married, burglar.
label = "(1)20s JH Married Burglar"
# Only append when missing: re-running this cell must not duplicate the label,
# otherwise the list no longer matches the number of PCA score columns.
if label not in cod_componenti:
    cod_componenti.append(label)

In [10]:
# Features whose loading on component 2 exceeds 0.2 in absolute value.
descrittori_componente2 = df_pca_comp.iloc[1].loc[lambda loadings: loadings.abs() > 0.2]
descrittori_componente2

20s        -0.441155
30s         0.508036
JH         -0.277506
COL         0.341222
Single     -0.330921
Married     0.406264
Pusher      0.207540
Name: Component 2, dtype: float64

In [11]:
# Component 2 profile: in their thirties, high education, married, pusher.
label = "(2)30s COL Married Pusher"
# Only append when missing: re-running this cell must not duplicate the label,
# otherwise the list no longer matches the number of PCA score columns.
if label not in cod_componenti:
    cod_componenti.append(label)

In [12]:
# Features whose loading on component 3 exceeds 0.2 in absolute value.
descrittori_componente3 = df_pca_comp.iloc[2].loc[lambda loadings: loadings.abs() > 0.2]
descrittori_componente3

20s        0.302642
30s       -0.242462
JH        -0.548845
HS         0.542636
Pusher     0.363430
Bookie    -0.327049
Name: Component 3, dtype: float64

In [13]:
# Component 3 profile: in their twenties, medium education, pusher.
label = "(3)20s HS Pusher"
# Only append when missing: re-running this cell must not duplicate the label,
# otherwise the list no longer matches the number of PCA score columns.
if label not in cod_componenti:
    cod_componenti.append(label)

In [14]:
# Features whose loading on component 4 exceeds 0.2 in absolute value.
descrittori_componente4 = df_pca_comp.iloc[3].loc[lambda loadings: loadings.abs() > 0.2]
descrittori_componente4

20s         0.243601
JH          0.217165
COL         0.296551
HS         -0.513716
Pusher      0.547928
Bookie     -0.320575
Burglar    -0.227353
Name: Component 4, dtype: float64

In [15]:
# Component 4 profile: in their twenties, low or high education, single, pusher.
label = "(4)20s JH/COL Single Pusher"
# Only append when missing: re-running this cell must not duplicate the label,
# otherwise the list no longer matches the number of PCA score columns.
if label not in cod_componenti:
    cod_componenti.append(label)

In [16]:
# Features whose loading on component 5 exceeds 0.2 in absolute value.
descrittori_componente5 = df_pca_comp.iloc[4].loc[lambda loadings: loadings.abs() > 0.2]
descrittori_componente5

20s         -0.243754
30s          0.411836
JH           0.230319
COL         -0.368646
Married     -0.393429
Divorced     0.404554
Pusher       0.210277
Bookie      -0.378766
Name: Component 5, dtype: float64

In [17]:
# Component 5 profile: in their thirties, low-to-medium education, divorced,
# pusher or burglar.
label = "(5)30s JH/HS Divorced Pusher/Burglar"
# Only append when missing: re-running this cell must not duplicate the label,
# otherwise the list no longer matches the number of PCA score columns.
if label not in cod_componenti:
    cod_componenti.append(label)

In [18]:
# Features whose loading on component 6 exceeds 0.2 in absolute value.
descrittori_componente6 = df_pca_comp.iloc[5].loc[lambda loadings: loadings.abs() > 0.2]
descrittori_componente6

20s        -0.341960
30s        -0.267421
40s         0.609381
JH          0.253843
COL        -0.347035
Married     0.233986
Pusher      0.317554
Name: Component 6, dtype: float64

In [19]:
# Component 6 profile: in their forties, low-to-medium education, married, pusher.
label = "(6)40s JH/HS Married Pusher"
# Only append when missing: re-running this cell must not duplicate the label,
# otherwise the list no longer matches the number of PCA score columns.
if label not in cod_componenti:
    cod_componenti.append(label)

### Riduzione dimensionale

In [20]:
# Project every member onto the retained components. The model was fitted on a
# plain NumPy array, so transform an array as well: passing the DataFrame makes
# scikit-learn warn "X has feature names, but PCA was fitted without feature names".
scores_pca = pca.transform(df_features.values)
df_pca = pd.DataFrame(data=scores_pca, index=df_features.index, columns=cod_componenti)
# Re-attach the first two columns of df (the gang membership flags) so the
# scores can be filtered by gang later on.
df_pca = pd.concat([df_pca, df.iloc[:, :2]], axis=1)
df_pca.head()


X has feature names, but PCA was fitted without feature names



Unnamed: 0_level_0,(1)20s JH Married Burglar,(2)30s COL Married Pusher,(3)20s HS Pusher,(4)20s JH/COL Single Pusher,(5)30s JH/HS Divorced Pusher/Burglar,(6)40s JH/HS Married Pusher,Jet,Sharks
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ART,-0.356095,-0.466142,-0.303449,0.802994,0.209677,1.215309,1,0
AL,0.974496,0.519467,-0.800347,-0.207149,0.365502,0.134822,1,0
SAM,-0.523838,-0.517813,-0.076053,0.409925,-1.054002,-0.816292,1,0
CLYDE,-0.576702,-0.762267,-0.993928,-0.065509,-0.379366,0.735927,1,0
MIKE,-0.766236,-0.18735,-1.17621,-0.004213,0.200552,-0.140875,1,0


In [21]:
# Heatmap of the PCA scores for every member. The score columns are selected by
# label (not by position) so the cell keeps working after df_pca gains or loses
# trailing columns; 'auto' is the documented spelling of the aspect option.
title = "Heatmap sulle componenti pca"
fig = px.imshow(
    df_pca[cod_componenti],
    color_continuous_scale='RdBu_r',
    aspect='auto',
    title=title,
    height=700,
)
fig.write_html(title+'.html')
fig.show()

In [22]:
# Heatmap of the PCA scores restricted to the Jets. Rows and score columns are
# picked in a single .loc call; 'auto' is the documented spelling of the aspect option.
px.imshow(
    df_pca.loc[df_pca['Jet '] == 1, cod_componenti],
    color_continuous_scale='RdBu_r',
    aspect='auto',
    title="Heatmap sulle componenti pca JET",
)

In [23]:
# Summary statistics of the component scores for the Jets only.
df_pca.loc[df_pca['Jet '] == 1, cod_componenti].describe()

Unnamed: 0,(1)20s JH Married Burglar,(2)30s COL Married Pusher,(3)20s HS Pusher,(4)20s JH/COL Single Pusher,(5)30s JH/HS Divorced Pusher/Burglar,(6)40s JH/HS Married Pusher
count,15.0,15.0,15.0,15.0,15.0,15.0
mean,0.070485,-0.374767,-0.091386,0.208443,-0.042275,0.020369
std,0.916168,0.432027,0.701035,0.508016,0.492679,0.507869
min,-1.168549,-0.922752,-1.17621,-0.735095,-1.054002,-0.816292
25%,-0.561166,-0.694447,-0.39459,-0.034861,-0.359828,-0.354494
50%,-0.356095,-0.429724,-0.200783,0.143974,0.042014,0.060284
75%,1.090675,-0.080455,0.192161,0.439043,0.28759,0.236665
max,1.406661,0.519467,1.236049,1.278428,0.789595,1.215309


In [24]:
# Heatmap of the PCA scores restricted to the Sharks. Rows and score columns are
# picked in a single .loc call; 'auto' is the documented spelling of the aspect option.
px.imshow(
    df_pca.loc[df_pca['Sharks '] == 1, cod_componenti],
    color_continuous_scale='RdBu_r',
    aspect='auto',
    title="Heatmap sulle componenti pca SHARKS",
)

In [25]:
# Summary statistics of the component scores for the Sharks only. The Sharks
# are selected through their own flag, matching the Sharks heatmap, instead of
# being defined indirectly as "not a Jet".
df_pca.loc[df_pca['Sharks '] == 1, cod_componenti].describe()

Unnamed: 0,(1)20s JH Married Burglar,(2)30s COL Married Pusher,(3)20s HS Pusher,(4)20s JH/COL Single Pusher,(5)30s JH/HS Divorced Pusher/Burglar,(6)40s JH/HS Married Pusher
count,12.0,12.0,12.0,12.0,12.0,12.0
mean,-0.088106,0.468459,0.114233,-0.260554,0.052844,-0.025461
std,0.621417,0.727683,0.563497,0.588121,0.636237,0.463177
min,-1.168549,-0.953121,-1.17621,-1.092548,-0.853661,-0.472158
25%,-0.404399,0.125364,-0.124871,-0.781736,-0.251698,-0.392021
50%,0.018263,0.287107,0.168633,-0.183683,-0.095724,-0.066825
75%,0.198383,1.145787,0.5065,0.030192,0.324815,0.055781
max,0.784729,1.464689,0.751042,0.647517,1.113283,0.850972


#### Profilo medio JET

In [26]:
# Mean score of the Jets on each component, kept as a one-row frame whose
# index label is 'mean'.
df_jet_means = (
    df_pca.loc[df_pca['Jet '] == 1, cod_componenti]
    .mean()
    .to_frame('mean')
    .T
)
df_jet_means

Unnamed: 0,(1)20s JH Married Burglar,(2)30s COL Married Pusher,(3)20s HS Pusher,(4)20s JH/COL Single Pusher,(5)30s JH/HS Divorced Pusher/Burglar,(6)40s JH/HS Married Pusher
mean,0.070485,-0.374767,-0.091386,0.208443,-0.042275,0.020369


#### Profilo medio SHARKS

In [27]:
# Mean score of the Sharks on each component, kept as a one-row frame whose
# index label is 'mean'. The Sharks are selected through their own flag,
# matching the Sharks heatmap, instead of being defined as "not a Jet".
df_sharks_means = (
    df_pca.loc[df_pca['Sharks '] == 1, cod_componenti]
    .mean()
    .to_frame('mean')
    .T
)
df_sharks_means

Unnamed: 0,(1)20s JH Married Burglar,(2)30s COL Married Pusher,(3)20s HS Pusher,(4)20s JH/COL Single Pusher,(5)30s JH/HS Divorced Pusher/Burglar,(6)40s JH/HS Married Pusher
mean,-0.088106,0.468459,0.114233,-0.260554,0.052844,-0.025461


### Confronto profili

In [28]:
# Build a table with one row per component and the columns
# 'Component', 'Jet' and 'Sharks' holding the mean score of each gang.
df_means = pd.concat([df_jet_means, df_sharks_means], ignore_index=True)
df_means.index = ["Jet", "Sharks"]
# Transpose while the frame is still purely numeric: the Jet/Sharks columns stay
# float (transposing together with a text column would turn them into object
# dtype) and no stray name is left on the column index.
df_means = df_means.T.rename_axis("Component").reset_index()
df_means

6,Component,Jet,Sharks
0,(1)20s JH Married Burglar,0.070485,-0.088106
1,(2)30s COL Married Pusher,-0.374767,0.468459
2,(3)20s HS Pusher,-0.091386,0.114233
3,(4)20s JH/COL Single Pusher,0.208443,-0.260554
4,(5)30s JH/HS Divorced Pusher/Burglar,-0.042275,0.052844
5,(6)40s JH/HS Married Pusher,0.020369,-0.025461


In [29]:
# Radar chart overlaying the mean component profile of the two gangs.
title = "Confronto profilo Jet e Sharks"
fig = go.Figure()

# One filled polar trace per gang, both drawn over the same component axis.
for gang in ("Jet", "Sharks"):
    fig.add_trace(
        go.Scatterpolar(
            r=df_means[gang],
            theta=df_means['Component'],
            fill='toself',
            name=gang,
        )
    )

fig.update_layout(
    polar=dict(radialaxis=dict(visible=True)),
    showlegend=True,
    title=title,
)

fig.show()
fig.write_html(f"{title}.html")

### Commento profili
#### Dal grafico si evince che le componenti discriminanti fra le due bande sono principalmente la 2 e la 4
#### DISTINZIONI PRINCIPALI MEDIE:
#### I Jet sono sui 20, single e con istruzione alta o bassa
#### Gli Sharks sono sui 30, sposati con istruzione alta

### Visualizzazione su 2 e 3 dimensioni

In [30]:
# Collapse the two membership flags into a single categorical 'Gang' column.
# The label is computed from the untouched source frame `df` (same row order as
# df_pca) and the drop ignores columns that are already gone, so re-running this
# cell no longer fails on the missing 'Jet ' column.
df_pca['Gang'] = np.where(df['Jet '].values == 1, 'Jet', 'Sharks')
df_pca.drop(columns=['Jet ', 'Sharks '], inplace=True, errors='ignore')

In [31]:
# Add a little white noise to improve the visualization; the untouched scores
# are kept in df_pca_no_noise.
rng = np.random.default_rng(42)  # fixed seed so the jittered plots are reproducible
df_pca_no_noise = df_pca.copy()
df_pca_noise = df_pca.copy()
# Standard normal noise scaled by 1/25, drawn for all score columns at once.
jitter = rng.standard_normal((len(df_pca_noise), len(cod_componenti))) / 25.0
df_pca_noise[cod_componenti] = df_pca_noise[cod_componenti] + jitter

In [32]:
# Preview the first rows of the jittered scores.
df_pca_noise.head(n=5)

Unnamed: 0_level_0,(1)20s JH Married Burglar,(2)30s COL Married Pusher,(3)20s HS Pusher,(4)20s JH/COL Single Pusher,(5)30s JH/HS Divorced Pusher/Burglar,(6)40s JH/HS Married Pusher,Gang
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ART,-0.414226,-0.417047,-0.283107,0.809142,0.243055,1.235089,Jet
AL,0.910524,0.567564,-0.742251,-0.181237,0.418761,0.208499,Jet
SAM,-0.505891,-0.442293,-0.04562,0.396558,-1.046997,-0.802395,Jet
CLYDE,-0.590409,-0.743009,-0.97525,-0.093332,-0.369638,0.747178,Jet
MIKE,-0.800403,-0.170318,-1.169424,-0.031838,0.216422,-0.101868,Jet


In [33]:
# Scatter of the members on components 1 (x) and 2 (y), coloured by gang and
# labelled with the member name; the title is also the exported file name.
title = "Visualizzazione su componenti principali 1 e 2"
fig = px.scatter(
    df_pca_noise.reset_index(),
    x=cod_componenti[0],
    y=cod_componenti[1],
    text="Name",
    color="Gang",
    title=title,
)
fig.update_traces(textposition="top center")
fig.write_html(f"{title}.html")
fig.show()

In [34]:
# Scatter of the members on the two discriminating components: component 2 is
# on the x axis and component 4 on the y axis. The title is also the exported
# file name.
title = "Visualizzazione su componenti discriminanti 4 e 2"
fig = px.scatter(
    df_pca_noise.reset_index(),
    x=cod_componenti[1],
    y=cod_componenti[3],
    text="Name",
    color="Gang",
    title=title,
)
fig.update_traces(textposition='top center')
fig.write_html(f"{title}.html")
fig.show()

In [35]:
# 3D scatter of the members on components 1 (x), 2 (y) and 3 (z), coloured by
# gang and labelled with the member name.
title = "Visualizzazione su componenti principali: 1,2,3"
fig = px.scatter_3d(
    df_pca_noise.reset_index(),
    x=cod_componenti[0],
    y=cod_componenti[1],
    z=cod_componenti[2],
    text="Name",
    range_x=(-2, 3),
    range_y=(-2, 2),
    range_z=(-2, 2),
    color="Gang",
    title=title,
)
fig.update_traces(textposition='top center')
# ':' is not allowed in Windows file names, so it is stripped from the export
# path; the figure title itself keeps the colon.
fig.write_html(title.replace(":", "") + '.html')
fig.show()

In [36]:
# 3D scatter of the members on components 2 (x), 4 (y) and 3 (z), coloured by
# gang and labelled with the member name.
title = "Visualizzazione su componenti discriminanti: 2,4,3"
fig = px.scatter_3d(
    df_pca_noise.reset_index(),
    x=cod_componenti[1],
    y=cod_componenti[3],
    z=cod_componenti[2],
    text="Name",
    range_x=(-1.8, 2.8),
    range_y=(-1.8, 1.8),
    range_z=(-1.8, 1.8),
    color="Gang",
    title=title,
)
fig.update_traces(textposition='top center')
# ':' is not allowed in Windows file names, so it is stripped from the export
# path; the figure title itself keeps the colon.
fig.write_html(title.replace(":", "") + '.html')
fig.show()

### Albero costo minimo

In [37]:
# Noise-free scores without the last column (the gang label).
df_pca_no_noise.iloc[:, :-1]

Unnamed: 0_level_0,(1)20s JH Married Burglar,(2)30s COL Married Pusher,(3)20s HS Pusher,(4)20s JH/COL Single Pusher,(5)30s JH/HS Divorced Pusher/Burglar,(6)40s JH/HS Married Pusher
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ART,-0.356095,-0.466142,-0.303449,0.802994,0.209677,1.215309
AL,0.974496,0.519467,-0.800347,-0.207149,0.365502,0.134822
SAM,-0.523838,-0.517813,-0.076053,0.409925,-1.054002,-0.816292
CLYDE,-0.576702,-0.762267,-0.993928,-0.065509,-0.379366,0.735927
MIKE,-0.766236,-0.18735,-1.17621,-0.004213,0.200552,-0.140875
JIM,1.206854,-0.911331,-0.200783,0.143974,0.507897,-0.372078
GREG,0.348185,0.110559,1.236049,0.172003,-0.34029,0.372912
JOHN,1.406661,-0.429724,-0.255243,0.127604,-0.290087,0.060284
DOUG,-1.168549,0.02644,-0.08473,-0.735095,0.108561,-0.301527
LANCE,1.406661,-0.429724,-0.255243,0.127604,-0.290087,0.060284


In [38]:
# Pairwise Euclidean distances between members in the noise-free PCA score
# space (every column except the last one, the gang label).
df_in_graph = df_pca_no_noise[df_pca_no_noise.columns[:-1]]
points = df_in_graph.values
# Broadcasting builds all (i, j) difference vectors at once; the norm over the
# last axis gives the full symmetric distance matrix without a nested Python loop.
adj_matrix = np.linalg.norm(points[:, None, :] - points[None, :, :], axis=-1)
# Rows and columns are both labelled with the member names.
df_adj = pd.DataFrame(data=adj_matrix, index=df_in_graph.index, columns=df_in_graph.index)

In [39]:
# NOTE(review): this cell is disabled. If re-enabled it would build a graph
# from the distance matrix df_adj, take its minimum spanning tree and draw it
# with networkx. Why it was commented out is not recorded here; confirm before
# restoring it.
#G = nx.from_pandas_adjacency(df_adj)
#MST = nx.minimum_spanning_tree(G)
#plt.figure(figsize=(18,10))
#nx.draw_networkx(MST)

In [40]:
# NOTE(review): this cell is disabled. If re-enabled it would load the MST graph
# into a pyvis Network and export it to 'MST_PCA.html'; it needs the MST
# variable from the (also disabled) previous cell. Confirm before restoring it.
#nt = Network('900px', '1600px')
#nt.from_nx(MST)
#nt.show_buttons(filter_=['physics'])
#nt.show('MST_PCA.html')