# World Happiness High-Dimensional Data Visualization

### Importing libraries

In [1]:
import pandas as pd 
from sklearn.cluster import KMeans
from sklearn import preprocessing
from sklearn.decomposition import PCA

import plotly.express as px
import matplotlib.pyplot as plt

### Load Data

In [3]:
# Read the happiness indicators from the CSV next to the notebook and
# preview the first five rows.
data = pd.read_csv(filepath_or_buffer="./world_happiness.csv")
data.head(n=5)

Unnamed: 0,Country or region,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption
0,Finland,1.34,1.587,0.986,0.596,0.153,0.393
1,Denmark,1.383,1.573,0.996,0.592,0.252,0.41
2,Norway,1.488,1.582,1.028,0.603,0.271,0.341
3,Iceland,1.38,1.624,1.026,0.591,0.354,0.118
4,Netherlands,1.396,1.522,0.999,0.557,0.322,0.298


In [6]:
# Show the dtype of every column of `data`.
data.dtypes

Country or region                object
GDP per capita                  float64
Social support                  float64
Healthy life expectancy         float64
Freedom to make life choices    float64
Generosity                      float64
Perceptions of corruption       float64
dtype: object

In [5]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
data.describe()

Unnamed: 0,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption
count,156.0,156.0,156.0,156.0,156.0,156.0
mean,0.905147,1.208814,0.725244,0.392571,0.184846,0.110603
std,0.398389,0.299191,0.242124,0.143289,0.095254,0.094538
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.60275,1.05575,0.54775,0.308,0.10875,0.047
50%,0.96,1.2715,0.789,0.417,0.1775,0.0855
75%,1.2325,1.4525,0.88175,0.50725,0.24825,0.14125
max,1.684,1.624,1.141,0.631,0.566,0.453


### Data Normalization

In [7]:
# Scaler that rescales each feature independently to the [0, 1] range.
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))

In [8]:
# Min-max scale the indicator columns in place.
#
# The columns are selected by dtype rather than by position: every numeric
# column is scaled and every non-numeric column (the country name and, once the
# clustering cell has run, the string "class" label) is left alone. This keeps
# the cell safe to re-run at any point and independent of column order.
feature_cols = data.select_dtypes(include="number").columns

x = data[feature_cols].values
x_scaled = min_max_scaler.fit_transform(x)

# Write the scaled values back to exactly the columns they came from.
data[feature_cols] = x_scaled

In [10]:
# Summary statistics again, to check the result of the min-max scaling:
# every numeric column should now have min 0 and max 1.
data.describe()

Unnamed: 0,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption
count,156.0,156.0,156.0,156.0,156.0,156.0
mean,0.537498,0.744344,0.635621,0.62214,0.326583,0.244156
std,0.236573,0.184231,0.212203,0.227083,0.168294,0.208693
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.357928,0.650092,0.480061,0.488114,0.192138,0.103753
50%,0.570071,0.782943,0.691499,0.660856,0.313604,0.188742
75%,0.731888,0.894397,0.772787,0.803883,0.438604,0.31181
max,1.0,1.0,1.0,1.0,1.0,1.0


### K-Means Clustering

In [11]:
# Cluster on the scaled indicator columns only. "class" is excluded as well so
# that re-running this cell does not feed the previous labels back into k-means;
# errors="ignore" keeps the first run working, when "class" does not exist yet.
clustering_data = data.drop(columns=["Country or region", "class"], errors="ignore")

# k-means starts from randomly chosen centroids, so without a fixed seed the
# cluster ids (and possibly the partition itself) change from run to run.
# n_init is set explicitly because its default differs between scikit-learn
# versions.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42).fit(clustering_data)

# Store the labels as strings so plotly express treats them as discrete
# categories when they are used for colouring.
data["class"] = kmeans.labels_.astype(str)

In [17]:
# Display the frame with the new "class" column (pandas abbreviates the middle rows).
data

Unnamed: 0,Country or region,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption,class
0,Finland,0.795724,0.977217,0.864154,0.944532,0.270318,0.867550,0
1,Denmark,0.821259,0.968596,0.872918,0.938193,0.445230,0.905077,0
2,Norway,0.883610,0.974138,0.900964,0.955626,0.478799,0.752759,0
3,Iceland,0.819477,1.000000,0.899211,0.936609,0.625442,0.260486,0
4,Netherlands,0.828979,0.937192,0.875548,0.882726,0.568905,0.657837,0
...,...,...,...,...,...,...,...,...
151,Rwanda,0.213183,0.437808,0.538124,0.879556,0.383392,0.907285,1
152,Tanzania,0.282660,0.544951,0.437336,0.660856,0.487633,0.324503,1
153,Afghanistan,0.207838,0.318350,0.316389,0.000000,0.279152,0.055188,1
154,Central African Republic,0.015439,0.000000,0.092025,0.356577,0.415194,0.077263,1


In [19]:
# Number of rows assigned to each cluster label.
data["class"].value_counts()

2    88
1    46
0    22
Name: class, dtype: int64

### Scatter plot matrix: 


#### A scatter plot matrix can be used for:
- correlation analysis
- cluster analysis
- outlier detection

In [25]:
# Every column except the country name and the cluster label becomes one
# dimension of the scatter plot matrix.
dims = data.columns.drop(["Country or region", "class"])

fig = px.scatter_matrix(
    data,
    dimensions=dims,
    color=data["class"],
    hover_name=data["Country or region"],
)
# Small markers; hide the diagonal and the upper half of the matrix.
fig.update_traces(marker={"size": 4}, diagonal_visible=False, showupperhalf=False)
fig.update_layout(width=800, height=800, title="Happiness Index", font_size=7)
fig.show();

### Parallel coordinate plot

In [27]:
# Parallel-coordinates view of `data`. The cluster label is cast back to int
# because the line colour is mapped through a continuous colour scale.
fig = px.parallel_coordinates(
    data_frame=data,
    color=data["class"].astype(int),
    color_continuous_scale=px.colors.diverging.Tealrose,
)
fig.show()

In [28]:
# Data reduction options:
#     - reduce the data size  -> sampling
#     - reduce the dimensions -> projection (e.g. PCA, used below, or UMAP)

### PCA  

In [29]:
# Project the indicator columns (everything except the country name and the
# cluster label) onto their first two principal components.
pca_features = data.drop(columns=["Country or region", "class"])
pca = PCA(n_components=2)
projected_data = pca.fit_transform(pca_features)

In [30]:
# Display the projected coordinates: one row per input row, one column per component.
projected_data

array([[-6.65255851e-01,  3.02545454e-01],
       [-6.96071330e-01,  3.96852232e-01],
       [-7.13376302e-01,  3.08581134e-01],
       [-5.44468495e-01,  1.07410537e-01],
       [-6.01075772e-01,  2.92846873e-01],
       [-6.78277674e-01,  2.88293772e-01],
       [-6.47714129e-01,  3.56688236e-01],
       [-6.60355969e-01,  4.31079242e-01],
       [-6.24954051e-01,  2.94523322e-01],
       [-5.22898098e-01,  1.22733977e-01],
       [-6.11529195e-01,  2.84767432e-01],
       [-3.00378637e-01, -2.89712189e-02],
       [-2.96391653e-01, -1.51512136e-01],
       [-6.50527875e-01,  1.44559161e-01],
       [-5.02589299e-01,  2.15535122e-01],
       [-6.23304627e-01,  2.31786098e-01],
       [-5.05606339e-01,  1.65805490e-01],
       [-4.55903801e-01, -9.41205856e-03],
       [-3.65637527e-01, -8.66875203e-03],
       [-2.64868020e-01, -3.04015632e-01],
       [-4.56283806e-01,  1.73407444e-01],
       [-4.83880354e-01,  1.78659368e-01],
       [-1.39424390e-01, -1.83077374e-01],
       [-4.

In [33]:
# Visualize PCA projected data
#
# The coordinates are passed by keyword because the first positional parameter
# of px.scatter is `data_frame`, not `x`. Passed positionally, the first
# component is consumed as the data frame and the second component ends up
# plotted against the row index instead of against the first component.
fig = px.scatter(
    x=projected_data[:, 0],
    y=projected_data[:, 1],
    color=data["class"],
    labels={"x": "d1", "y": "d2"},
    hover_name=data["Country or region"],
)
fig.show()