In [21]:
import pandas as pd
import plotly.express as px
from sklearn.preprocessing import StandardScaler
# Load the raw table and draw one box plot per column, skipping the first column.
df = pd.read_csv('../data/europe.csv')

plotted_columns = df.columns[1:]
fig = px.box(data_frame=df, y=plotted_columns, title='Boxplots for Variables')
fig.show()

In [22]:
# Standardize every column after the first one, then redraw the box plots on
# the scaled values so the variables share a comparable range.
numeric_df = df.iloc[:, 1:]

scaler = StandardScaler()
standardized_data = scaler.fit(numeric_df).transform(numeric_df)

# Wrap the scaled array back into a frame that keeps the original column names.
standardized_df = pd.DataFrame(data=standardized_data, columns=numeric_df.columns)

fig = px.box(
    data_frame=standardized_df,
    y=standardized_df.columns,
    title='Boxplots for Standardized Variables',
)
fig.show()

In [23]:
import pycountry as pyc
from sklearn.decomposition import PCA
import numpy as np

# Exact-name table built from pycountry: country name -> alpha-2 code.
countries_codes = {country.name: country.alpha_2 for country in pyc.countries}


def _flag_code(country_name):
    """Return the alpha-2 code for ``country_name``, or ``None`` if unresolved.

    The exact-name table is tried first. Dataset spellings do not always match
    pycountry's canonical ``name`` (e.g. "Czech Republic" vs "Czechia"), so on
    a miss we fall back to ``pyc.countries.lookup``, which per the pycountry
    docs matches case-insensitively across several fields.
    """
    code = countries_codes.get(country_name)
    if code is not None:
        return code
    try:
        return pyc.countries.lookup(country_name).alpha_2
    except LookupError:
        return None


dataset = pd.read_csv('../data/europe.csv')
columns = dataset.columns  # not used in this cell; kept in case later cells rely on it
features = ['Area', 'GDP', 'Inflation', 'Life.expect', 'Military', 'Pop.growth', 'Unemployment']

x_scaled = StandardScaler().fit_transform(dataset[features])

pca = PCA(n_components=2)
pca_features = pca.fit_transform(x_scaled)  # countries expressed in the principal-component basis

# pca.components_ holds the eigenvectors (one row per component).

loadings = pca.components_.T * np.sqrt(pca.explained_variance_)  # loadings matrix
total_var = pca.explained_variance_ratio_.sum() * 100

fig = px.scatter(pca_features, x=0, y=1, hover_name=dataset['Country'],
                 title=f'PCA BiPlot - Total Explained Variance {total_var:.2f}%')

# Markers are made fully transparent: each country is drawn as a flag image
# instead, while the invisible markers still provide the hover labels.
fig.update_traces(marker_color="rgba(0,0,0,0)")
fig.update_layout(xaxis_title='PCA1', yaxis_title='PCA2', height=700, width=1100)

# Pair names and scores positionally so the loop does not depend on the
# frame's index labels being 0..n-1.
for country_name, (score_x, score_y) in zip(dataset['Country'], pca_features):
    country_iso = _flag_code(country_name)
    if country_iso is None:
        # No flag can be resolved; since the marker is transparent, label the
        # point with its name so the country does not vanish from the plot.
        fig.add_annotation(x=score_x, y=score_y, text=country_name, showarrow=False)
        continue
    fig.add_layout_image(
        dict(
            source=f"https://raw.githubusercontent.com/matahombres/CSS-Country-Flags-Rounded/master/flags/{country_iso}.png",
            xref="x",
            yref="y",
            xanchor="center",
            yanchor="middle",
            x=score_x,
            y=score_y,
            sizex=0.3,
            sizey=0.3,
            sizing="contain",
            opacity=1,
            layer="above"
        )
    )

# One colour per entry in `features`; the two lists must stay the same length.
colors = ['red', 'blue', 'darkgreen', 'darkorange', 'green', 'brown', 'purple']
for i, feature in enumerate(features):
    # Arrow from the origin to the feature's loading, in data coordinates.
    fig.add_annotation(
        ax=0, ay=0,
        axref="x", ayref="y",
        x=loadings[i, 0],
        y=loadings[i, 1],
        showarrow=True,
        arrowsize=1,
        arrowhead=1,
        xanchor="right",
        yanchor="top",
        arrowcolor=colors[i]
    )
    # Feature name placed just above the arrow tip.
    fig.add_annotation(
        x=loadings[i, 0],
        y=loadings[i, 1],
        ax=0, ay=0,
        xanchor="center",
        yanchor="bottom",
        text=feature,
        yshift=5,
        font={"size": 15, "color": colors[i]}
    )
fig.show()

In [24]:
# Project the standardized indicators onto a single principal component and
# chart the resulting score of every country.
features = ['Area', 'GDP', 'Inflation', 'Life.expect', 'Military', 'Pop.growth', 'Unemployment']

x_scaled = StandardScaler().fit_transform(dataset[features])

pca = PCA(n_components=1)
pca_features = pca.fit_transform(x_scaled)  # countries expressed in the principal-component basis

# NOTE(review): this rebinds `df`, which an earlier cell uses for the raw CSV
# frame; re-running that earlier cell after this one will see the PC1 frame.
df = pd.DataFrame(data=pca_features[:, 0], columns=['PC1'], index=dataset['Country'])

fig = (
    px.bar(data_frame=df, text_auto='.2f')
    .update_layout(yaxis_title='PC1', title='PC1 per country', showlegend=False)
    .update_traces(textfont_size=12, textangle=0, textposition="outside", cliponaxis=False)
)
fig.show()

## Composición de PC1

In [26]:
# Print PC1 as a linear combination of the input features, using the first
# row of the fitted PCA's component matrix as the coefficients.
pc1_weights = pca.components_[0]
pieces = [
    f"{'+' if position else ''} ({pc1_weights[position]: .4f} * {name}) "
    for position, name in enumerate(features)
]
print("PC1 =" + "".join(pieces))

PC1 = ( 0.1249 * Area) + (-0.5005 * GDP) + ( 0.4065 * Inflation) + (-0.4829 * Life.expect) + ( 0.1881 * Military) + (-0.4757 * Pop.growth) + ( 0.2717 * Unemployment) 


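Vemos que un valor bajo (más negativo) de **PC1** parecería estar relacionado con una mejor calidad de vida. Las variables que más afectan a este índice son (entre paréntesis, la carga de cada una en PC1 según la fórmula anterior):

**Positivamente sobre la calidad de vida (carga negativa en PC1):**
- PBI (-0.50)
- Expectativa de Vida (-0.48)
- Crecimiento Poblacional (-0.48)

**Negativamente sobre la calidad de vida (carga positiva en PC1):**
- Inflación (0.41)
- Desempleo (0.27)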