In [60]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import pandas as pd
import numpy as np
import seaborn as sns

In [30]:
# Read the raw dataset from the CSV file located next to the notebook.
test = pd.read_csv(filepath_or_buffer="test.csv")

In [31]:
# Fill missing values with a cubic spline, restricted to the numeric columns.
# Handing object-dtype (text) columns to DataFrame.interpolate is deprecated and
# will raise in a future pandas version, so those columns are left untouched.
# The result is assigned back instead of using inplace=True, which keeps the
# cell idempotent when it is re-run.
# NOTE(review): the spline is fitted against the row index, so the filled values
# depend on row order -- confirm this is the intended imputation strategy.
columnas_numericas = test.select_dtypes(include="number").columns
test[columnas_numericas] = test[columnas_numericas].interpolate(
    method="spline", order=3
)


DataFrame.interpolate with object dtype is deprecated and will raise in a future version. Call obj.infer_objects(copy=False) before interpolating instead.



In [32]:
# Keep only the float64/int64 columns and discard the passenger identifier,
# which is a row key rather than a feature. `errors="ignore"` makes the drop a
# no-op when the identifier is not among the selected columns.
numeric_data = test.select_dtypes(include=["float64", "int64"]).drop(
    columns="PassengerId", errors="ignore"
)
# Plain list of the remaining feature names, in their original column order.
numeric_columns = numeric_data.columns.tolist()

In [33]:
# Pairwise correlation matrix of the numeric features, shown as an annotated heatmap.
corr_data = numeric_data.corr()

fig = go.Figure()

fig.add_trace(
    go.Heatmap(
        z = corr_data.values,
        x = numeric_columns,
        y = numeric_columns,
        # Print each coefficient inside its cell, rounded to two decimals.
        text = corr_data.values,
        texttemplate="%{text:.2f}",
        colorscale="RdBu_r",
        # Anchor the diverging colour scale to the full correlation range so the
        # neutral midpoint colour always means "no correlation"; otherwise the
        # scale is stretched to the observed min/max and the colours mislead.
        zmin = -1,
        zmax = 1,
        textfont = dict(
            size = 14
        )
    )
)

fig.update_layout(
    template = "plotly_dark",
    title_text = "Matriz de correlación para la base de datos"
)

fig.show()

In [34]:
# Scatter matrix of the numeric features.
# The subplot at (row i, col j) shows feature i on the x axis against feature j
# on the y axis; the diagonal shows the histogram of feature i instead.
n_variables = len(numeric_columns)
fig = make_subplots(rows = n_variables, cols = n_variables)

for i, value_x in enumerate(numeric_columns):
    for j, value_y in enumerate(numeric_columns):
        fila, columna = i + 1, j + 1  # plotly subplot indices are 1-based

        if i == j:
            trace = go.Histogram(
                x = test[value_x],
                marker = dict(
                    color = "steelblue",
                    line = dict(
                        color = "black",
                        width = 1
                    )
                )
            )
            # A histogram's y axis holds bin counts, not the variable itself.
            y_title = "Frecuencia"
        else:
            trace = go.Scatter(
                x = test[value_x],
                y = test[value_y],
                mode = "markers",
                marker = dict(
                    color = "steelblue"
                )
            )
            y_title = value_y

        fig.add_trace(trace, row = fila, col = columna)
        fig.update_xaxes(title_text = value_x, row = fila, col = columna)
        fig.update_yaxes(title_text = y_title, row = fila, col = columna)

fig.update_layout(
    template = "plotly_dark",
    showlegend = False,
    width = 1800,
    height = 1800
)

fig.show()

In [53]:
# Standardise the numeric features with scikit-learn's StandardScaler
# (fit on the data, then transform that same data) ahead of the PCA below.
numeric_scaled = StandardScaler().fit(numeric_data).transform(numeric_data)

In [64]:
# PCA keeping as many components as needed to explain 95% of the variance,
# followed by a plot of the variance ratio explained by each kept component.
pca_95 = PCA(n_components=0.95).fit(numeric_scaled)
varianza_exp = pca_95.explained_variance_ratio_

# 1-based component numbers, used both as x values and as tick positions.
componentes = list(range(1, len(varianza_exp) + 1))

fig = go.Figure()

fig.add_trace(
    go.Scatter(
        x = componentes,
        y = varianza_exp,
        mode = "lines+markers",
        marker = dict(
            color = "orange"
        )
    )
)

fig.update_layout(
    template = "plotly_dark",
    xaxis = dict(
        title_text = "Componentes",
        tickmode = "array",
        tickvals = componentes
    ),
    yaxis = dict(
        title_text = "Porcentaje de varianza",
        # The plotted values are fractions in [0, 1]; render the ticks as
        # percentages so the axis agrees with its "Porcentaje" title.
        tickformat = ".0%"
    ),
    title = "Porcentaje de varianza explicada por componente"
)

fig.show()

In [66]:
# Running total of the explained-variance ratios: the value at component k is
# the share of variance captured by the first k components together.
cumsum_var = np.cumsum(varianza_exp)

# 1-based component numbers, used both as x values and as tick positions.
componentes = list(range(1, len(varianza_exp) + 1))

fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x = componentes,
        y = cumsum_var,
        mode = "lines+markers",
        marker = dict(
            color = "deepskyblue"
        )
    )
)

fig.update_layout(
    template = "plotly_dark",
    xaxis = dict(
        title_text = "Componentes",
        tickmode = "array",
        tickvals = componentes
    ),
    yaxis_title = "Suma acumulada",
    title = "Suma acumulada para cada componente"
)

# Render explicitly, like the earlier plotting cells, instead of relying on the
# cell's last expression being echoed (which only works inside a notebook).
fig.show()

In [68]:
# PCA keeping as many components as needed to explain 90% of the variance.
pca_data = PCA(n_components=0.90).fit(numeric_scaled)

# The correlation circle uses PC1 and PC2. A variance threshold can be met by a
# single component, which would otherwise surface later as an opaque IndexError.
if pca_data.n_components_ < 2:
    raise ValueError(
        "Se necesitan al menos dos componentes principales para el círculo de correlaciones"
    )

lambdas = pca_data.explained_variance_   # variance along each kept component
Gammas = pca_data.components_.T          # one row per feature, one column per component

# Loadings of every feature on PC1 and PC2: eigenvector entries scaled by the
# square root of the matching eigenvalue.
R = Gammas[:, :2] * np.sqrt(lambdas[:2])

# Angles used to trace the unit circle in the correlation-circle plot.
theta = np.linspace(0, 2*np.pi, 400)

In [81]:
# Correlation circle: each feature is placed at its (PC1, PC2) loading and a
# unit circle is drawn for reference.
fig = go.Figure()

# Reference unit circle. The trace is line-only, so the colour is set on `line`
# (the original went through `marker`, which only works via plotly's fallback).
fig.add_trace(
    go.Scatter(
        x = np.cos(theta),
        y = np.sin(theta),
        mode = "lines",
        line = dict(
            color = "red"
        ),
        showlegend=False
    )
)

# One single-point trace per feature, so each gets its own colour and label.
for index, value in enumerate(numeric_columns):
    fig.add_trace(
        go.Scatter(
            x = [R[index, 0]],
            y = [R[index, 1]],
            mode = "markers+text",
            text = [value],
            textposition="top center",
            showlegend=False
        )
    )

fig.update_layout(
    template = "plotly_dark",
    yaxis = dict(
        # Lock the y scale to the x scale so the circle is not drawn as an ellipse.
        scaleanchor= "x",
        title_text = "PC2"
    ),
    xaxis = dict(
        title_text = "PC1"
    ),
    title = "Círculo de correlaciones (PC1 vs PC2)"
)

# Render explicitly, like the earlier plotting cells, instead of relying on the
# cell's last expression being echoed (which only works inside a notebook).
fig.show()