In [1]:
import pandas as pd
import numpy as np

from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.decomposition import PCA

import plotly.express as px

In [2]:
# Feature columns handed to the clustering model, in the order they are passed.
variables_cluster = (
    # Product count; remapped onto [0, 1] through prods_cant_map further down.
    ["prods_cant"]
    # Columns used as they come from the file (no rescaling in this notebook).
    + [
        "hb_flag",
        "ope_tipo_operacion_SUELDO",
        "desc_segmento_PREVISIONAL",
        "pza_flag",
    ]
    # Columns that are also listed in `normalizar` and rescaled by z_transform.
    + [
        "ctas_saldo_ars",
        "ctas_cred_total_ars",
        "inv_capital_ars",
        "txe_cant",
        "ctc_cant",
        "txe_ctc_monto_ars",
        "prestamos_personales_capintc",
    ]
)

# Load the pipe-delimited extract; decimal="," makes pandas read comma decimals as floats.
df = pd.read_csv(
    "2001.csv",
    sep="|",
    decimal=",",
)

In [3]:
### Normalization and data type conversion ###


def z_transform(tabla, variables):
    """Log-transform, standardise, clip and min-max scale the given columns.

    Each column named in ``variables`` is processed independently:

    1. cast to float and treat non-positive values as missing (the logarithm
       is undefined for them);
    2. take the natural logarithm;
    3. standardise to z-scores with the mean and sample standard deviation of
       the valid values;
    4. clip the z-scores to [-3, 3];
    5. replace missing values with the column minimum;
    6. rescale linearly to [0, 1].

    A column that ends up with no spread (constant, a single valid value, or
    no valid value at all) cannot be min-max scaled; it is set to 0.0 instead
    of being left as NaN.

    Args:
        tabla: DataFrame holding the columns. It is modified in place.
        variables: Iterable of column names to transform.

    Returns:
        The same ``tabla`` object that was passed in.
    """
    for col in variables:
        # Work on a local Series and assign the finished column back once.
        # Writing through ``tabla[col].values`` or ``fillna(inplace=True)``
        # depends on pandas handing out views, which Copy-on-Write removes.
        valores = tabla[col].astype(float)

        # Non-positive entries become NaN before the logarithm is taken.
        valores = np.log(valores.where(valores > 0))

        # mean()/std() skip NaN, so only the valid values define the scale.
        valores = (valores - valores.mean()) / valores.std()
        valores = valores.clip(lower=-3, upper=3)

        # Missing values take the smallest observed score, i.e. 0 after scaling.
        valores = valores.fillna(valores.min())

        minimo = valores.min()
        rango = valores.max() - minimo
        if np.isfinite(rango) and rango > 0:
            tabla[col] = (valores - minimo) / rango
        else:
            # Zero or undefined range: avoid 0/0 and emit a constant column.
            tabla[col] = 0.0
    return tabla

# Columns that are passed through z_transform.
normalizar = [
    "ctas_saldo_ars",
    "ctas_cred_total_ars",
    "inv_capital_ars",
    "txe_ctc_monto_ars",
    "txe_cant",
    "ctc_cant",
    "prestamos_personales_capintc",
]


# Rescale the listed columns; z_transform returns the frame it was given.
df = z_transform(tabla=df, variables=normalizar)

# Score per product count (0 to 6): 0.25 per product, capped at 1.0 from four products up.
prods_cant_map = {cantidad: min(cantidad, 4) / 4 for cantidad in range(7)}
    
# Swap each raw product count for its score; counts absent from the map become NaN.
prods_cant_score = df["prods_cant"].map(prods_cant_map)
df["prods_cant"] = prods_cant_score

# Shrink 64-bit numeric columns to 32 bits to reduce the frame's memory use.
limites_int32 = np.iinfo(np.int32)

for nombre, tipo in df.dtypes.items():
    if tipo == "int64":
        # The int64 -> int32 cast is not range-checked, so out-of-range values
        # would be silently corrupted. Only narrow a column when every value
        # fits; otherwise it stays int64.
        if df[nombre].between(limites_int32.min, limites_int32.max).all():
            df[nombre] = df[nombre].astype("int32")
    elif tipo == "float64":
        df[nombre] = df[nombre].astype("float32")

In [4]:
### Model parameters ###

# Mini-batch k-means with 8 clusters, seeded through random_state.
# (A full-batch KMeans with the same n_clusters / random_state was previously
# configured here and had been left commented out.)
km_param = MiniBatchKMeans(
    n_clusters=8,
    init="k-means++",
    n_init=10,
    max_iter=1000,
    batch_size=400,
    tol=0.0001,
    random_state=10,
    verbose=0,
)

# The feature list `variables_cluster` is defined once, in the cell that loads
# the data, and is reused unchanged by the fit / predict cells below. Keeping a
# second identical copy here would let the two lists drift apart.

In [5]:
# Fit the configured model on the clustering features; km is the fitted estimator.
km = km_param.fit(df.loc[:, variables_cluster])

In [6]:
# Cluster id predicted by the fitted model for every row of the feature columns.
clusters = km.predict(df.loc[:, variables_cluster])

In [7]:
# Keep only the clustering features and attach each row's cluster id.
# .copy() makes the subset an independent frame, so adding a column to it does
# not raise pandas' SettingWithCopyWarning.
df = df[variables_cluster].copy()

df["cluster"] = clusters

In [None]:
# Relabelling table: raw cluster id from the model (key) -> label 1..8 (value).
clusters_dic = {
    6: 1,
    2: 2,
    1: 3,
    4: 4,
    5: 5,
    0: 6,
    7: 7,
    3: 8,
}


# Relabel from the raw model output rather than from df["cluster"], so this
# cell can be re-run safely. Mapping the already-relabelled column a second
# time would permute the labels and turn label 8 into NaN.
df["cluster"] = pd.Series(clusters, index=df.index).map(clusters_dic)

In [8]:
# Flat (2D) plot: project the clustering features onto two principal components.
# random_state pins the projection when scikit-learn uses its randomized solver.
componentes = PCA(n_components=2, random_state=10).fit_transform(df.drop(columns="cluster"))
df_pca = pd.DataFrame(data=componentes, columns=["PCA_1", "PCA_2"])

# Raw model labels, stored as strings so Plotly Express draws one discrete
# colour per cluster instead of a continuous colour scale. The clusters_dic
# relabelling is not applied to the plot.
df_pca["cluster"] = np.asarray(clusters).astype(str)

# Plot at most 10000 points: sample() raises when asked for more rows than the
# frame has, and the fixed seed makes the exported figure reproducible.
muestra = df_pca.sample(n=min(10000, len(df_pca)), random_state=10)

fig = px.scatter(
    muestra,
    x="PCA_1",
    y="PCA_2",
    color="cluster",
    category_orders={"cluster": sorted(muestra["cluster"].unique(), key=int)},
)
fig.write_html("ComparativaPlana2001.html")

del df_pca

In [9]:
# 3D plot: project the clustering features onto three principal components.
# random_state pins the projection when scikit-learn uses its randomized solver.
componentes = PCA(n_components=3, random_state=10).fit_transform(df.drop(columns="cluster"))
df_pca = pd.DataFrame(data=componentes, columns=["PCA_1", "PCA_2", "PCA_3"])

# Raw model labels, stored as strings so Plotly Express draws one discrete
# colour per cluster instead of a continuous colour scale. The clusters_dic
# relabelling is not applied to the plot.
df_pca["cluster"] = np.asarray(clusters).astype(str)

# Plot at most 10000 points: sample() raises when asked for more rows than the
# frame has, and the fixed seed makes the exported figure reproducible.
muestra = df_pca.sample(n=min(10000, len(df_pca)), random_state=10)

fig = px.scatter_3d(
    muestra,
    x="PCA_1",
    y="PCA_2",
    z="PCA_3",
    color="cluster",
    category_orders={"cluster": sorted(muestra["cluster"].unique(), key=int)},
)
fig.write_html("Comparativa3D2001.html")