In [None]:
import pandas as pd
import numpy as np

from sklearn.cluster import KMeans, MiniBatchKMeans

import seaborn as sns

In [None]:
# Read the pipe-separated extract (comma as decimal mark). df_ori stays untouched
# as the raw reference; df is the working copy that later cells transform.
df_ori = pd.read_csv('2001.csv', sep='|', decimal=',')
df = df_ori.copy()
df.head()

In [None]:
### Normalization and data type conversion ###


def z_transform(tabla, variables):
    """Rescale skewed positive columns of ``tabla`` to the [0, 1] range, in place.

    Each column named in ``variables`` goes through these steps:

    1. cast to float; values <= 0 are treated as missing
    2. natural logarithm
    3. z-score (sample standard deviation), clipped to [-3, 3]
    4. missing values take the column minimum (so they end up at 0)
    5. min-max scaling to [0, 1]

    A column with no spread after step 4 (constant, or with no positive
    values at all) is set to 0.0 instead of the NaN that 0/0 would produce.

    Args:
        tabla: DataFrame holding the columns; it is modified in place.
        variables: iterable of column names to transform.

    Returns:
        The same ``tabla`` object, with the listed columns replaced.

    Raises:
        KeyError: if a name in ``variables`` is not a column of ``tabla``.
    """
    for col in variables:
        # Work on a standalone Series and assign it back once. Writing through
        # ``tabla[col].values[...]`` or ``tabla[col].fillna(inplace=True)`` touches
        # a temporary object and is lost (or raises) under pandas Copy-on-Write.
        values = tabla[col].astype(float)

        # log is undefined for values <= 0: mask them before taking it.
        values = np.log(values.where(values > 0))

        # Standardise, then cap outliers at three standard deviations.
        values = ((values - values.mean()) / values.std()).clip(lower=-3, upper=3)

        # Missing / non-positive inputs are placed at the bottom of the scale.
        values = values.fillna(values.min())

        lowest = values.min()
        span = values.max() - lowest
        if span > 0:
            tabla[col] = (values - lowest) / span
        else:
            # span is 0 (constant column) or NaN (nothing valid): no information.
            tabla[col] = 0.0
    return tabla

# Columns handed to z_transform, which replaces each of them with a [0, 1] score.
normalizar = [
    "ctas_saldo_ars", "ctas_cred_total_ars", "inv_capital_ars",
    "txe_ctc_monto_ars", "txe_cant", "ctc_cant",
    "prestamos_personales_capintc",
]

df = z_transform(df, normalizar)

# Product count -> [0, 1] score: 0.25 per product, saturating at 1.00 from 4 products on.
# Only the counts 0..6 are keys; any other value turns into NaN when mapped.
# NOTE(review): confirm the data never holds counts above 6 or missing counts.
prods_cant_map = {n: min(n, 4) / 4 for n in range(7)}

df["prods_cant"] = df["prods_cant"].map(prods_cant_map)

# Column names and their dtypes, captured before any conversion.
cols = df.columns
types = df.dtypes

# Downcast 64-bit numeric columns to their 32-bit counterpart; every other dtype is left alone.
# NOTE(review): an int64 value outside the int32 range cannot be represented after
# this cast — confirm no identifier-like column is affected.
for col, dtype in zip(cols, types):
    target = 'int32' if dtype == 'int64' else 'float32' if dtype == 'float64' else None
    if target is not None:
        df[col] = df[col].astype(target)

In [None]:
### Model parameters ###

# Previous full-batch configuration, kept for reference only:
# km_param = KMeans(algorithm="auto", copy_x=True, init="k-means++", max_iter=100,
#                   n_clusters=8, n_init=10, precompute_distances="auto",
#                   random_state=10, tol=0.0001, verbose=0)

# Unfitted mini-batch k-means estimator: 8 clusters, batches of 400 rows,
# and a fixed random_state so repeated runs start from the same seed.
km_param = MiniBatchKMeans(
    n_clusters=8,
    init="k-means++",
    n_init=10,
    max_iter=1000,
    batch_size=400,
    tol=0.0001,
    random_state=10,
    verbose=0,
)

# Feature columns given to the clustering model, in this order.
variables_cluster = [
    "prods_cant", "hb_flag", "ope_tipo_operacion_SUELDO",
    "desc_segmento_PREVISIONAL", "pza_flag",
    "ctas_saldo_ars", "ctas_cred_total_ars", "inv_capital_ars",
    "txe_cant", "ctc_cant", "txe_ctc_monto_ars",
    "prestamos_personales_capintc",
]

In [None]:
# Fit the configured estimator on the feature columns; km holds what fit() returns.
km = km_param.fit(df.loc[:, variables_cluster])

In [None]:
# Cluster label of every row, predicted from the same feature columns used for fitting.
clusters = km.predict(df.loc[:, variables_cluster])

In [None]:
# Attach the predicted label to the transformed frame and to the raw copy.
for frame in (df, df_ori):
    frame["cluster"] = clusters

# Share of rows per cluster, in percent: non-null count of each column over the total row count.
check = df.groupby(['cluster']).agg(['count']) / len(df) * 100
check['desc_segmento_PREVISIONAL']

In [None]:
# Reorganizacion de ubicacion de Clusters
# Relabelling of the raw k-means labels (keys) into the presentation order (values).
# An 8-cluster model labels rows 0..7, so the keys must cover exactly 0..7: the
# former `8:8` entry could never match, and label 0 had no entry, which turned
# every row of that cluster into NaN when the mapping was applied.
# NOTE(review): label 0 takes the only free position (8) — confirm that this is
# the slot intended for that cluster.
clusters_dic = {
                5:1,
                4:2,
                1:3,
                3:4,
                2:5,
                6:6,
                7:7,
                0:8,
                }

In [None]:
# Relabel from the raw predictions instead of from the current "cluster" column:
# mapping the column onto itself applies the permutation again on every re-run
# of this cell, whereas mapping `clusters` always yields the same result.
cluster_relabel = pd.Series(clusters).map(clusters_dic).to_numpy()
df["cluster"] = cluster_relabel
df_ori["cluster"] = cluster_relabel

In [None]:
# Renumber the rows 0..n-1 in place, discarding the previous index.
df.reset_index(drop=True, inplace=True)

In [None]:
# Number of rows in the working frame.
df.shape[0]

In [None]:
# Share of rows per relabelled cluster, in percent (non-null count of each column / total rows).
check = df.groupby(['cluster']).agg(['count']) / len(df) * 100

In [None]:
# Display the per-cluster percentages held in `check` for the desc_segmento_PREVISIONAL column.
check['desc_segmento_PREVISIONAL']

In [None]:
# Per-cluster mean of every clustering feature, taken from the untransformed copy.
df_ori_desc = df_ori.groupby('cluster')[variables_cluster].mean()
# 'tasa': fraction of all rows that belongs to each cluster.
df_ori_desc['tasa'] = df_ori.groupby('cluster').size() / df_ori.shape[0]

# Reference row: overall feature means, with the share a cluster would have
# if the rows were spread evenly over all clusters.
df_ori_desc_todos = df_ori[variables_cluster].mean().to_frame().transpose()
df_ori_desc_todos.index.name = 'cluster'
df_ori_desc_todos['tasa'] = 1 / len(clusters_dic)

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the reference
# row on top of the per-cluster rows in the same way.
df_ori_desc_todos = pd.concat([df_ori_desc_todos, df_ori_desc])

In [None]:
# Render the summary as a heat-mapped table (light-to-green gradient), two decimals per cell.
(
    df_ori_desc_todos.style
    .background_gradient(cmap=sns.light_palette("green", as_cmap=True))
    .format("{:,.2f}")
)