In [1]:
import pandas as pd
import numpy as np

from sklearn.cluster import KMeans, MiniBatchKMeans

import seaborn as sns

In [2]:
df = pd.read_csv('2001.csv', sep='|', decimal=',')
df_ori = df.copy()
df.head()

Unnamed: 0,id_cli_persona,periodo,desc_segmento_PREVISIONAL,per_flg_activo_de_4m,txe_cant,ctc_cant,prods_cant,hb_flag,ope_tipo_operacion_SUELDO,pza_flag,anses_no_previsional,ctas_saldo_ars,ctas_cred_total_ars,inv_capital_ars,txe_monto_mov_ars,ctc_importe_origen_ars,prestamos_personales_capintc,txe_ctc_monto_ars
0,4525643,20200131,1,1,1,0,2,0,0,0,0,0.0,14552.12,0.0,74.0,0.0,30109.08,74.0
1,2097754,20200131,0,1,0,1,2,0,0,1,0,45542.43,82291.81,0.0,0.0,50.0,0.0,50.0
2,1515325,20200131,1,1,0,0,1,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2855597,20200131,1,1,0,0,0,0,0,0,0,72.72,19082.11,0.0,0.0,0.0,0.0,0.0
4,202911,20200131,0,1,0,1,2,0,1,1,0,0.0,0.0,0.0,0.0,6500.0,0.0,6500.0


In [3]:
def z_transform(tabla, variables):
    for col in variables:
        tabla[col] = tabla[col].astype(float)  # sin esto falla el np.NaN siguiente, y de todas formas el campo va a pasar a float luego
        tabla[col].values[tabla[col] <= 0] = np.NaN  # sin valor, se quita de la distribución
        tabla[col] = tabla[col].apply(np.log)  # logaritmo para dispersar la variable hacia una distribución normal
        tabla[col] = (tabla[col] - tabla[col].mean()) / tabla[col].std()  # transformación z, queda promedio 0 y desvío 1
        tabla[col].values[tabla[col] > 3] = 3  # outliers forzados, límite superior
        tabla[col].values[tabla[col] < -3] = -3  # outliers forzados, límite inferior
        tabla[col].fillna(tabla[col].min(), inplace=True)  # va con el mínimo y no con -3 porque el límite inferior muchas veces no llega a 3 desvíos
        tabla[col] = (tabla[col] - tabla[col].min()) / (tabla[col].max() - tabla[col].min())  # reescalado a [0,1]
    return tabla

normalizar = [
        "ctas_saldo_ars",
        "ctas_cred_total_ars",
        "inv_capital_ars",
        "txe_ctc_monto_ars",
        "txe_cant",
        "ctc_cant",
        "prestamos_personales_capintc",
        ]


df = z_transform(df,normalizar)

prods_cant_map = {
        0:0.00,
        1:0.25,
        2:0.50,
        3:0.75,
        4:1.00,
        5:1.00,
        6:1.00,
    }
    
df["prods_cant"] = df["prods_cant"].map(prods_cant_map)

cols = df.columns
types = df.dtypes

for i, k in zip(cols, types):
    if k == 'int64':
        df[i] = df[i].astype('int32')
    elif k == 'float64':
        df[i] = df[i].astype('float32')
    else:
        pass

In [4]:
#km_param = KMeans(algorithm="auto", copy_x=True, init="k-means++", max_iter=100,
#				  n_clusters=8, n_init=10, precompute_distances="auto",
#			      random_state=10, tol=0.0001, verbose=0)

km_param = MiniBatchKMeans(init="k-means++",
						   max_iter=1000,
						   n_clusters=8,
						   n_init=10,
						   random_state=10,
						   tol=0.0001,
						   verbose=0,
						   batch_size=400)

variables_cluster = [
        "prods_cant",
        "hb_flag",
        "ope_tipo_operacion_SUELDO",
        "desc_segmento_PREVISIONAL",
        "pza_flag",       
        "ctas_saldo_ars",
        "ctas_cred_total_ars",
        "inv_capital_ars",
        "txe_cant",
        "ctc_cant",
        "txe_ctc_monto_ars",
        "prestamos_personales_capintc",
]

In [5]:
km = km_param.fit(df[variables_cluster])

In [6]:
clusters = km.predict(df[variables_cluster])

In [7]:
df["cluster"] = clusters
df_ori["cluster"] = clusters

check = 100*(df.groupby(['cluster']).agg(['count'])/len(df))
check['desc_segmento_PREVISIONAL']

Unnamed: 0_level_0,count
cluster,Unnamed: 1_level_1
0,8.861903
1,31.809394
2,7.549095
3,13.060815
4,15.033175
5,9.940162
6,7.021816
7,6.72364


In [8]:
clusters_dic = {
                5:1,
                4:2,
                1:3,
                3:4,
                2:5,
                6:6,
                7:7,
                8:8
                }

In [9]:
df["cluster"] = df["cluster"].map(clusters_dic)
df_ori["cluster"] = df_ori["cluster"].map(clusters_dic)

In [10]:
df.reset_index(inplace=True, drop=True)

In [11]:
len(df)

936695

In [12]:
check = 100*(df.groupby(['cluster']).agg(['count'])/len(df))

In [13]:
check['desc_segmento_PREVISIONAL']

Unnamed: 0_level_0,count
cluster,Unnamed: 1_level_1
1.0,9.940162
2.0,15.033175
3.0,31.809394
4.0,13.060815
5.0,7.549095
6.0,7.021816
7.0,6.72364


In [14]:
df_ori_desc = df_ori.groupby('cluster')[variables_cluster].mean()
df_ori_desc['tasa'] = df_ori.groupby('cluster').size() / df_ori.shape[0]
df_ori_desc_todos = df_ori[variables_cluster].mean().to_frame().transpose()
df_ori_desc_todos.index.name = 'cluster'
df_ori_desc_todos['tasa'] = len(df_ori) / len(clusters_dic) / len(df_ori) 
df_ori_desc_todos = df_ori_desc_todos.append(df_ori_desc)

In [15]:
df_ori_desc_todos.style.background_gradient(cmap = sns.light_palette("green", as_cmap = True)).format("{:,.2f}")

Unnamed: 0_level_0,prods_cant,hb_flag,ope_tipo_operacion_SUELDO,desc_segmento_PREVISIONAL,pza_flag,ctas_saldo_ars,ctas_cred_total_ars,inv_capital_ars,txe_cant,ctc_cant,txe_ctc_monto_ars,prestamos_personales_capintc,tasa
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0.0,1.33,0.31,0.19,0.7,0.3,25427.14,43681.91,17117.93,2.52,2.4,6273.53,15367.09,0.12
1.0,1.54,1.0,0.0,1.0,0.18,49373.67,59468.97,37054.56,5.27,2.63,8698.68,10234.03,0.1
2.0,1.99,0.02,0.0,1.0,1.0,8754.57,26283.0,8497.51,0.96,1.15,1882.25,25572.24,0.15
3.0,0.43,0.0,0.0,1.0,0.0,5861.4,24899.79,7009.44,0.02,0.04,25.46,13064.54,0.32
4.0,1.6,0.0,0.0,0.99,0.0,20128.37,31363.11,11798.02,3.77,2.83,6456.66,17248.26,0.13
5.0,1.78,0.98,1.0,0.01,0.0,60702.97,82876.89,28684.38,7.64,7.45,20684.5,13930.62,0.08
6.0,2.64,0.96,0.91,0.01,1.0,45860.79,77431.57,23489.84,6.91,6.28,16798.53,21865.0,0.07
7.0,1.47,1.0,0.0,0.0,0.22,90665.72,127880.24,55829.99,2.91,6.68,18457.27,12668.17,0.07
