In [60]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
!pip install seaborn
import seaborn as sns
import joblib

[0m

In [61]:
# Load the dataset whose columns will be used for the clustering.
df = pd.read_csv(filepath_or_buffer='datos_gambling.csv')

In [62]:
# Show the column labels of the loaded frame.
df.columns

Index(['Unnamed: 0', 'player_win', 'player_loss', 'push', 'total_hands',
       'win_pct', 'win_push', 'win_push_pct', 'doubles_won', 'doubles_lost',
       'doubles_won_pct', 'player_bj', 'dealer_bj', 'dealer_high_card',
       'dealer_low_card', 'dealer_bust', 'dealer_draw', 'dealer_stand',
       'dealer_bust_pct', 'dealer_draw_pct', 'dealer_stand_pct',
       'dealer_avg_hand', 'num_of_shuffles', 'shuffle_method'],
      dtype='object')

In [63]:
# Discard the columns that are not used as clustering input: the 'Unnamed: 0' column,
# the *_pct columns, the dealer high/low card columns and 'shuffle_method'.
df = df.drop(
    columns=[
        "Unnamed: 0",
        "win_pct",
        "win_push_pct",
        "doubles_won_pct",
        "dealer_high_card",
        "dealer_low_card",
        "dealer_bust_pct",
        "shuffle_method",
        "dealer_draw_pct",
        "dealer_stand_pct",
    ]
)

In [64]:
# Display the frame after the column drop.
df

Unnamed: 0,player_win,player_loss,push,total_hands,win_push,doubles_won,doubles_lost,player_bj,dealer_bj,dealer_bust,dealer_draw,dealer_stand,dealer_avg_hand,num_of_shuffles
0,20,23,3,46,23,1,1,1,2,12,16,18,17.852,1
1,14,31,1,46,15,2,7,2,1,7,21,18,18.258,2
2,23,22,3,48,26,4,2,2,2,14,18,16,18.267,3
3,21,23,4,48,25,3,1,2,3,13,14,21,18.168,4
4,18,28,3,49,21,3,1,0,4,9,19,21,18.199,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2875,20,24,3,47,23,2,1,2,0,15,13,19,18.096,92
2876,26,20,2,48,28,3,2,5,3,12,13,23,18.078,93
2877,21,22,3,46,24,3,4,3,1,14,18,14,18.076,94
2878,25,19,3,47,28,4,1,1,0,14,14,19,18.071,95


In [65]:
# Select the columns used as clustering features.
# The two lists below are a template for a different schema (none of these columns exist in
# datos_gambling.csv); they are kept only for reference.
#numeric_features = ['edad', 'frecuencia_juego', 'duracion_sesiones', 'monto_apostado', 'cambios_monto_apostado', 'ganancias_perdidas', 'historial_juego']
#categorical_features = ['genero', 'nivel_educativo', 'ingresos', 'estado_civil', 'decisiones_juego', 'contexto_social', 'motivaciones_jugar']

# Every remaining column is a feature, except 'cluster': that column is added to df further
# down, so excluding it keeps the labels out of the features when this cell is re-run after
# clustering. errors='ignore' makes the call a no-op on a fresh run, when 'cluster' is absent.
numeric_features = df.columns.drop('cluster', errors='ignore')

In [66]:
# Show the column labels selected as clustering features.
numeric_features

Index(['player_win', 'player_loss', 'push', 'total_hands', 'win_push',
       'doubles_won', 'doubles_lost', 'player_bj', 'dealer_bj', 'dealer_bust',
       'dealer_draw', 'dealer_stand', 'dealer_avg_hand', 'num_of_shuffles'],
      dtype='object')

In [67]:
# Define how the features are transformed: every selected column goes through StandardScaler.
# A categorical step, ('cat', OneHotEncoder(), categorical_features), is left disabled.
scaling_step = ('num', StandardScaler(), numeric_features)
preprocessor = ColumnTransformer(transformers=[scaling_step])

In [68]:
# Fit the preprocessor on the frame, then produce the transformed feature matrix.
preprocessor.fit(df)
X = preprocessor.transform(df)

In [69]:
# Cluster the transformed rows into 2 groups with KMeans (adjust n_clusters as needed).
# n_init is set explicitly: its default changed from 10 to 'auto' in scikit-learn 1.4, so
# leaving it implicit makes the resulting labels depend on the installed version (and raises
# a FutureWarning on 1.2/1.3). random_state keeps the initialisation reproducible.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
clusters = kmeans.fit_predict(X)

In [70]:
# Add a 'cluster' column holding the KMeans label of each row.
# NOTE(review): the original comment read the two labels as "addicted / not addicted";
# nothing in this notebook establishes which label id means which — confirm before relying on it.
df['cluster'] = clusters

In [71]:
# Display the frame again, now with the 'cluster' column.
df

Unnamed: 0,player_win,player_loss,push,total_hands,win_push,doubles_won,doubles_lost,player_bj,dealer_bj,dealer_bust,dealer_draw,dealer_stand,dealer_avg_hand,num_of_shuffles,cluster
0,20,23,3,46,23,1,1,1,2,12,16,18,17.852,1,1
1,14,31,1,46,15,2,7,2,1,7,21,18,18.258,2,1
2,23,22,3,48,26,4,2,2,2,14,18,16,18.267,3,0
3,21,23,4,48,25,3,1,2,3,13,14,21,18.168,4,0
4,18,28,3,49,21,3,1,0,4,9,19,21,18.199,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2875,20,24,3,47,23,2,1,2,0,15,13,19,18.096,92,0
2876,26,20,2,48,28,3,2,5,3,12,13,23,18.078,93,0
2877,21,22,3,46,24,3,4,3,1,14,18,14,18.076,94,0
2878,25,19,3,47,28,4,1,1,0,14,14,19,18.071,95,0


In [72]:
# Create and train a decision tree that predicts the cluster label from the transformed
# features, so its feature importances show how each feature relates to the clusters.
# random_state is fixed so the fitted tree, and therefore the importances, are reproducible
# between runs (the tree permutes the features at each split, even with splitter='best').
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X, clusters)

In [56]:
# Pull the importances out of the fitted tree and label them with the feature names,
# ordered from most to least important.
importances = clf.feature_importances_
feature_importances = pd.DataFrame(
    {'importance': importances},
    index=numeric_features,
).sort_values(by='importance', ascending=False)

In [55]:
# Print the features ranked by their importance for predicting the cluster label.
print(feature_importances)

                 importance
player_loss        0.630523
dealer_bust        0.168989
player_win         0.076332
doubles_won        0.020843
dealer_avg_hand    0.016341
dealer_draw        0.016260
num_of_shuffles    0.015053
dealer_bj          0.014641
dealer_stand       0.013397
doubles_lost       0.007207
player_bj          0.005884
push               0.005208
win_push           0.004884
total_hands        0.004438


In [48]:
# Analyse the clusters: mean of every column within each cluster label.
rows_by_cluster = df.groupby(by='cluster')
cluster_summary = rows_by_cluster.mean()
print(cluster_summary)

         player_win  player_loss      push  total_hands   win_push  \
cluster                                                              
0         22.835320    20.587183  3.222802    46.645306  26.058122   
1         18.020806    25.983745  3.399220    47.403771  21.420026   

         doubles_won  doubles_lost  player_bj  dealer_bj  dealer_bust  \
cluster                                                                 
0           3.612519      1.861401   2.283905   1.860656    14.371088   
1           2.561769      2.463589   2.032510   2.435631    10.505202   

         dealer_draw  dealer_stand  dealer_avg_hand  num_of_shuffles  
cluster                                                               
0          14.961997     17.312221        18.042001        49.005961  
1          17.194408     19.704161        18.058075        48.058518  


In [59]:
# Export the fitted model.
# The previous call pickled `clusters` (the label array of the training rows), which cannot
# assign a cluster to new data. Persist the fitted preprocessing and KMeans steps together
# instead, so the loaded object can be used directly as `model.predict(new_df)`.
# NOTE: code that loaded the old file as a plain label array must now read
# `model.named_steps['kmeans'].labels_`.
model = Pipeline(steps=[('preprocessor', preprocessor), ('kmeans', kmeans)])
joblib.dump(model, 'modelo_clustering.pkl')

['modelo_clustering.pkl']