In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import keras
import tensorflow as tf

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input,InputLayer, Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras import applications, initializers, Model, optimizers, metrics
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
from tensorflow.keras.utils import to_categorical

from sklearn import preprocessing
from sklearn import tree

from keras.callbacks import LearningRateScheduler, ModelCheckpoint

from sklearn.cluster import AgglomerativeClustering, KMeans

from sklearn.metrics import silhouette_samples, silhouette_score

import plotly.express as px

In [2]:
data = pd.read_csv("datosnuevos.csv",encoding='latin-1',index_col=0,sep = ";")
data = data.dropna()
data = data.rename(columns={'P47,':'P47'})
data['P47'] = data['P47'].str.replace(",","")
data = data.drop(['Perfil','Periodo','Curso'], axis=1)

In [3]:
encode_norm = {"Totalmente en desacuerdo":-0.9,"Levemente en desacuerdo":-0.4,"Ni de deacuerdo ni en desacuerdo":-0.1,"Medianamente de acuerdo":0.4,"Totalmente de acuerdo":0.9}
encode = {"Totalmente en desacuerdo":0,"Levemente en desacuerdo":1,"Ni de deacuerdo ni en desacuerdo":2,"Medianamente de acuerdo":3,"Totalmente de acuerdo":4}

preguntas = []
for i in range(1,48):
    preguntas.append("P"+str(i))
    data["P"+str(i)] = data["P"+str(i)].map(encode_norm)
data["Genero"] = data["Genero"].map({"Femenino": 0, "Masculino":1})

In [4]:
ramas = {"ADMINISTRACION DE EMPRESAS": "CCSS", "AGRONOMIA":"C","TECNOLOGIA EN REGENCIA DE FARMACIA":"SALUD","ZOOTECNIA":"C","COMUNICACION SOCIAL":"AYH",
"LICENCIATURA EN MATEMATICAS":"C","TECNOLOGIA EN REGENCIA DE FARMACIA (RESOLUCION 08200)":"SALUD","LICENCIATURA EN INGLES COMO LENGUA EXTRANJERA":"AYH",
"TECNOLOGIA EN SISTEMAS AGROFORESTALES":"C","LICENCIATURA EN ETNOEDUCACION":"AYH","TECNOLOGIA AGROFORESTAL":"C","TECNOLOG?A EN DESAROLLO DE SOFTWARE":"ING",
"ADMINISTRACION EN SALUD":"SALUD","TECNOLOGIA EN PRODUCCION AGRICOLA":"C","LICENCIATURA EN FILOSOFIA":"AYH","CURSOS LIBRES":"OTROS",
"TECNOLOGIA EN SISTEMAS DE COMUNICACIONES INAL?MBRICAS":"ING","TECNOLOGIA EN PRODUCCION DE AUDIO":"ING","TECNOLOGIA EN SANEAMIENTO AMBIENTAL":"C",
"TECNOLOGIA EN AUDIO":"ING","LICENCIATURA EN LENGUAS EXTRANJERAS CON ?NFASIS EN INGL?S":"AYH","PSICOLOGIA":"AYH","TECNOLOGIA INDUSTRIAL":"ING",
"ADULTO MAYOR, ACTOR SOCIAL":"OTROS","ARTES VISUALES":"AYH","TECNOLOGIA DE TELECOMUNICACIONES":"ING","TECNOLOGIA EN RADIOLOGIA E IMAGENES DIAGNOSTICAS":"SALUD",
"Música":"AYH","LICENCIATURA EN ETNOEDUCACI?N (RESOLUCI?N 26750)":"AYH","CURRICULA Y PEDAGOGIA":"CCSS","FILOSOFIA":"AYH","QUIMICA":"C",
"TECNOLOGIA EN GESTION COMERCIAL Y DE NEGOCIOS":"CCSS","Bachillerato a distancia":"OTROS","TECNOLOGIA EN SISTEMAS":"ING","TECNOLOGIA EN GESTION AGROPECUARIA":"C"}
data["Programa"] = data["Programa"].replace(ramas)
data.loc[data['Programa'].str.contains('INGENIER'), 'Programa'] = 'ING'
data.loc[data['Programa'].str.contains('CICLO'), 'Programa'] = 'OTROS'
data.Programa = data.Programa.map({"CCSS":0,"C":1,"ING":2,"SALUD":3,"AYH":4,"OTROS":5})

In [5]:
preguntas_adiccion = ['P2','P7','P9','P11','P12','P13','P15',
'P17','P23','P25','P26','P27','P32','P34','P38','P40','P41','P42','P43',
'P44','P45','P46','P47']

preguntas_todas = preguntas
preguntas = list(set(preguntas) - set(preguntas_adiccion))
#norm = MinMaxScaler()
#data.loc[:,preguntas_todas] = norm.fit_transform(data.loc[:,preguntas_todas])

In [6]:
data.Edad = pd.qcut(data.Edad, q=4, precision = 0,labels=False)

In [7]:
X = data.loc[:,preguntas_todas]
y = data.loc[:,preguntas_todas]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size = 0.5, random_state=1)

In [8]:
def autoencoder():
    input_shape = (X_train.shape[1],)
    model = Sequential()
    #Input
    model.add(InputLayer(input_shape=input_shape))

    #Hidden
    model.add(Dense(32 , activation = "relu"))
    model.add(Dense(16 , activation = "tanh"))
    model.add(Dense(3 , activation = "tanh"))
    model.add(Dense(16 , activation = "tanh"))
    model.add(Dense(32 , activation = "relu"))
    #Output
    model.add(Dense(y_train.shape[1], activation = 'linear'))


    model.compile(loss="categorical_crossentropy",                
              optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),  
              metrics=["mean_squared_error"])
    return model

In [9]:
def train_encoder(model):
    callback_es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=1)
    annealer = LearningRateScheduler(lambda x: 1e-3 * 0.95 ** x)

    model.fit(X_train, y_train, validation_data=(X_val,y_val), callbacks = [annealer, callback_es], batch_size = 8, epochs=100, verbose = 0)

    encoder = Model(model.input, model.layers[-4].output)
    return encoder

In [10]:
def get_predictions(encoder, X_train, X_test):
    pred = encoder.predict(X_train)
    pred_test = encoder.predict(X_test)
    return pred, pred_test

In [11]:

def train_kmeans(pred, pred_test, n):
    kmeans = KMeans(n_clusters=n, random_state=0)
    kmeans.fit(pred)

    labels = kmeans.predict(pred_test)
    return labels, kmeans.labels_, kmeans.inertia_

In [12]:
def createDF_labels(pred_test, labels):
    pred_df = pd.DataFrame(pred_test, columns = ['x','y','z'])
    #pred_df = pd.DataFrame(pred_test, columns = ['x','y'])
    pred_df['label'] = labels
    return pred_df

In [13]:
X_test_v= X_test.merge(data.loc[:,['Genero','Edad','Programa']], left_index=True, right_index=True)

In [14]:
def descriptionCluster(groups, variable, data):
    columns = []
    for i in range(0,len(groups)):
        columns.append('C'+str(i))
    df = pd.DataFrame(columns=columns)

    i=0
    for clus in groups:
        c = (clus[variable].value_counts()*100/len(clus))
        df['C'+str(i)] = c
        i+=1
    df = df.sort_index()
    #perc = pd.DataFrame(data[variable].value_counts()*100/len(data))
    df = df.fillna(0)
    max_values = df.max().values
    max_values = max_values[max_values != 0]
    l = len(max_values)
    value = max_values.sum()/l
    return value, df

In [15]:
n = 50 # Iteraciones

inertias = []

for i in range(0,n):
    model = autoencoder()
    encoder = train_encoder(model)
    pred, pred_test = get_predictions(encoder, X_train, X_test)
    
    row_inertia = []
    for k in range(2,11):
        labels, labels2, inertia = train_kmeans(pred, pred_test, k)
        row_inertia.append(inertia)

    print(str(i+1)+"/"+str(n), end = "\r")
    inertias.append(row_inertia)

50/50

In [16]:
inertias = np.array(inertias)
mean_inertia = np.mean(inertias, axis = 0)
clusters = [2,3,4,5,6,7,8,9,10]
df_inertia = pd.DataFrame({'Nº Cluster':clusters, 'WCSS':mean_inertia})

In [17]:
fig = px.line(df_inertia, y = 'WCSS', x = 'Nº Cluster', width=800, height=800)
fig.update_layout(
    font=dict(
        size=22,
    )
)

In [18]:
n = 200 # Iteraciones

pred_df = None
pred_df_train = None
labels = None
df_test = None
labels2 = None

ant = 0
total_value = 0
total_score = 0
total_result = 0
for i in range(0,n):
    model = autoencoder()
    encoder = train_encoder(model)
    pred, pred_test = get_predictions(encoder, X_train, X_test)
    
    labels, labels2, _ = train_kmeans(pred, pred_test, 4)
    pred_df1 = createDF_labels(pred_test, labels)
    pred_df2 = createDF_labels(pred, labels2)
    
    X_test_v = X_test.merge(data.loc[:,['Genero','Edad','Programa']], left_index=True, right_index=True)
    X_test_v['clus'] = labels

    X_train_v = X_train.merge(data.loc[:,['Genero','Edad','Programa']], left_index=True, right_index=True)
    X_train_v['clus'] = labels2

    clus0 = X_train_v[X_train_v.clus == 0]
    clus1 = X_train_v[X_train_v.clus == 1]
    clus2 = X_train_v[X_train_v.clus == 2]
    clus3 = X_train_v[X_train_v.clus == 3]

    clus = [clus0, clus1, clus2, clus3]

    v, df = descriptionCluster(clus, 'Genero', X_test_v)
    value = v
    v, df = descriptionCluster(clus, 'Edad', X_test_v)
    value += v
    v, df = descriptionCluster(clus, 'Programa', X_test_v)
    value += v
    value = value/3

    score = silhouette_score(X_train, labels2, metric='euclidean')*100
    result = score/2 + value/2

    if result > ant:
        df_test = X_test_v
        ant = result
        encoder.save('best_clustering.h5')
        pred_df = pred_df1
        pred_df_train = pred_df2

    total_score += score 
    total_value += value
    total_result += result
    print(str(i+1)+"/"+str(n), end = "\r")

print("\nValue: ", total_value/(n))
print("\nScore: ", total_score/(n))
print("\nScore: ", total_result/(n))

200/200
Value:  40.36610210289453

Score:  7.912514302388771

Score:  24.139308202641654


In [40]:
total_score/n/100

0.0791251430238877

In [19]:

%matplotlib widget
fig = px.scatter_3d(pred_df, x = 'x', y = 'y', z = 'z', color = 'label', width=800, height=800)
#fig = px.scatter(pred_df, x = 'x',y='y', color = 'label')
fig.update_layout(font=dict(size= 14))
fig.update_traces(marker=dict(size=4,
                              line=dict(width=0.5,
                                        color='DarkSlateGrey')),
                  selector=dict(mode='markers'))

fig.show()


In [20]:
import plotly.express as px
%matplotlib widget
fig = px.scatter_3d(pred_df_train, x = 'x', y = 'y', z = 'z', color = 'label', width=800, height=800)
#fig = px.scatter(pred_df, x = 'x',y='y', color = 'label')
fig.update_layout(font=dict(size= 14))
fig.update_traces(marker=dict(size=4,
                              line=dict(width=0.5,
                                        color='DarkSlateGrey')),
                  selector=dict(mode='markers'))
fig.show()

In [21]:
score = silhouette_score(X_test, labels, metric='euclidean')
score

0.15456921414291913

In [22]:
score = silhouette_score(X_train, labels2, metric='euclidean')
score

0.14451536041122703

In [23]:
X_test_v = df_test
clus0 = X_test_v[X_test_v.clus == 0]
clus1 = X_test_v[X_test_v.clus == 1]
clus2 = X_test_v[X_test_v.clus == 2]
clus3 = X_test_v[X_test_v.clus == 3]

clus = [clus0, clus1, clus2, clus3]
#clus = [clus0,clus1]

In [24]:
v, df = descriptionCluster(clus, 'Genero', X_test_v)
value = v
df

Unnamed: 0,C0,C1,C2,C3
0,51.29683,37.5,31.746032,50.0
1,48.70317,62.5,68.253968,50.0


In [25]:
v, df = descriptionCluster(clus, 'Edad', X_test_v)
value += v
df

Unnamed: 0,C0,C1,C2,C3
0,25.072046,18.75,39.68254,28.571429
1,21.037464,31.25,12.698413,28.571429
2,28.530259,37.5,30.15873,35.714286
3,25.360231,12.5,17.460317,7.142857


In [26]:
X_test_v.Programa.value_counts()

0    122
2    111
1    102
3     59
4     42
5      4
Name: Programa, dtype: int64

In [27]:
v, df = descriptionCluster(clus, 'Programa', X_test_v)
value += v
df

Unnamed: 0,C0,C1,C2,C3
0,26.801153,31.25,26.984127,50.0
1,24.207493,6.25,22.222222,21.428571
2,24.495677,37.5,28.571429,14.285714
3,13.256484,18.75,12.698413,14.285714
4,10.086455,6.25,9.52381,0.0
5,1.152738,0.0,0.0,0.0


In [28]:
value/3 

43.02920535809585

In [29]:
def clus_description(clus):
    df = pd.DataFrame(columns=['C0','C1','C2','C3'])
    i = 0
    for c in clus:
        clus_per = c.loc[:,preguntas_todas].apply(pd.value_counts).sum(axis=1)
        s = clus_per*100/clus_per.sum()

        df['C'+str(i)] = s
        i +=1
    return df

In [30]:
clus_description(clus)

Unnamed: 0,C0,C1,C2,C3
-0.9,59.047152,2.792553,15.028707,11.398176
-0.4,16.224171,2.393617,15.433975,7.902736
-0.1,10.718008,3.856383,43.66768,27.203647
0.4,7.903612,26.196809,16.278284,29.483283
0.9,6.107057,64.760638,9.591354,24.012158


In [31]:
kmeans_no = KMeans(n_clusters=4, random_state=0)
kmeans_no.fit(X_train)
labels_no = kmeans_no.predict(X_test)

In [32]:
score = silhouette_score(X_train, kmeans_no.labels_, metric='euclidean')
score

0.12358990359805568

In [33]:
X_test_v['clus_no'] = labels_no
clus0 = X_test_v[X_test_v.clus_no == 0]
clus1 = X_test_v[X_test_v.clus_no == 1]
clus2 = X_test_v[X_test_v.clus_no == 2]
clus3 = X_test_v[X_test_v.clus_no == 3]


clus = [clus0, clus1, clus2, clus3]

In [34]:
v, df = descriptionCluster(clus, 'Genero', X_test_v)
value = v
df

Unnamed: 0,C0,C1,C2,C3
0,48.648649,54.594595,35.294118,36.363636
1,51.351351,45.405405,64.705882,63.636364


In [35]:
v, df = descriptionCluster(clus, 'Edad', X_test_v)
value += v
df

Unnamed: 0,C0,C1,C2,C3
0,20.945946,27.027027,37.647059,27.272727
1,23.648649,18.378378,16.470588,31.818182
2,33.783784,26.486486,27.058824,31.818182
3,21.621622,28.108108,18.823529,9.090909


In [36]:
v, df = descriptionCluster(clus, 'Programa', X_test_v)
value += v
df

Unnamed: 0,C0,C1,C2,C3
0,27.027027,27.027027,27.058824,40.909091
1,25.0,23.243243,23.529412,9.090909
2,19.594595,27.027027,29.411765,31.818182
3,16.216216,12.432432,10.588235,13.636364
4,10.810811,9.72973,8.235294,4.545455
5,1.351351,0.540541,1.176471,0.0


In [37]:
value/3

40.835019511490096