In [381]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import keras
import tensorflow as tf

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input,InputLayer, Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras import applications, initializers, Model, optimizers, metrics
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
from tensorflow.keras.utils import to_categorical

from sklearn import preprocessing
from sklearn import tree

from keras.callbacks import LearningRateScheduler, ModelCheckpoint

from sklearn.cluster import AgglomerativeClustering, KMeans

from sklearn.metrics import silhouette_samples, silhouette_score

import plotly.express as px

In [382]:
data = pd.read_csv("datosnuevos.csv",encoding='latin-1',index_col=0,sep = ";")
data = data.dropna()
data = data.rename(columns={'P47,':'P47'})
data['P47'] = data['P47'].str.replace(",","")
data = data.drop(['Perfil','Periodo','Curso'], axis=1)

In [383]:
encode_norm = {"Totalmente en desacuerdo":-0.9,"Levemente en desacuerdo":-0.4,"Ni de deacuerdo ni en desacuerdo":-0.1,"Medianamente de acuerdo":0.4,"Totalmente de acuerdo":0.9}
preguntas = []
for i in range(1,48):
    preguntas.append("P"+str(i))
    data["P"+str(i)] = data["P"+str(i)].map(encode_norm)
data["Genero"] = data["Genero"].map({"Femenino": 0, "Masculino":1})

In [384]:
ramas = {"ADMINISTRACION DE EMPRESAS": "CCSS", "AGRONOMIA":"C","TECNOLOGIA EN REGENCIA DE FARMACIA":"SALUD","ZOOTECNIA":"C","COMUNICACION SOCIAL":"AYH",
"LICENCIATURA EN MATEMATICAS":"C","TECNOLOGIA EN REGENCIA DE FARMACIA (RESOLUCION 08200)":"SALUD","LICENCIATURA EN INGLES COMO LENGUA EXTRANJERA":"AYH",
"TECNOLOGIA EN SISTEMAS AGROFORESTALES":"C","LICENCIATURA EN ETNOEDUCACION":"AYH","TECNOLOGIA AGROFORESTAL":"C","TECNOLOG?A EN DESAROLLO DE SOFTWARE":"ING",
"ADMINISTRACION EN SALUD":"SALUD","TECNOLOGIA EN PRODUCCION AGRICOLA":"C","LICENCIATURA EN FILOSOFIA":"AYH","CURSOS LIBRES":"OTROS",
"TECNOLOGIA EN SISTEMAS DE COMUNICACIONES INAL?MBRICAS":"ING","TECNOLOGIA EN PRODUCCION DE AUDIO":"ING","TECNOLOGIA EN SANEAMIENTO AMBIENTAL":"C",
"TECNOLOGIA EN AUDIO":"ING","LICENCIATURA EN LENGUAS EXTRANJERAS CON ?NFASIS EN INGL?S":"AYH","PSICOLOGIA":"AYH","TECNOLOGIA INDUSTRIAL":"ING",
"ADULTO MAYOR, ACTOR SOCIAL":"OTROS","ARTES VISUALES":"AYH","TECNOLOGIA DE TELECOMUNICACIONES":"ING","TECNOLOGIA EN RADIOLOGIA E IMAGENES DIAGNOSTICAS":"SALUD",
"Música":"AYH","LICENCIATURA EN ETNOEDUCACI?N (RESOLUCI?N 26750)":"AYH","CURRICULA Y PEDAGOGIA":"CCSS","FILOSOFIA":"AYH","QUIMICA":"C",
"TECNOLOGIA EN GESTION COMERCIAL Y DE NEGOCIOS":"CCSS","Bachillerato a distancia":"OTROS","TECNOLOGIA EN SISTEMAS":"ING","TECNOLOGIA EN GESTION AGROPECUARIA":"C"}
data["Programa"] = data["Programa"].replace(ramas)
data.loc[data['Programa'].str.contains('INGENIER'), 'Programa'] = 'ING'
data.loc[data['Programa'].str.contains('CICLO'), 'Programa'] = 'OTROS'
data.Programa = data.Programa.map({"CCSS":0,"C":1,"ING":2,"SALUD":3,"AYH":4,"OTROS":5})

In [385]:
data.Edad = pd.qcut(data.Edad, q=4, precision = 0,labels=False)

In [386]:
X = data.loc[:,preguntas]
y = data.loc[:,preguntas]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size = 0.5, random_state=1)

In [387]:
def autoencoder():
    input_shape = (X.shape[1],)
    model = Sequential()
    #Input
    model.add(InputLayer(input_shape=input_shape))

    #Hidden

    model.add(Dense(64 , activation = "tanh"))
    model.add(Dense(16 , activation = "tanh"))
    model.add(Dense(8  , activation = "tanh"))
    model.add(Dense(16 , activation = "tanh")) 
    model.add(Dense(64 , activation = "tanh"))
    #Output
    model.add(Dense(X.shape[1], activation = 'linear'))


   #model.compile(loss = "mse",#loss="categorical_crossentropy",optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),metrics=["mean_squared_error"])
    return model

In [388]:
def train_encoder(model):
    annealer = LearningRateScheduler(lambda x: 1e-3 * 0.95 ** x)
    callback_es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=1)

    model.fit(X_train, X_train,  validation_data=(X_val,X_val),
        callbacks = [callback_es],
        batch_size = 32, epochs=200, verbose = 0)

    encoder = Model(model.input, model.layers[-4].output)
    return encoder,model

In [389]:
def get_predictions(encoder, X):
    pred = encoder.predict(X)
    return pred

In [390]:
def train_kmeans(pred, pred_test, n):
    kmeans = KMeans(n_clusters=n, random_state=0)
    kmeans.fit(pred)

    labels = kmeans.predict(pred_test)
    return kmeans

In [391]:
def createDF_labels(pred_test, labels):
    pred_df = pd.DataFrame(pred_test, columns = ['x','y','z'])
    pred_df['label'] = labels
    return pred_df

In [392]:
def descriptionCluster(groups, variable, data):
    columns = []
    for i in range(0,len(groups)):
        columns.append('C'+str(i))
    df = pd.DataFrame(columns=columns)

    i=0
    for clus in groups:
        c = (clus[variable].value_counts()*100/len(clus))
        df['C'+str(i)] = c
        i+=1
    df = df.sort_index()
    #perc = pd.DataFrame(data[variable].value_counts()*100/len(data))
    df = df.fillna(0)
    max_values = df.max().values
    max_values = max_values[max_values != 0]
    l = len(max_values)
    value = max_values.sum()/l
    return value, df

In [393]:
from tensorflow.keras.layers import InputSpec
from keras import backend as K

class ClusteringLayer(tf.keras.layers.Layer):

    def __init__(self, n_clusters, weights=None, alpha=1.0, **kwargs):
        if 'input_shape' not in kwargs and 'input_dim' in kwargs:
            kwargs['input_shape'] = (kwargs.pop('input_dim'),)
        super(ClusteringLayer, self).__init__(**kwargs)
        self.n_clusters = n_clusters
        self.alpha = alpha
        self.initial_weights = weights
        self.input_spec = InputSpec(ndim=2)

    def build(self, input_shape):
        assert len(input_shape) == 2
        input_dim = input_shape[1]
        self.input_spec = InputSpec(dtype=K.floatx(), shape=(None, input_dim))
        self.clusters = self.add_weight(name='clusters', shape=(self.n_clusters, input_dim), initializer='glorot_uniform') 
        
        if self.initial_weights is not None:
            self.set_weights(self.initial_weights)
            del self.initial_weights
        self.built = True

    def call(self, inputs, **kwargs):
        q = 1.0 / (1.0 + (K.sum(K.square(K.expand_dims(inputs, axis=1) - self.clusters), axis=2) / self.alpha))
        q **= (self.alpha + 1.0) / 2.0
        q = K.transpose(K.transpose(q) / K.sum(q, axis=1))
        
        return q

    def compute_output_shape(self, input_shape):
        assert input_shape and len(input_shape) == 2
        return input_shape[0], self.n_clusters
        
    def get_config(self):
        config = {'n_clusters': self.n_clusters}
        base_config = super(ClusteringLayer, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

In [394]:
def descriptionCluster(groups, variable, data):
    columns = []
    for i in range(0,len(groups)):
        columns.append('C'+str(i))
    df = pd.DataFrame(columns=columns)

    i=0
    for clus in groups:
        c = (clus[variable].value_counts()*100/len(clus))
        df['C'+str(i)] = c
        i+=1
    df = df.sort_index()
    #perc = pd.DataFrame(data[variable].value_counts()*100/len(data))
    df = df.fillna(0)
    max_values = df.max().values
    max_values = max_values[max_values != 0]
    l = len(max_values)
    value = max_values.sum()/l
    return value, df

In [395]:
def score_ac(y_pred_test):
    X_test_v= X_test.merge(data.loc[:,['Genero','Edad','Programa']], left_index=True, right_index=True)
    X_test_v['clus'] = y_pred_test
    
    clus0 = X_test_v[X_test_v.clus == 0]
    clus1 = X_test_v[X_test_v.clus == 1]
    clus2 = X_test_v[X_test_v.clus == 2]
    clus3 = X_test_v[X_test_v.clus == 3]
    clus4 = X_test_v[X_test_v.clus == 4]

    clus = [clus0, clus1, clus2, clus3, clus4]

    v, df = descriptionCluster(clus, 'Genero', X_test_v)
    value = v

    v, df = descriptionCluster(clus, 'Edad', X_test_v)
    value += v

    v, df = descriptionCluster(clus, 'Programa', X_test_v)
    value += v

    return (value/3)

In [396]:
n_clusters = 5
n_epochs   = 200
batch_size = 128
result = 0

best_result = 0
sil_avg = 0
score_avg = 0

best = None
p_best = None
n = 50
for i in range(0,n):
    model = autoencoder()
    pretrain_optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)

    pretrain_epochs = n_epochs
    batch_size = batch_size

    model.compile(optimizer=pretrain_optimizer, loss='categorical_crossentropy')

    encoder, _ = train_encoder(model)

    clustering_layer = ClusteringLayer(n_clusters, name='clustering')(encoder.output)
    model = Model(inputs=encoder.input, outputs=clustering_layer)



    model.compile(optimizer=pretrain_optimizer,#tf.keras.optimizers.SGD(0.01, 0.9),
            loss='kld')

    kmeans = KMeans(n_clusters, random_state=0)
    y_pred = kmeans.fit_predict(encoder.predict(X_train))
    y_pred_last = np.copy(y_pred)

    model.get_layer(name='clustering').set_weights([kmeans.cluster_centers_])



    def target_distribution(q):
        weight = q ** 2 / q.sum(0)
        return (weight.T / weight.sum(1)).T
            
    loss = 0
    index = 0
    maxiter = 1000 #X.shape[0] 
    update_interval = 100
    tol = 0.001 # tolerance threshold to stop training
    index_array = np.arange(X_train.shape[0])

    for ite in range(int(maxiter)):
        if ite % update_interval == 0:
            q = model.predict(X_train, verbose=0)
            p = target_distribution(q)

        idx = index_array[index * batch_size: min((index+1) * batch_size, X_train.shape[0])]
        loss = model.train_on_batch(x=X_train.iloc[idx,:], y=p[idx])
        index = index + 1 if (index + 1) * batch_size <= X_train.shape[0] else 0

    q = model.predict(X_train, verbose=0)
    p = target_distribution(q)

    y_pred = q.argmax(1)

    q_test = model.predict(X_test, verbose=0)
    p_test = target_distribution(q_test)

    y_pred_test = q_test.argmax(1)

        
    score = silhouette_score(X_test, y_pred_test, metric='euclidean')*100

    ac = score_ac(y_pred_test)
    rendimiento = score/2 + ac/2

    if best_result < rendimiento:
        best = y_pred_test
        p_best = p_test
        best_result = rendimiento

    result += rendimiento
    sil_avg += score
    score_avg += ac

    print(str(i+1)+"/"+str(n), end = "\r")

print("Silhoutte: ", (sil_avg/n))
print("Acc: ", (score_avg/n))
print("Rendimiento: ",(result/n))

Silhoutte:  5.853619028039222
Acc:  41.831808832433765
Rendimiento:  23.842713930236478


In [397]:
y_pred_test = best
p_test = p_best

In [398]:
score = silhouette_score(X_train, y_pred, metric='euclidean')
score

0.07433333325133437

In [399]:
score = silhouette_score(X_test, y_pred_test, metric='euclidean')
score

0.12404434726477863

In [400]:
"""pred_df = pd.DataFrame(p_test, columns = ['x','y','z'])
pred_df['label'] = y_pred_test"""

"pred_df = pd.DataFrame(p_test, columns = ['x','y','z'])\npred_df['label'] = y_pred_test"

In [401]:
"""%matplotlib widget
fig = px.scatter_3d(pred_df, x = 'x', y = 'y', z = 'z', color = 'label', width=800, height=800)
#fig = px.scatter(pred_df, x = 'x',y='y', color = 'label')
fig.update_layout(font=dict(size= 14))
fig.update_traces(marker=dict(size=4,
                              line=dict(width=0.5,
                                        color='DarkSlateGrey')),
                  selector=dict(mode='markers'))

fig.show()"""

"%matplotlib widget\nfig = px.scatter_3d(pred_df, x = 'x', y = 'y', z = 'z', color = 'label', width=800, height=800)\n#fig = px.scatter(pred_df, x = 'x',y='y', color = 'label')\nfig.update_layout(font=dict(size= 14))\nfig.update_traces(marker=dict(size=4,\n                              line=dict(width=0.5,\n                                        color='DarkSlateGrey')),\n                  selector=dict(mode='markers'))\n\nfig.show()"

In [402]:
X_test_v= X_test.merge(data.loc[:,['Genero','Edad','Programa']], left_index=True, right_index=True)

In [403]:
pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [413]:
X_test_v['clus'] = y_pred_test
clus0 = X_test_v[X_test_v.clus == 0]
clus1 = X_test_v[X_test_v.clus == 1]
clus2 = X_test_v[X_test_v.clus == 2]
clus3 = X_test_v[X_test_v.clus == 3]
clus4 = X_test_v[X_test_v.clus == 4]


clus = [clus0, clus1, clus2, clus3, clus4]

In [414]:
v, df = descriptionCluster(clus, 'Genero', X_test_v)
value = v
df

Unnamed: 0,C0,C1,C2,C3,C4
0,51.811,36.364,14.286,24.324,42.308
1,48.189,63.636,85.714,75.676,57.692


In [415]:
v, df = descriptionCluster(clus, 'Edad', X_test_v)
value += v
df

Unnamed: 0,C0,C1,C2,C3,C4
0,25.627,45.455,28.571,40.541,19.231
1,20.613,9.091,14.286,16.216,30.769
2,29.248,18.182,28.571,24.324,42.308
3,24.513,27.273,28.571,18.919,7.692


In [416]:
v, df = descriptionCluster(clus, 'Programa', X_test_v)
value += v
df

Unnamed: 0,C0,C1,C2,C3,C4
0,26.741,9.091,14.286,37.838,38.462
1,23.955,27.273,42.857,18.919,11.538
2,23.677,36.364,14.286,37.838,26.923
3,14.206,9.091,14.286,2.703,19.231
4,10.306,18.182,14.286,2.703,3.846
5,1.114,0.0,0.0,0.0,0.0


In [417]:
value/3

46.86082920066207

In [419]:
def clus_description(clus):
    df = pd.DataFrame(columns=['C0','C1','C2','C3', 'C4'])
    i = 0
    for c in clus:
        clus_per = c.loc[:,preguntas].apply(pd.value_counts).sum(axis=1)
        s = clus_per*100/clus_per.sum()

        df['C'+str(i)] = s
        i +=1
    return df

In [420]:
df_clus = clus_description(clus)
df_clus

Unnamed: 0,C0,C1,C2,C3,C4
-0.9,57.583,14.313,16.109,14.261,6.547
-0.4,16.28,17.602,13.374,13.168,5.074
-0.1,11.889,33.269,13.678,53.306,8.101
0.4,8.031,22.244,35.562,13.111,28.396
0.9,6.217,12.573,21.277,6.153,51.882


In [422]:
fig = px.line(df_clus, height=800, width=1000, color_discrete_map={
                 "C0": "blue",
                 "C1": "purple",
                 "C2": "orange",
                 "C3": "red",
                 "C4": "green"
             })
fig.update_layout(font=dict(size= 20))
fig.update_traces(line_width=5)
fig.show()

In [412]:
#model.save("best_DEC.h5")