In [60]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import keras
import tensorflow as tf

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input,InputLayer, Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras import applications, initializers, Model, optimizers, metrics
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
from tensorflow.keras.utils import to_categorical

from sklearn import preprocessing
from sklearn import tree

from keras.callbacks import LearningRateScheduler, ModelCheckpoint

from sklearn.cluster import AgglomerativeClustering, KMeans

from sklearn.metrics import silhouette_samples, silhouette_score

import plotly.express as px

In [61]:
data = pd.read_csv("datosnuevos.csv",encoding='latin-1',index_col=0,sep = ";")
data = data.dropna()
data = data.rename(columns={'P47,':'P47'})
data['P47'] = data['P47'].str.replace(",","")
data = data.drop(['Perfil','Periodo','Curso'], axis=1)

In [62]:
encode_norm = {"Totalmente en desacuerdo":-0.9,"Levemente en desacuerdo":-0.4,"Ni de deacuerdo ni en desacuerdo":-0.1,"Medianamente de acuerdo":0.4,"Totalmente de acuerdo":0.9}
encode = {"Totalmente en desacuerdo":0,"Levemente en desacuerdo":1,"Ni de deacuerdo ni en desacuerdo":2,"Medianamente de acuerdo":3,"Totalmente de acuerdo":4}

preguntas = []
for i in range(1,48):
    preguntas.append("P"+str(i))
    data["P"+str(i)] = data["P"+str(i)].map(encode_norm)
data["Genero"] = data["Genero"].map({"Femenino": 0, "Masculino":1})

In [63]:
ramas = {"ADMINISTRACION DE EMPRESAS": "CCSS", "AGRONOMIA":"C","TECNOLOGIA EN REGENCIA DE FARMACIA":"SALUD","ZOOTECNIA":"C","COMUNICACION SOCIAL":"AYH",
"LICENCIATURA EN MATEMATICAS":"C","TECNOLOGIA EN REGENCIA DE FARMACIA (RESOLUCION 08200)":"SALUD","LICENCIATURA EN INGLES COMO LENGUA EXTRANJERA":"AYH",
"TECNOLOGIA EN SISTEMAS AGROFORESTALES":"C","LICENCIATURA EN ETNOEDUCACION":"AYH","TECNOLOGIA AGROFORESTAL":"C","TECNOLOG?A EN DESAROLLO DE SOFTWARE":"ING",
"ADMINISTRACION EN SALUD":"SALUD","TECNOLOGIA EN PRODUCCION AGRICOLA":"C","LICENCIATURA EN FILOSOFIA":"AYH","CURSOS LIBRES":"OTROS",
"TECNOLOGIA EN SISTEMAS DE COMUNICACIONES INAL?MBRICAS":"ING","TECNOLOGIA EN PRODUCCION DE AUDIO":"ING","TECNOLOGIA EN SANEAMIENTO AMBIENTAL":"C",
"TECNOLOGIA EN AUDIO":"ING","LICENCIATURA EN LENGUAS EXTRANJERAS CON ?NFASIS EN INGL?S":"AYH","PSICOLOGIA":"AYH","TECNOLOGIA INDUSTRIAL":"ING",
"ADULTO MAYOR, ACTOR SOCIAL":"OTROS","ARTES VISUALES":"AYH","TECNOLOGIA DE TELECOMUNICACIONES":"ING","TECNOLOGIA EN RADIOLOGIA E IMAGENES DIAGNOSTICAS":"SALUD",
"Música":"AYH","LICENCIATURA EN ETNOEDUCACI?N (RESOLUCI?N 26750)":"AYH","CURRICULA Y PEDAGOGIA":"CCSS","FILOSOFIA":"AYH","QUIMICA":"C",
"TECNOLOGIA EN GESTION COMERCIAL Y DE NEGOCIOS":"CCSS","Bachillerato a distancia":"OTROS","TECNOLOGIA EN SISTEMAS":"ING","TECNOLOGIA EN GESTION AGROPECUARIA":"C"}
data["Programa"] = data["Programa"].replace(ramas)
data.loc[data['Programa'].str.contains('INGENIER'), 'Programa'] = 'ING'
data.loc[data['Programa'].str.contains('CICLO'), 'Programa'] = 'OTROS'
data.Programa = data.Programa.map({"CCSS":0,"C":1,"ING":2,"SALUD":3,"AYH":4,"OTROS":5})

In [64]:
preguntas_adiccion = ['P2','P7','P9','P11','P12','P13','P15',
'P17','P23','P25','P26','P27','P32','P34','P38','P40','P41','P42','P43',
'P44','P45','P46','P47']

preguntas_todas = preguntas
preguntas = list(set(preguntas) - set(preguntas_adiccion))

In [65]:
data.Edad = pd.qcut(data.Edad, q=4, precision = 0,labels=False)

In [66]:
X = data.loc[:,preguntas_todas]
y = data.loc[:,preguntas_todas]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size = 0.5, random_state=1)

In [67]:
def train_kmeans(pred, pred_test, n):
    kmeans = KMeans(n_clusters=n, random_state=0)
    kmeans.fit(pred)

    labels = kmeans.predict(pred_test)
    return labels, kmeans.labels_, kmeans.inertia_

In [68]:
def createDF_labels(pred_test, labels):
    pred_df = pd.DataFrame(pred_test, columns = ['x','y','z'])
    #pred_df = pd.DataFrame(pred_test, columns = ['x','y'])
    pred_df['label'] = labels
    return pred_df

In [69]:
def descriptionCluster(groups, variable, data):
    columns = []
    for i in range(0,len(groups)):
        columns.append('C'+str(i))
    df = pd.DataFrame(columns=columns)

    i=0
    for clus in groups:
        c = (clus[variable].value_counts()*100/len(clus))
        df['C'+str(i)] = c
        i+=1
    df = df.sort_index()
    #perc = pd.DataFrame(data[variable].value_counts()*100/len(data))
    df = df.fillna(0)
    max_values = df.max().values
    max_values = max_values[max_values != 0]
    l = len(max_values)
    value = max_values.sum()/l
    return value, df

In [70]:
def get_predictions(encoder, X_train, X_test):
    pred = encoder.predict(X_train)
    pred_test = encoder.predict(X_test)
    return pred, pred_test

Cargar modelos desde fichero

In [71]:
model = keras.models.load_model('.\\best_model\\best_clustering.h5')
#model = keras.models.load_model('.\\best_model\\best_dec.h5')



In [72]:
pred, pred_test = get_predictions(model, X_train, X_test)
    
labels, labels2, _ = train_kmeans(pred, pred_test, 4)
pred_df1 = createDF_labels(pred_test, labels)
pred_df2 = createDF_labels(pred, labels2)

X_test_v = X_test.merge(data.loc[:,['Genero','Edad','Programa']], left_index=True, right_index=True)
X_test_v['clus'] = labels

X_train_v = X_train.merge(data.loc[:,['Genero','Edad','Programa']], left_index=True, right_index=True)
X_train_v['clus'] = labels2

In [73]:
clus0 = X_test_v[X_test_v.clus == 0]
clus1 = X_test_v[X_test_v.clus == 1]
clus2 = X_test_v[X_test_v.clus == 2]
clus3 = X_test_v[X_test_v.clus == 3]

clus = [clus0, clus1, clus2, clus3]

In [74]:
v, df = descriptionCluster(clus, 'Genero', X_test_v)
value = v
df

Unnamed: 0,C0,C1,C2,C3
0,36.842105,35.714286,28.571429,50.26738
1,63.157895,64.285714,71.428571,49.73262


In [75]:
v, df = descriptionCluster(clus, 'Edad', X_test_v)
value +=v
df

Unnamed: 0,C0,C1,C2,C3
0,42.105263,28.571429,28.571429,25.40107
1,10.526316,35.714286,28.571429,20.588235
2,26.315789,28.571429,35.714286,29.411765
3,21.052632,7.142857,7.142857,24.59893


In [76]:
v, df = descriptionCluster(clus, 'Programa', X_test_v)
value +=v
df

Unnamed: 0,C0,C1,C2,C3
0,36.842105,42.857143,21.428571,26.470588
1,18.421053,7.142857,35.714286,23.796791
2,26.315789,35.714286,21.428571,24.86631
3,7.894737,14.285714,14.285714,13.903743
4,10.526316,0.0,7.142857,9.893048


In [77]:
value = value/3
value

44.49744012437511

In [78]:
sil = silhouette_score(X_train, labels2, metric='euclidean')
sil

0.16089785477966972

In [79]:
sil = sil*100
print((value+ sil)/2)

30.293612801171044


In [80]:
(clus3['Programa'].value_counts()*100/len(clus3))

0    26.470588
2    24.866310
1    23.796791
3    13.903743
4     9.893048
5     1.069519
Name: Programa, dtype: float64

In [81]:
def clus_description(clus):
    df = pd.DataFrame(columns=['C0','C1','C2','C3'])
    i = 0
    for c in clus:
        clus_per = c.loc[:,preguntas_todas].apply(pd.value_counts).sum(axis=1)
        s = clus_per*100/clus_per.sum()

        df['C'+str(i)] = s
        i +=1
    return df

In [82]:
df_clus = clus_description(clus)
df_clus

Unnamed: 0,C0,C1,C2,C3
-0.9,11.31019,3.647416,6.838906,56.320401
-0.4,13.43785,1.215805,8.966565,16.304471
-0.1,50.671892,2.431611,18.237082,12.561156
0.4,15.509518,20.06079,42.553191,8.379793
0.9,9.070549,72.644377,23.404255,6.434179


In [83]:
fig = px.line(df_clus, height=800, width=1000, color_discrete_map={
                 "C0": "blue",
                 "C1": "purple",
                 "C2": "orange",
                 "C3": "yellow"
             })
fig.update_layout(font=dict(size= 20))
fig.update_traces(line_width=5)
fig.show()