In [None]:
import os

import nltk
import numpy as np
import pandas as pd
import pymysql
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sagemaker.tensorflow import TensorFlowPredictor
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Download the NLTK resources used by the text preprocessing in this notebook.
# 'punkt_tab' is requested alongside 'punkt' because recent NLTK releases load
# the word_tokenize data from 'punkt_tab' and raise LookupError without it.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

In [None]:
endpoint_name = "231205-012632-recomendation-model" # Changes every time the model is retrained

In [None]:
# Predictor client for the endpoint identified by endpoint_name
predictor = TensorFlowPredictor(endpoint_name)

In [None]:
userId = '26908e63-8d63-4227-bc73-94e596e107c0' # Randomly chosen id
communityId = np.array(['10']) # Randomly chosen id, stored as a one-element array of strings

In [None]:
def _fetch_frame(cursor, query, params=None, columns=None):
    """Run ``query`` on ``cursor`` and return the rows as a DataFrame indexed by ``id``.

    Args:
        cursor: Open DB-API cursor.
        query: SQL text; use ``%s`` placeholders for values.
        params: Optional sequence of values bound to the placeholders by the driver.
        columns: Optional column labels. When omitted, the names reported by
            ``cursor.description`` for the executed query are used.

    Returns:
        pandas.DataFrame with the ``id`` column moved to the index.
    """
    cursor.execute(query, params)
    rows = cursor.fetchall()
    if columns is None:
        columns = [field[0] for field in cursor.description]
    return pd.DataFrame(rows, columns=columns).set_index('id')


def get_data(userId):
    """Load from MySQL everything needed to build the features of one user.

    Connection settings are read from the environment: ``WOT_DB_PASSWORD`` is
    required, while ``WOT_DB_HOST``, ``WOT_DB_USER`` and ``WOT_DB_NAME`` are
    optional and default to the values this notebook used before.

    Args:
        userId: Id of the user whose data is fetched.

    Returns:
        Tuple ``(users_df, communities_df, users_communities_df, posts_df,
        users_likes_df)``; every frame is indexed by its table's ``id``.

    Raises:
        RuntimeError: If ``WOT_DB_PASSWORD`` is not set.
    """
    # NOTE(review): the password used to be hard-coded in this notebook; it
    # should be rotated, since it is still present in the notebook's history.
    password = os.environ.get('WOT_DB_PASSWORD')
    if not password:
        raise RuntimeError(
            'Set the WOT_DB_PASSWORD environment variable before calling get_data()'
        )

    db = pymysql.connect(
        host=os.environ.get('WOT_DB_HOST', 'wot-database.cvaeffy0qj7k.us-east-1.rds.amazonaws.com'),
        user=os.environ.get('WOT_DB_USER', 'wotAdmin'),
        password=password,
        database=os.environ.get('WOT_DB_NAME', 'develop'),
    )

    # try/finally guarantees the connection is released even if a query fails.
    try:
        with db.cursor() as cursor:
            # Values are bound by the driver instead of being formatted into
            # the SQL text, so userId cannot alter the statement.
            users_df = _fetch_frame(
                cursor, "SELECT * FROM usuarias WHERE id = %s;", (userId,))

            # Lookup tables used to turn ids into their display values
            industries_df = _fetch_frame(
                cursor,
                "SELECT id, nombre_industria FROM industrias;",
                columns=['id', 'nombre'])
            personalities_df = _fetch_frame(
                cursor,
                "SELECT id, personalidad FROM formulario_personalidades;",
                columns=['id', 'personalidad'])

            communities_df = _fetch_frame(cursor, "SELECT * FROM Communities;")

            users_communities_df = _fetch_frame(
                cursor,
                "SELECT * FROM UsuariaCommunities WHERE userId = %s;",
                (userId,))

            posts_df = _fetch_frame(
                cursor, "SELECT * FROM Posts WHERE userId = %s;", (userId,))

            # Likes given BY the user, tagged with the community of the liked
            # post. The filter is on PostLikes.usuariaId because interaction()
            # counts rows whose usuariaId equals the user; filtering on
            # Posts.userId (the post author) left only self-likes.
            users_likes_df = _fetch_frame(
                cursor,
                "SELECT PostLikes.id, PostLikes.usuariaId, Posts.communityId "
                "FROM PostLikes JOIN Posts ON PostLikes.postId = Posts.id "
                "WHERE PostLikes.usuariaId = %s;",
                (userId,),
                columns=['id', 'usuariaId', 'communityId'])
    finally:
        db.close()

    # Drop the user columns that are not used as features
    users_df = users_df.drop(columns=[
        'rut',
        'nombre',
        'apellido',
        'celular',
        'mail',
        'empresa_actual',
        'empresa_adicional',
        'id_cargo',
        'id_cargo_adicional',
        'id_anios_experiencia',
        'experienciaDirectorios',
        'altaDireccion',
        'redesSociales',
        'factor', # Could be relevant, but there is very little data for this field
        'nombrePuebloOriginario', # Could be relevant, but there is very little data for this field
        'id_region_con_compromiso',
        'region_domicilio',
        'id_posibilidad_cambiarse_region',
        'disposicion_viajar',
        'id_modalidad',
        'id_jornada',
        'id_conocio_wot',
        'id_rol',
        'declaracion',
        'id_pais_domicilio', # Could be useful for country-focused communities; not considered for now
        'universidad', # Could be useful for university-focused communities; not considered for now
        'created_at',
        'updated_at'
    ])

    # Replace ids by their values. Results are assigned back to the column:
    # an in-place replace on a selected column does not update the frame once
    # pandas Copy-on-Write is active.
    users_df['id_industria_actual'] = users_df['id_industria_actual'].replace(industries_df['nombre'])
    users_df['id_industria_adicional'] = users_df['id_industria_adicional'].replace(industries_df['nombre'])
    users_df['id_personalidad'] = users_df['id_personalidad'].replace(personalities_df['personalidad'])

    users_df = users_df.rename(columns={
        'id_industria_actual': 'industria_actual',
        'id_industria_adicional': 'industria_adicional',
        'id_personalidad': 'personalidad'
    })

    # Missing text fields become empty strings
    text_columns = ['postgrado', 'brief', 'industria_actual', 'industria_adicional', 'personalidad']
    users_df[text_columns] = users_df[text_columns].fillna("")

    # Drop bookkeeping columns that are not used downstream
    communities_df = communities_df.drop(columns=['createdAt', 'updatedAt'])
    users_communities_df = users_communities_df.drop(columns=['createdAt', 'updatedAt'])
    posts_df = posts_df.drop(columns=['edited', 'content', 'createdAt', 'updatedAt'])

    return users_df, communities_df, users_communities_df, posts_df, users_likes_df

def interaction(user_id, community_id, posts_df, users_likes_df):
    """Score how strongly a user interacts with one community.

    The score is 5 base points, plus 1 point per row of ``users_likes_df``
    and 3 points per row of ``posts_df`` that match both the user and the
    community. The base points are granted unconditionally: this function
    does not check membership itself, so callers are expected to invoke it
    only for communities the user belongs to.

    Args:
        user_id: Id compared against ``usuariaId`` (likes) and ``userId`` (posts).
        community_id: Id compared against ``communityId`` in both frames.
        posts_df: Frame with ``communityId`` and ``userId`` columns.
        users_likes_df: Frame with ``usuariaId`` and ``communityId`` columns.

    Returns:
        int: The interaction score.
    """
    base_points = 5
    points_per_like = 1
    points_per_post = 3

    liked_here = (users_likes_df.usuariaId == user_id) & (users_likes_df.communityId == community_id)
    posted_here = (posts_df.communityId == community_id) & (posts_df.userId == user_id)

    like_count = len(users_likes_df[liked_here])
    post_count = len(posts_df[posted_here])

    return base_points + like_count * points_per_like + post_count * points_per_post

def build_interactions_df(user_id, communities_df, users_communities_df, posts_df, users_likes_df):
    """Build the min-max scaled interaction row of one user over all communities.

    Args:
        user_id: Id of the user; becomes the single index label of the result.
        communities_df: Frame whose index holds the community ids; one output
            column is produced per id, in the same order.
        users_communities_df: Accepted for interface compatibility; not read.
        posts_df: Passed through to ``interaction``.
        users_likes_df: Passed through to ``interaction``.

    Returns:
        pandas.DataFrame with one row (index name ``userId``) whose values are
        the interaction scores rescaled with ``MinMaxScaler`` across communities.

    Raises:
        ValueError: If ``communities_df`` has no rows.
    """
    if len(communities_df.index) == 0:
        raise ValueError('communities_df has no rows: there are no communities to score')

    # NOTE(review): users_communities_df is never consulted, so every community
    # (joined or not) receives interaction()'s base score. Presumably only the
    # communities the user belongs to were meant to be scored, with the rest
    # left at 0 — confirm against the training pipeline before changing this.
    scores = [
        interaction(user_id, community_id, posts_df, users_likes_df)
        for community_id in communities_df.index
    ]

    # Build the whole row at once as floats instead of enlarging an empty
    # frame cell by cell, which produced object-dtype columns.
    interactions_df = pd.DataFrame(
        [scores],
        index=pd.Index([user_id], name='userId'),
        columns=communities_df.index,
        dtype=float,
    )

    # MinMaxScaler scales column-wise, so transpose to normalise the user's
    # scores across communities and transpose back afterwards.
    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(interactions_df.T).T

    return pd.DataFrame(scaled, columns=interactions_df.columns, index=interactions_df.index)

# Text preprocessing helper
def preprocess_text(text):
    """Lower-case, tokenize, remove Spanish stop words and lemmatize ``text``.

    Args:
        text: String to clean.

    Returns:
        str: The remaining lemmatized tokens joined by single spaces.
    """
    # Load the stop-word list once per call and use a set: the list was
    # previously rebuilt and scanned linearly for every single token.
    spanish_stopwords = set(stopwords.words('spanish'))  # Assumes the text is in Spanish

    # NOTE(review): WordNetLemmatizer is presumably English-oriented, so it may
    # leave most Spanish tokens untouched — confirm this matches the
    # preprocessing used when the model was trained.
    lemmatizer = WordNetLemmatizer()

    tokens = word_tokenize(text.lower())
    lemmas = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in spanish_stopwords
    ]

    return ' '.join(lemmas)

def user_embedding(users_df, communities_df, user_communities_df, posts_df, user_likes_df):
    """Build the model's user input: text embedding followed by interaction scores.

    The text columns of each user are concatenated, cleaned with
    ``preprocess_text``, tokenized and averaged into a 100-dimensional
    Word2Vec document vector, which is min-max scaled per user. The scaled
    interaction row of the first user in ``users_df`` is appended to it.

    ``users_df`` is not modified.

    Args:
        users_df: Frame indexed by user id with the columns ``postgrado``,
            ``industria_actual``, ``industria_adicional``, ``intereses``,
            ``brief`` and ``personalidad``.
        communities_df: Passed to ``build_interactions_df``.
        user_communities_df: Passed to ``build_interactions_df``.
        posts_df: Passed to ``build_interactions_df``.
        user_likes_df: Passed to ``build_interactions_df``.

    Returns:
        numpy.ndarray with a single row: the first user's scaled text vector
        followed by that user's scaled interaction scores.

    Raises:
        ValueError: If ``users_df`` has no rows.
    """
    if len(users_df.index) == 0:
        raise ValueError('users_df has no rows: there is no user to embed')

    vector_size = 100
    text_columns = ['postgrado', 'industria_actual', 'industria_adicional', 'intereses', 'brief', 'personalidad']

    # Work on a filled copy of the text columns so the caller's frame stays
    # intact (the cell can be re-run) and a missing field cannot turn the
    # whole concatenation into NaN.
    text_parts = users_df[text_columns].fillna('')
    text = text_parts[text_columns[0]]
    for column in text_columns[1:]:
        text = text + ' ' + text_parts[column]

    # Clean the text, then tokenize the cleaned text
    tokenized = text.apply(preprocess_text).apply(word_tokenize)

    # NOTE(review): the Word2Vec model is fitted here on this request's text
    # only, so the vectors come from a freshly initialised model rather than
    # from whatever embedding space was used at training time — confirm
    # whether the trained Word2Vec model should be loaded instead.
    word_vectors = None
    if any(len(doc) > 0 for doc in tokenized):
        # Word2Vec cannot build a vocabulary from an empty corpus, so it is
        # only trained when at least one token exists.
        model = Word2Vec(sentences=tokenized.tolist(), vector_size=vector_size, window=5, min_count=1, workers=4)
        word_vectors = model.wv

    def document_vector(doc):
        """Average the vectors of the known words; zeros when there are none."""
        if word_vectors is None:
            return np.zeros(vector_size, dtype=np.float32)
        known = [word_vectors[word] for word in doc if word in word_vectors]
        if not known:
            return np.zeros(vector_size, dtype=np.float32)
        return np.mean(known, axis=0)

    vectors = tokenized.apply(document_vector)

    # One row per user, one column per embedding dimension
    user_vector_df = pd.DataFrame(vectors.to_list(), index=users_df.index)

    # Normalise each user's vector across its dimensions
    scaler = MinMaxScaler()
    user_vector_df = pd.DataFrame(scaler.fit_transform(user_vector_df.T).T, columns=user_vector_df.columns, index=user_vector_df.index)

    # Interaction scores of the first user in the frame
    user_id = users_df.index.values[0]
    interactions = build_interactions_df(user_id, communities_df, user_communities_df, posts_df, user_likes_df)

    return np.array([np.array(user_vector_df).tolist()[0] + np.array(interactions).tolist()[0]])

def one_hot_community(communityId, nItems):
    """One-hot encode 1-based community ids.

    Implemented with numpy: ``tf`` is never imported in this notebook, and the
    ids arrive as strings, which are not valid one-hot indices.

    Args:
        communityId: Array-like of 1-based community ids (ints or numeric
            strings). It is not modified, so re-running the calling cell
            always encodes the same id.
        nItems: Length of each one-hot row.

    Returns:
        numpy.ndarray of shape ``(len(communityId), nItems)`` and dtype float64.
        An id outside ``1..nItems`` yields an all-zero row.
    """
    # Convert to 0-based positions on a fresh array, leaving the input untouched.
    positions = np.asarray(communityId).astype(int).ravel() - 1

    encoded = np.zeros((positions.size, nItems), dtype=float)
    in_range = (positions >= 0) & (positions < nItems)
    encoded[np.flatnonzero(in_range), positions[in_range]] = 1.0
    return encoded

In [None]:
# Fetch the sample user's data, build the user input vector and one-hot encode the sample community
user_data, communities_df, user_communities_df, posts_df, user_likes_df = get_data(userId)
user_vector = user_embedding(user_data, communities_df, user_communities_df, posts_df, user_likes_df)
community = one_hot_community(communityId, 62)  # 62 is the one-hot depth; presumably the number of communities the model was trained with — TODO confirm

In [None]:
# Single-instance request: one [user vector, one-hot community] pair under the "instances" key
input_vals = {"instances": [[user_vector[0].tolist(), community[0].tolist()]]}
pred = predictor.predict(input_vals)