In [17]:
import numpy as np
import pandas as pd
import random
from os.path import exists

from sklearn.metrics import mean_squared_error
from sklearn.utils import shuffle

from utils.graphrec_automated import GraphRec
from utils.metrics import queries_ndcg, mean_ap

import warnings
# Silence library warnings so they do not clutter the notebook output.
# NOTE(review): the last call installs an "ignore" filter with no category,
# which applies to every Warning subclass; the three category-specific
# filters before it are therefore redundant, and genuinely useful warnings
# are hidden too. Consider keeping only the targeted filters.
warnings.filterwarnings("ignore", category=DeprecationWarning) 
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)
warnings.filterwarnings('ignore')


# NOTE(review): POSITIVE_VALUE / NEGATIVE_VALUE are not referenced in the
# visible cells; presumably the rating values used for positive and negative
# feedback -- TODO confirm against the cells that consume them.
POSITIVE_VALUE = 5
NEGATIVE_VALUE = 1

# Default upper bounds on user and item ids used by reduce_dimensionality().
USER_NUM=5000
ITEM_NUM=15000

def get_dataset(dataset, perc=0.9, random_state=None):
    """Shuffle the ratings and split them into train and test partitions.

    Args:
        dataset: DataFrame with at least the columns ``user``, ``item`` and
            ``rate``. It is not modified.
        perc: Fraction of rows placed in the training partition.
        random_state: Optional seed for a reproducible shuffle. When ``None``
            (the default) the global NumPy random state is used, exactly as
            before.

    Returns:
        A ``(df_train, df_test)`` tuple. ``user``/``item`` are cast to int32
        and ``rate`` to float32. Both partitions are indexed from 0.
    """
    rows = len(dataset)
    if random_state is None:
        order = np.random.permutation(rows)
    else:
        order = np.random.RandomState(random_state).permutation(rows)

    df = dataset.iloc[order].reset_index(drop=True)
    # Cast every column once; the rate cast used to sit inside the per-column
    # loop and was executed twice.
    df = df.astype({"user": np.int32, "item": np.int32, "rate": np.float32})

    split_index = int(rows * perc)
    df_train = df.iloc[:split_index]
    df_test = df.iloc[split_index:].reset_index(drop=True)
    return df_train, df_test

def reduce_dimensionality(df_ratings, MAX_USERS_DESIRED = USER_NUM, MAX_ITEMS_DESIRED = ITEM_NUM):
    """Return only the rows whose user id and item id are within the bounds.

    A row is kept when ``user <= MAX_USERS_DESIRED`` and
    ``item <= MAX_ITEMS_DESIRED``. The filter is by id value only; it does not
    rank users or items by frequency.

    NOTE(review): both bounds are inclusive. If ids are 0-based this keeps up
    to ``MAX + 1`` distinct ids per axis -- confirm whether ``<`` was intended.
    """
    user_within_bound = df_ratings['user'] <= MAX_USERS_DESIRED
    item_within_bound = df_ratings['item'] <= MAX_ITEMS_DESIRED
    return df_ratings[user_within_bound & item_within_bound]

  
def read_process(filname, sep="\t"):
    """Load a header-less ratings file and normalise ids and dtypes.

    Args:
        filname: Path or file-like object handed to ``pandas.read_csv``.
        sep: Field separator of the file.

    Returns:
        DataFrame with columns ``user``, ``item``, ``rate`` and ``st``. The
        ``user`` and ``item`` values are decremented by one and stored as
        int32; ``rate`` is stored as float32.
    """
    ratings = pd.read_csv(
        filname,
        sep=sep,
        header=None,
        names=["user", "item", "rate", "st"],
        engine='python',
    )
    id_columns = ["user", "item"]
    # Shift both id columns down by one before narrowing the dtypes.
    ratings[id_columns] = ratings[id_columns] - 1
    return ratings.astype({"user": np.int32, "item": np.int32, "rate": np.float32})

def replace_value(value):
    """Binarise a rating: 1 when it is strictly greater than 3, otherwise 0."""
    return 1 if value > 3 else 0

def _dense_reindex(df, col, org_col):
    """Store the ids of ``col`` in ``org_col`` and replace them by dense ranks.

    The smallest id becomes 0, the next distinct id 1, and so on. Missing ids
    stay missing in both columns. Assumes numeric ids.
    """
    original = df[col]
    # float64 matches the dtype the previous row-by-row assignment produced
    # for the backup column.
    df[org_col] = original.astype(np.float64)
    # rank(method="dense") numbers the distinct values 1..k in ascending order
    # and keeps NaN as NaN; the ranks are computed from the untouched original
    # values, so an already-assigned new id can never be matched again.
    dense_ids = original.rank(method="dense") - 1
    df[col] = dense_ids.astype(original.dtype)


def pre_process_dataset(path='./data/ifgproduz/relation_list.txt'):
    """Load the user/item relation list and re-index its ids densely.

    Progress messages are printed while the function runs.

    Args:
        path: Path or file-like object of the tab-separated relation list. It
            must provide the columns ``nota``, ``id_curriculo_id`` and
            ``id_producao_id``. Defaults to the location used previously.

    Returns:
        DataFrame in which ``nota`` is renamed to ``rate`` (clipped to
        [0, 5]), ``id_curriculo_id`` to ``user`` and ``id_producao_id`` to
        ``item``. ``user`` and ``item`` hold 0-based dense ids assigned in
        ascending order of the original ids, which are preserved in
        ``org_user`` and ``org_item``.
    """
    print("Iniciando script...")
    print("Carregando dataset principal...")
    df_user_item = pd.read_csv(path, sep="\t")

    # Turn the relation list into the user/item/rate layout.
    print("Criando matriz de conexão...")
    df_user_item = df_user_item.rename(columns={
        'nota': 'rate',
        'id_producao_id': 'item',
        'id_curriculo_id': 'user',
    })

    # One vectorised pass per axis instead of one full-frame scan per id.
    _dense_reindex(df_user_item, 'user', 'org_user')
    _dense_reindex(df_user_item, 'item', 'org_item')

    df_user_item['rate'] = df_user_item['rate'].clip(0.0, 5.0)

    # Optional dimensionality reduction, if desired:
    # df_ratings_complete = reduce_dimensionality(df_user_item, len(users), len(items))
    return df_user_item

    
# Run the preprocessing once; the cells below read df_ratings_complete.
df_ratings_complete = pre_process_dataset()

Iniciando script...
Carregando dataset principal...
Criando matriz de conexão...


In [19]:
# Preview the first rows: remapped ids (user, item) next to the original ids
# (org_user, org_item).
df_ratings_complete.head()

Unnamed: 0,id_nota_rec,rate,user,item,org_user,org_item
0,1,5.0,489,1321.0,859.0,9444.0
1,2,5.0,489,1314.0,859.0,9436.0
2,3,5.0,489,1315.0,859.0,9437.0
3,4,5.0,489,1316.0,859.0,9438.0
4,5,5.0,489,1317.0,859.0,9439.0


In [7]:
# Distinct remapped user and item ids present in the processed ratings.
# NOTE(review): this cell's recorded execution count (7) is lower than that of
# the cell defining df_ratings_complete (17), so these arrays may come from an
# older frame; re-run the notebook top to bottom to confirm.
users = df_ratings_complete['user'].unique()
items = df_ratings_complete['item'].unique()

In [20]:
# Number of distinct users.
len(users)

1130

In [21]:
# Sanity check: with dense 0-based ids the largest user id should equal
# len(users) - 1.
df_ratings_complete['user'].max()

1129