In [20]:
import pandas as pd
import yaml
import os

In [18]:
movielens_path = 'preprocess/.data/Movielens/u.data'

column_names =['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv(movielens_path, sep='\t', names=column_names)

In [19]:
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [23]:
def load_config():
    config_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'config', 'datasets.yaml')
    with open(config_path, 'r') as file:
        config = yaml.safe_load(file)
    return config

def get_data_dir(base_name):
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), '.data', base_name)
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)
    return data_dir


def load_data(base_name):
    config = load_config()
    if base_name not in config['datasets']:
        raise ValueError(f'Configuração para {base_name} não encontrada.')
    
    data_config = config['datasets'][base_name]
    data_dir = get_data_dir(base_name)
    file_path = os.path.join(data_dir, data_config['file_name'])

    if os.path.exists(file_path):
        print(f'Carregando dados de {file_path}')
        if data_config['file_format'] == 'csv':
            df = pd.read_csv(file_path, names=data_config['column_names'])
        elif data_config['file_format'] == 'json':
            df = pd.read_json(file_path)
        else:
            raise ValueError(f'Formato de arquivo não suportado: {data_config["file_format"]}')
        
    else:
        raise FileNotFoundError(f'Arquivo {file_path} não enontrado na pasta .data')
    
    return df

def preprocess_data(df, base_name):
    config = load_config()
    data_config = config['datasets'][base_name]

    df = df.drop('timestamp', axis=1)

    if df[data_config['column_names'][0]].dtype != 'int' or df[data_config['column_names'][1]].dtype != 'int':
        # Remapeia os ids para números de 0 a n-1
        df[data_config['column_names'][0]] = pd.factorize(df[data_config['column_names'][0]])[0]
        df[data_config['column_names'][1]] = pd.factorize(df[data_config['column_names'][1]])[0]
    else:
        df = ordenar_df(df, config)

    return df

def ordenar_df(df, data_config):

    df.sort_values(by=[data_config['column_names'][0], data_config['column_names'][1]], inplace=True)
    df.reset_index(drop=True, inplace=True)
            
    # Verificar se o user_id e o item_id não começam com 0
    if df[data_config['column_names'][0]].min() != 0 or df[data_config['column_names'][1]].min() != 0:
        df[data_config['column_names'][0]] = pd.factorize(df[data_config['column_names'][0]])[0]
        df[data_config['column_names'][1]] = pd.factorize(df[data_config['column_names'][1]])[0]
    
    return df

In [24]:
base_name = 'Movielens'
df = load_data(base_name)
data = preprocess_data(df, base_name)
data.head()

NameError: name '__file__' is not defined

Número de user_id únicos: 943
Número de item_id únicos: 1682


In [13]:
data_path = 'preprocess/.data/MusicalInstruments/ratings_Musical_Instruments.csv'
column_names = ['user_id', 'item_id', 'ratings', 'unixReviewTime']
df = pd.read_csv(data_path, names=column_names)

In [14]:
df.head()

Unnamed: 0,user_id,item_id,ratings,unixReviewTime
0,A1YS9MDZP93857,6428320,3.0,1394496000
1,A3TS466QBAWB9D,14072149,5.0,1370476800
2,A3BUDYITWUSIS7,41291905,5.0,1381708800
3,A19K10Z0D2NTZK,41913574,5.0,1285200000
4,A14X336IB4JD89,201891859,1.0,1350432000


In [15]:
def contar_unicos(df):
    num_user_id_unicos = df['user_id'].nunique()
    num_item_id_unicos = df['item_id'].nunique()
    return num_user_id_unicos, num_item_id_unicos

# Contar user_id e item_id únicos
num_user_id_unicos, num_item_id_unicos = contar_unicos(df)

print(f"Número de user_id únicos: {num_user_id_unicos}")
print(f"Número de item_id únicos: {num_item_id_unicos}")

Número de user_id únicos: 339231
Número de item_id únicos: 83046


In [6]:
id_mapping = {id_value: index for index, id_value in enumerate(df['user_id'].unique())}
df['user_id'] = df['user_id'].map(id_mapping)

id_mapping = {id_value: index for index, id_value in enumerate(df['item_id'].unique())}
df['item_id'] = df['item_id'].map(id_mapping)

df.head()

Unnamed: 0,user_id,item_id,ratings,unixReviewTime
0,0,0,3.0,1394496000
1,1,1,5.0,1370476800
2,2,2,5.0,1381708800
3,3,3,5.0,1285200000
4,4,4,1.0,1350432000


In [16]:
if df['user_id'].dtype != 'int' or df['item_id'].dtype != 'int':
    # Remapeia os ids para números de 0 a n-1
    df['user_id'] = pd.factorize(df['user_id'])[0]
    df['item_id'] = pd.factorize(df['item_id'])[0]

In [17]:
df.head()

Unnamed: 0,user_id,item_id,ratings,unixReviewTime
0,0,0,3.0,1394496000
1,1,1,5.0,1370476800
2,2,2,5.0,1381708800
3,3,3,5.0,1285200000
4,4,4,1.0,1350432000


In [11]:
num_user_id_unicos, num_item_id_unicos = contar_unicos(df)

print(f"Número de user_id únicos: {num_user_id_unicos}")
print(f"Número de item_id únicos: {num_item_id_unicos}")

Número de user_id únicos: 339231
Número de item_id únicos: 83046
