In [1]:
import os
import pandas as pd
from dotenv import load_dotenv

In [2]:
# Load environment variables from a .env file so that later cells can read
# them with os.getenv() (e.g. DIR_NAME, which selects the data sub-directory).
load_dotenv()

True

In [3]:
def structure_ids(tables, directory=None):
    """
    Retrieves user IDs from multiple raw CSV tables and returns them as one list.

    Each table is read from ``../../data/raw/<directory>/<table>.csv`` using
    ``latin1`` encoding. The values of the first column of every file are
    appended, in file order, to a single flat list. Duplicates are kept.

    Parameters:
    - tables (list): A list of table names (CSV file names without ``.csv``).
    - directory (str, optional): Sub-directory of ``../../data/raw`` to read
      from. When omitted, the module-level ``dir_name`` is used, so existing
      calls of the form ``structure_ids(tables)`` keep working.

    Returns:
    - ids (list): First-column values of all tables, concatenated in the
      order given by ``tables``.
    """
    # Fall back to the notebook-level setting only when no directory is passed.
    source_dir = dir_name if directory is None else directory

    ids = []

    for table in tables:
        data = pd.read_csv(f'../../data/raw/{source_dir}/{table}.csv', encoding='latin1')
        # Select the first column directly instead of iterating ``data.values``:
        # ``.values`` coerces every column to one common dtype, so integer IDs
        # would become floats whenever the table also contains a float column.
        ids.extend(data.iloc[:, 0].tolist())

    return ids


# Name of the data sub-directory, taken from the environment.
dir_name = os.getenv('DIR_NAME')
if dir_name is None:
    # os.getenv returns None for an unset variable; without this check the
    # f-string paths below would silently point at a directory named "None".
    raise RuntimeError(
        "Environment variable DIR_NAME is not set; define it (e.g. in the .env file) "
        "before running this cell."
    )

# Raw tables whose first column is collected into a single list of user IDs.
tables = ['maputo', 'matola', 'mocambique', 'moçambique', 'mozambique']
ids = structure_ids(tables)

new_data = pd.DataFrame({'user_id': ids})

print(new_data.head())
print('---------------------------------------------')

# Check the number of rows and columns in the data
n_rows, n_columns = new_data.shape
print(f"Number of rows: {n_rows}")
print(f"Number of columns: {n_columns}")
print('---------------------------------------------')

# Report how many rows are duplicates before removing them
print(new_data.duplicated().sum())
print('---------------------------------------------')

# Keep one row per user ID, ordered by ID
new_data = new_data.drop_duplicates().sort_values(by='user_id')

print(new_data)
print('---------------------------------------------')

# Make sure the target directory exists; to_csv does not create missing folders.
output_dir = f'../../data/processed/{dir_name}'
os.makedirs(output_dir, exist_ok=True)
new_data.to_csv(f'{output_dir}/users_ids.csv', index=False, encoding='latin1')


   user_id
0   346687
1   404506
2   705308
3   755840
4   858200
---------------------------------------------
Number of rows: 2362
Number of columns: 1
---------------------------------------------
632
---------------------------------------------
        user_id
0        346687
1        404506
1364     510444
2        705308
3        755840
...         ...
1359  159833173
1106  161128312
1107  161218651
1360  164799434
1361  165147195

[1730 rows x 1 columns]
---------------------------------------------
