In [8]:
import pandas as pd
import os
import gc
import psutil


In [9]:
# Locate the folder holding the CSV exports, relative to the current working directory
current_dir = os.getcwd()

# The data folder lives one level above the current working directory
parent_dir = os.path.dirname(current_dir)

# Relative location of the files. Built with os.path.join rather than a literal
# backslash: '\p' is an invalid escape sequence (a warning today, an error in
# future Python versions) and a backslash is not a path separator outside Windows.
data = os.path.join('data', 'pull-pesquisas-city-2851556')

# Full path to the folder that contains the CSV files
data_dir = os.path.join(parent_dir, data)


In [10]:
# Read every CSV in the data folder and stack them into one DataFrame (combined_df)
dataframes = []

# Running row total across files, used to cross-check the concatenation below
total_rows = 0

try:
    # os.listdir returns names in arbitrary order; sorting makes the row order
    # of the combined frame reproducible between runs and machines
    for file_name in sorted(os.listdir(data_dir)):

        if not file_name.endswith('.csv'):  # Skip anything that is not a CSV
            continue

        file_path = os.path.join(data_dir, file_name)

        # Read the CSV file into a DataFrame and keep it for the concat step
        df = pd.read_csv(file_path)
        dataframes.append(df)

        # Print dimensions of the current file
        print(f"File: {file_name} | Dimensions: {df.shape}")

        # Add the number of rows to the total count
        total_rows += df.shape[0]

except FileNotFoundError:
    # Keep the friendly message, but re-raise: every later cell needs
    # combined_df, so continuing silently would only defer the failure
    print(f"Folder '{data_dir}' not found.")
    raise

# pd.concat raises an opaque error on an empty list; fail with a clear message instead
if not dataframes:
    raise ValueError(f"No CSV files found in '{data_dir}'.")

# Concatenate all DataFrames in the list by binding rows
combined_df = pd.concat(dataframes, ignore_index=True)

# Print dimensions of the combined DataFrame
print(f"Combined DataFrame Dimensions: {combined_df.shape}")

# Verify the sum of per-file rows matches the combined row count
if total_rows == combined_df.shape[0]:
    print("Row count verification successful! Total rows match.")
else:
    print("Row count verification failed! Mismatch in row count.")

print(combined_df.head())  # Display the first few rows of the combined DataFrame

File: data-lake-prd-314410.cz.pull-pesquisas_city_2851556_2020.csv | Dimensions: (1, 22)
File: data-lake-prd-314410.cz.pull-pesquisas_city_2851556_2021.csv | Dimensions: (115243, 22)
File: data-lake-prd-314410.cz.pull-pesquisas_city_2851556_2022.csv | Dimensions: (1247986, 22)
File: data-lake-prd-314410.cz.pull-pesquisas_city_2851556_2023.csv | Dimensions: (1430037, 22)
File: data-lake-prd-314410.cz.pull-pesquisas_city_2851556_2024.csv | Dimensions: (1579543, 22)
Combined DataFrame Dimensions: (4372810, 22)
Row count verification successful! Total rows match.


In [11]:
# Show (row count, column count) of the combined DataFrame
print((len(combined_df.index), len(combined_df.columns)))

(4372810, 22)


In [12]:
# Names of the intermediate variables from the loading cells that are no longer needed
variables_to_delete = ['current_dir', 'parent_dir', 'data_dir', 'dataframes', 'file_name', 'file_path', 'df', 'total_rows', 'data']

# Remove each name from the module namespace; pop() with a default tolerates
# names that do not exist (e.g. when this cell is re-run)
for var in variables_to_delete:
    globals().pop(var, None)

# The loop variable and the list cannot be removed from inside the loop (the
# for statement rebinds `var` on every iteration), so drop them explicitly here
del var, variables_to_delete

print("Memory cleared. Retained variables: combined_df")

Memory cleared. Retained variables: combined_df


In [13]:
def memory_usage():
    """Return the resident set size (RSS) of the current process, in bytes."""
    current_pid = os.getpid()
    return psutil.Process(current_pid).memory_info().rss

# Snapshot the process memory on either side of a forced garbage-collection pass
memory_before = memory_usage()
gc.collect()
memory_after = memory_usage()

# Difference between the two snapshots, in bytes
memory_cleared = memory_before - memory_after

# Report all three figures converted from bytes to MB, one per line
print(
    f"Memory before GC: {memory_before / 1024**2:.2f} MB",
    f"Memory after GC: {memory_after / 1024**2:.2f} MB",
    f"Memory cleared: {memory_cleared / 1024**2:.2f} MB",
    sep="\n",
)

Memory before GC: 907.34 MB
Memory after GC: 907.34 MB
Memory cleared: 0.00 MB


In [14]:
# Smallest and largest Data_ID present in the combined data
data_id_min, data_id_max = (
    combined_df['Data_ID'].min(),
    combined_df['Data_ID'].max(),
)
print(f"Min Data_ID: {data_id_min}, Max Data_ID: {data_id_max}")

Min Data_ID: 20200605, Max Data_ID: 20241231


In [18]:
# Count the distinct (non-null) Hotel_ID values, report the count, then drop the temporary name
distinct_hotel_ids = combined_df['Hotel_ID'].nunique(dropna=True)
print(f"Number of distinct Hotel_IDs: {distinct_hotel_ids}")
del distinct_hotel_ids

Number of distinct Hotel_IDs: 414


In [19]:
# Repeat the before/after memory measurement around a forced garbage collection
memory_before = memory_usage()
gc.collect()
memory_after = memory_usage()

# Difference between the two snapshots, in bytes
memory_cleared = memory_before - memory_after

# Report all three figures converted from bytes to MB, one per line
print(
    f"Memory before GC: {memory_before / 1024**2:.2f} MB",
    f"Memory after GC: {memory_after / 1024**2:.2f} MB",
    f"Memory cleared: {memory_cleared / 1024**2:.2f} MB",
    sep="\n",
)

Memory before GC: 909.36 MB
Memory after GC: 909.36 MB
Memory cleared: 0.00 MB


In [24]:
# Cardinality of the main columns, one line per column.
# Note: the last label reads "Estadias" but the column queried is 'Estadia'.
print(
    f"Number of distinct Hotel_IDs: {combined_df['Hotel_ID'].nunique()}",
    f"Number of distinct Moeda_IDs: {combined_df['Moeda_ID'].nunique()}",
    f"Number of distinct Canal_IDs: {combined_df['Canal_ID'].nunique()}",
    f"Number of distinct Reservas: {combined_df['Reservas'].nunique()}",
    f"Number of distinct DiariaMedia: {combined_df['DiariaMedia'].nunique()}",
    f"Number of distinct Estadias: {combined_df['Estadia'].nunique()}",
    sep="\n",
)

Number of distinct Hotel_IDs: 414
Number of distinct Moeda_IDs: 2
Number of distinct Canal_IDs: 366
Number of distinct Reservas: 56
Number of distinct DiariaMedia: 472813
Number of distinct Estadias: 89


In [21]:
# List the distinct Moeda_ID values present in the data
pd.unique(combined_df['Moeda_ID'])

array([ 16, 109], dtype=int64)

In [22]:
# Moeda_ID whose DiariaMedia values must be rescaled, and the factor applied to them.
# NOTE(review): presumably a currency conversion rate; confirm its source and date.
MOEDA_ID_TO_CONVERT = 16
DIARIA_MEDIA_CONVERSION_FACTOR = 0.16483969339817028

# Vectorised replacement for the row-wise apply(axis=1): rows whose Moeda_ID is
# not the converted one keep their value, all other rows are multiplied by the
# factor. Same result, without a Python-level call for every row.
# WARNING: this cell is not idempotent. Moeda_ID is left untouched, so running
# it twice applies the factor twice. Re-run the loading cell first if needed.
combined_df['DiariaMedia'] = combined_df['DiariaMedia'].where(
    combined_df['Moeda_ID'] != MOEDA_ID_TO_CONVERT,
    combined_df['DiariaMedia'] * DIARIA_MEDIA_CONVERSION_FACTOR,
)

In [30]:
# Keep only the columns needed downstream (all rows retained)
selected_columns_df = combined_df.loc[
    :,
    [
        'Data',
        'Data_ID',
        'Hotel_ID',
        'Ocupacao_ID',
        'DiariaMedia',
        'Estadia',
        'Reservas',
    ],
]

In [31]:
# Count the distinct (non-null) Reservas values in the column subset
print(f"Number of distinct Reservas: {selected_columns_df['Reservas'].nunique(dropna=True)}")

Number of distinct Reservas: 56
