In [1]:
import pandas as pd
import numpy as np  
import os
import gc
import psutil

In [2]:
# Build the path to the input CSVs: <parent of the working directory>/data/other.

# Get the current directory
current_dir = os.getcwd()

# Navigate one folder up
parent_dir = os.path.dirname(current_dir)

# Where the files are located, relative to parent_dir.
# Built with os.path.join instead of the literal 'data\other': '\o' is an
# invalid escape sequence (Python warns about it), and a backslash is only a
# path separator on Windows, so the literal did not resolve to data/other
# on Linux or macOS.
data = os.path.join('data', 'other')

# Navigate down into the "data" folder
data_dir = os.path.join(parent_dir, data)

In [3]:
# Show what is available in the data folder. os.listdir returns entries in
# arbitrary order, so sort them to keep the displayed listing reproducible.
sorted(os.listdir(data_dir))

['data-lake-prd-314410.cz.pull-motivo-indisponibilidade.csv',
 'data-lake-prd-314410.cz.pull-pesquisas_100000_extract.csv',
 'data-lake-prd-314410.cz.pull-pesquisas_average_price_hotel_currency.csv',
 'data-lake-prd-314410.cz.pull-pesquisas_estadia_x_reservas_volume.csv',
 'data-lake-prd-314410.cz.pull-pesquisas_estadia_x_reservas_x_price_receitas.csv',
 'data-lake-prd-314410.cz.pull-pesquisas_lisbon_14days2024_v1.csv',
 'data-lake-prd-314410.cz.pull-pesquisas_lisbon_14days2024_v2.csv',
 'data-lake-prd-314410.cz.pull-pesquisas_lisbon_14days2024_v3.csv',
 'data-lake-prd-314410.cz.pull-pesquisas_lisbon_14days2024_v4.csv',
 'data-lake-prd-314410.cz.pull-pesquisas_lisbon_14days2024_v5.csv',
 'data-lake-prd-314410.cz.pull-pesquisas_lisbon_14days2024_v6.csv',
 'data-lake-prd-314410.cz.pull-pesquisas_lisbon_14days2024_v7.csv',
 'hotel_city_chanel_combin_extract.csv',
 'pull-pesquisas-cidade-ultimos-30-dias_2025_notnull.csv',
 'pull-pesquisas-hotel-ultimos-30-dias_1000random.csv',
 'pull-pesqu

In [4]:
# Filter the list of files for those containing "lisbon_14days".
# os.listdir returns entries in arbitrary order; sorting fixes the order in
# which the files are later loaded and concatenated, so the row order of the
# combined data does not depend on the operating system or filesystem.
filtered_files = sorted(
    name for name in os.listdir(data_dir) if "lisbon_14days" in name
)
print(filtered_files)

['data-lake-prd-314410.cz.pull-pesquisas_lisbon_14days2024_v1.csv', 'data-lake-prd-314410.cz.pull-pesquisas_lisbon_14days2024_v2.csv', 'data-lake-prd-314410.cz.pull-pesquisas_lisbon_14days2024_v3.csv', 'data-lake-prd-314410.cz.pull-pesquisas_lisbon_14days2024_v4.csv', 'data-lake-prd-314410.cz.pull-pesquisas_lisbon_14days2024_v5.csv', 'data-lake-prd-314410.cz.pull-pesquisas_lisbon_14days2024_v6.csv', 'data-lake-prd-314410.cz.pull-pesquisas_lisbon_14days2024_v7.csv']


In [None]:
# Load every selected CSV, stack them row-wise into `combined_df`, and check
# that the combined row count equals the sum of the per-file row counts.
# Reads `filtered_files` and `data_dir`, which are defined in earlier cells.

# One DataFrame per CSV file, in load order
dataframes = []

# Running sum of per-file row counts, compared with the combined frame below
total_rows = 0

try:
    for file_name in filtered_files:
        # Skip anything that is not a CSV file
        if not file_name.endswith('.csv'):
            continue

        file_path = os.path.join(data_dir, file_name)

        # Read the CSV file into a DataFrame and keep it for the final concat
        df = pd.read_csv(file_path)
        dataframes.append(df)

        # Print dimensions of the current file
        print(f"File: {file_name} | Dimensions: {df.shape}")

        # Add the number of rows to the total count
        total_rows += df.shape[0]

    # pd.concat raises a generic "No objects to concatenate" on an empty list;
    # fail with a message that names the folder instead.
    if not dataframes:
        raise ValueError(f"No CSV files to load from '{data_dir}'.")

    # Concatenate all DataFrames in the list by binding rows
    combined_df = pd.concat(dataframes, ignore_index=True)

    # Print dimensions of the combined DataFrame
    print(f"Combined DataFrame Dimensions: {combined_df.shape}")

    # Verify the sum of rows matches
    if total_rows == combined_df.shape[0]:
        print("Row count verification successful! Total rows match.")
    else:
        print("Row count verification failed! Mismatch in row count.")

    print(combined_df.head())  # Display the first few rows of the combined DataFrame

except FileNotFoundError as e:
    # The error comes from reading a specific file, so report the path that
    # was actually missing (falling back to the folder if none is attached).
    missing_path = e.filename or data_dir
    print(f"Path '{missing_path}' not found.")
    # Re-raise so the notebook stops here instead of continuing with an
    # undefined (or stale, from an earlier run) `combined_df`.
    raise
except Exception as e:
    print(f"An error occurred: {e}")
    # Re-raise to keep the traceback and halt later cells that need `combined_df`.
    raise

File: data-lake-prd-314410.cz.pull-pesquisas_lisbon_14days2024_v1.csv | Dimensions: (7467152, 8)
File: data-lake-prd-314410.cz.pull-pesquisas_lisbon_14days2024_v2.csv | Dimensions: (7817567, 8)
File: data-lake-prd-314410.cz.pull-pesquisas_lisbon_14days2024_v3.csv | Dimensions: (6803018, 8)
File: data-lake-prd-314410.cz.pull-pesquisas_lisbon_14days2024_v4.csv | Dimensions: (6548245, 8)
File: data-lake-prd-314410.cz.pull-pesquisas_lisbon_14days2024_v5.csv | Dimensions: (7413623, 8)
File: data-lake-prd-314410.cz.pull-pesquisas_lisbon_14days2024_v6.csv | Dimensions: (7442312, 8)
