In [30]:
import pandas as pd
import numpy as np  
import os
import gc
import psutil
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm

In [31]:
# Resolve the data folder relative to the notebook's working directory.
current_dir = os.getcwd()

# The data folder lives one level above the working directory.
parent_dir = os.path.dirname(current_dir)

# Sub-folder (relative to parent_dir) where the files are located.
# Built with os.path.join instead of the literal 'data\other': "\o" is an
# invalid escape sequence in a regular string literal, and a hard-coded
# backslash is only a path separator on Windows.
data = os.path.join('data', 'other')

# Full path of the folder that the following cells list and read from.
data_dir = os.path.join(parent_dir, data)

In [None]:
# List the entries of the data folder (displayed as the cell output).
os.listdir(data_dir)

In [None]:
# Keep only the entries of the data folder whose name contains "lisbon_14days".
# os.listdir returns names in arbitrary order, so they are sorted to make the
# load order (and therefore the row order of the combined frame) reproducible.
filtered_files = sorted(file for file in os.listdir(data_dir) if "lisbon_14days" in file)
print(filtered_files)

In [None]:
# Load every matching CSV file and stack them row-wise into `combined_df`.
#
# Errors are deliberately NOT swallowed here: previously any exception was
# printed and execution continued, which left `combined_df` undefined (or
# stale from an earlier run) and made every later cell fail or silently use
# old data.

# Only CSV files are read; other matching entries are ignored.
csv_files = [file_name for file_name in filtered_files if file_name.endswith('.csv')]

# pd.concat raises an opaque "No objects to concatenate" on an empty list,
# so fail early with a message that names the folder being searched.
if not csv_files:
    raise FileNotFoundError(
        f"No CSV file containing 'lisbon_14days' was found in '{data_dir}'."
    )

# One DataFrame per file, in load order
dataframes = []

# Running total of rows read, used to cross-check the concatenation
total_rows = 0

for file_name in csv_files:
    file_path = os.path.join(data_dir, file_name)

    # Read the CSV file into a DataFrame
    df = pd.read_csv(file_path)
    dataframes.append(df)

    # Print dimensions of the current file
    print(f"File: {file_name} | Dimensions: {df.shape}")

    # Add the number of rows to the total count
    total_rows += df.shape[0]

# Concatenate all DataFrames in the list by binding rows
combined_df = pd.concat(dataframes, ignore_index=True)

# Print dimensions of the combined DataFrame
print(f"Combined DataFrame Dimensions: {combined_df.shape}")

# Verify the sum of rows matches
if total_rows == combined_df.shape[0]:
    print("Row count verification successful! Total rows match.")
else:
    print("Row count verification failed! Mismatch in row count.")

print(combined_df.head())  # Display the first few rows of the combined DataFrame

In [None]:
# Describe 'Requests_Com_Disponibilidade' restricted to the rows where it equals 0
zero_availability_mask = combined_df['Requests_Com_Disponibilidade'] == 0
combined_df.loc[zero_availability_mask, 'Requests_Com_Disponibilidade'].describe()

In [None]:
# Display the combined DataFrame as the cell output.
combined_df

In [None]:
# Descriptive statistics of the combined DataFrame (pandas `describe` defaults),
# printed as plain text.
summary_statistics = combined_df.describe()
print(summary_statistics)

In [None]:
# Distinct values found in the 'Ocupacao_ID' column.
combined_df['Ocupacao_ID'].unique()

In [None]:
# Distinct values found in the 'Hotel_ID' column.
combined_df['Hotel_ID'].unique()

In [None]:
# Horizontal box plot showing the spread of the 'Antecedencia' column
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x=combined_df['Antecedencia'], color="lightblue", ax=ax)
ax.set_title('Box Plot for Antecedencia')
ax.set_xlabel('Antecedencia')
plt.show()

In [None]:
# Horizontal box plot showing the spread of the 'Estadia' column
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x=combined_df['Estadia'], color="lightblue", ax=ax)
ax.set_title('Box Plot for Estadia')
ax.set_xlabel('Estadia')
plt.show()

In [None]:
# Horizontal box plot showing the spread of the 'DiariaMedia' column
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x=combined_df['DiariaMedia'], color="lightblue", ax=ax)
ax.set_title('Box Plot for DiariaMedia')
ax.set_xlabel('DiariaMedia')
plt.show()

In [None]:
# Print the column totals, one per line: all requests first, then the
# 'Requests_Com_Disponibilidade' total.
for request_column in ('Requests', 'Requests_Com_Disponibilidade'):
    print(combined_df[request_column].sum())

In [None]:
# Sum 'Requests' and 'Requests_Com_Disponibilidade' per 'Antecedencia' value.
# Only the two columns needed for the ratio are aggregated: summing every
# column of the frame is wasteful and, for non-numeric columns, either
# concatenates strings or raises, depending on the dtype.
grouped_df = (
    combined_df
    .groupby('Antecedencia')[['Requests', 'Requests_Com_Disponibilidade']]
    .sum()
)

# Request hit rate = 'Requests_Com_Disponibilidade' total / 'Requests' total.
# Groups with zero requests yield NaN/inf here.
grouped_df['Request_Hit_Rate'] = grouped_df['Requests_Com_Disponibilidade'] / grouped_df['Requests']

# Display the result
print(grouped_df[['Request_Hit_Rate']])

In [None]:
# Bar chart of the hit rate per 'Antecedencia' value, with a LOWESS trend line on top
fig, ax = plt.subplots(figsize=(9, 9))
ax.bar(
    grouped_df.index,
    grouped_df['Request_Hit_Rate'],
    label='Request Convertion/Acceptance Rate',
    color="lightblue",
)

# LOWESS fit of the hit rate against the group index; the first column of the
# result is plotted as x and the second as y.
lowess = sm.nonparametric.lowess
smoothed = lowess(grouped_df['Request_Hit_Rate'], grouped_df.index, frac=0.4)
smoothed_x, smoothed_y = smoothed[:, 0], smoothed[:, 1]
ax.plot(smoothed_x, smoothed_y, color='firebrick', label='LOWESS Smoothed Fit Line', linewidth=3)

ax.set_title('Request Hit Rate by Antecedencia')
ax.set_xlabel('\n Time in Advance (days)')
ax.set_ylabel('Request Convertion/Acceptance Rate \n')
ax.legend()
plt.show()

In [None]:
# Load the "pull-02-pesquisas" extract for hotel 1746.
# The path is built from `data_dir` instead of a hard-coded absolute
# 'c:\Users\...' location so the notebook runs on other machines.
# NOTE(review): assumes `data_dir` resolves to the same '...\data\other'
# folder the absolute path pointed at — confirm.
data_lake_prd_314410_rz_pull_02_pesquisas_28_Jan_2025_Hotel1746 = pd.read_csv(
    os.path.join(data_dir, 'data-lake-prd-314410.rz.pull-02-pesquisas_28_Jan_2025_Hotel1746.csv')
)
data_lake_prd_314410_rz_pull_02_pesquisas_28_Jan_2025_Hotel1746

In [None]:
# Number of distinct 'Request_ID' values in the hotel 1746 pesquisas extract.
data_lake_prd_314410_rz_pull_02_pesquisas_28_Jan_2025_Hotel1746['Request_ID'].nunique()

In [None]:
# Print the plain (unweighted) mean of each column of interest for hotel 1746
for column_name in ('Estadia', 'Antecedencia', 'ValorTotal'):
    print(data_lake_prd_314410_rz_pull_02_pesquisas_28_Jan_2025_Hotel1746[column_name].mean())

In [None]:

# Three side-by-side panels for hotel 1746: the distribution of 'Estadia',
# 'Antecedencia' and 'ValorTotal', each with its mean marked by a dashed line.
#
# Previously only the vertical mean line was drawn, so the axes labelled
# "Frequency" were empty; the histogram of each column is now plotted as well.
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

mean_estadia = data_lake_prd_314410_rz_pull_02_pesquisas_28_Jan_2025_Hotel1746['Estadia'].mean()
mean_antecedencia = data_lake_prd_314410_rz_pull_02_pesquisas_28_Jan_2025_Hotel1746['Antecedencia'].mean()
mean_valortotal = data_lake_prd_314410_rz_pull_02_pesquisas_28_Jan_2025_Hotel1746['ValorTotal'].mean()

# (column, mean, histogram colour) for each panel, left to right
panels = [
    ('Estadia', mean_estadia, 'skyblue'),
    ('Antecedencia', mean_antecedencia, 'lightgreen'),
    ('ValorTotal', mean_valortotal, 'salmon'),
]

for ax, (column_name, column_mean, panel_color) in zip(axes, panels):
    # Missing values are dropped so they do not reach the histogram binning
    column_values = data_lake_prd_314410_rz_pull_02_pesquisas_28_Jan_2025_Hotel1746[column_name].dropna()
    ax.hist(column_values, bins=50, color=panel_color, alpha=0.7)

    # The mean line uses a contrasting colour so it stays visible over the bars
    ax.axvline(column_mean, color='black', linestyle='--', label=f'Mean = {column_mean:.2f}')
    ax.set_title(f'Mean of {column_name}')
    ax.set_xlabel(column_name)
    ax.set_ylabel('Frequency')
    ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Row count per 'Ocupacao_ID' for hotel 1746, largest group first
rows_per_ocupacao = data_lake_prd_314410_rz_pull_02_pesquisas_28_Jan_2025_Hotel1746.groupby('Ocupacao_ID').size()
rows_per_ocupacao.sort_values(ascending=False)


In [None]:
# NOTE(review): hard-coded ratio. The two counts are presumably copied from the
# Ocupacao_ID group sizes shown by the previous cell (largest group / total
# rows) — confirm, and prefer computing it from the DataFrame.
6510391/9136497 

In [None]:
# Row count per 'Ocupacao_ID' for hotel 1746, ordered from most to least frequent
ocupacao_counts = (
    data_lake_prd_314410_rz_pull_02_pesquisas_28_Jan_2025_Hotel1746
    .groupby('Ocupacao_ID')
    .size()
    .sort_values(ascending=False)
)

# Bar chart of those counts
fig, ax = plt.subplots(figsize=(12, 8))
ocupacao_counts.plot(kind='bar', color='lightblue', ax=ax)
ax.set_title('Number of Requests by Ocupacao_ID for \n Hotel 1746 (Grand Mercure Rio De Janeiro Copacabana)')
ax.set_xlabel('Ocupacao_ID')
ax.set_ylabel('Number of Requests')
plt.show()

In [None]:
# Load the "pull-01-response" extract for hotel 1746.
# The path is built from `data_dir` instead of a hard-coded absolute
# 'c:\Users\...' location so the notebook runs on other machines.
# NOTE(review): assumes `data_dir` resolves to the same '...\data\other'
# folder the absolute path pointed at — confirm.
data_lake_prd_314410_rz_pull_01_response_28_Jan_2025_Hotel1746 = pd.read_csv(
    os.path.join(data_dir, 'data-lake-prd-314410.rz.pull-01-response_28_Jan_2025_Hotel1746.csv')
)
data_lake_prd_314410_rz_pull_01_response_28_Jan_2025_Hotel1746

In [None]:
# Number of distinct 'RequestID' values in the hotel 1746 response extract.
data_lake_prd_314410_rz_pull_01_response_28_Jan_2025_Hotel1746['RequestID'].nunique()

In [None]:
# Number of distinct 'RequestID' values among the rows where 'Disponivel' equals 1
is_available = data_lake_prd_314410_rz_pull_01_response_28_Jan_2025_Hotel1746['Disponivel'] == 1
data_lake_prd_314410_rz_pull_01_response_28_Jan_2025_Hotel1746.loc[is_available, 'RequestID'].nunique()

In [None]:
# NOTE(review): hard-coded ratio. Presumably the two distinct-RequestID counts
# from the previous cells (with 'Disponivel' == 1 / all) — confirm, and prefer
# computing it from the DataFrame.
76/113

In [None]:
# Number of distinct 'TarifaID' values in the hotel 1746 response extract.
data_lake_prd_314410_rz_pull_01_response_28_Jan_2025_Hotel1746['TarifaID'].nunique()

In [None]:
# Row count per 'TarifaID' in the hotel 1746 response extract
responses_by_tarifa = data_lake_prd_314410_rz_pull_01_response_28_Jan_2025_Hotel1746.groupby('TarifaID')
tarifa_id_counts = responses_by_tarifa.size()

# Show the counts
print(tarifa_id_counts)

In [None]:
# Mean of 'Disponivel' per 'TarifaID' in the hotel 1746 response extract
disponivel_by_tarifa = data_lake_prd_314410_rz_pull_01_response_28_Jan_2025_Hotel1746.groupby('TarifaID')['Disponivel']
mean_disponivel_by_tarifa = disponivel_by_tarifa.mean()

# Show the per-tariff means
print(mean_disponivel_by_tarifa)

In [None]:
# Attach the tariff lookup to the per-tariff availability means.

# The lookup table is loaded here because this cell runs before the later cell
# that also reads it; without this a fresh "Restart & Run All" stops with a
# NameError. The path is built from `parent_dir` rather than an absolute path.
# NOTE(review): assumes the lookup lives in '<parent_dir>/data/lookups' — confirm.
data_lake_prd_314410_cz_hoteis_tarifas = pd.read_csv(
    os.path.join(parent_dir, 'data', 'lookups', 'data-lake-prd-314410.cz.hoteis-tarifas.csv')
)

# Turn the grouped Series into a frame with a 'TarifaID' column. The guard
# keeps the cell idempotent: re-running it no longer stacks extra
# 'index'/'level_0' columns onto an already-reset frame.
if isinstance(mean_disponivel_by_tarifa, pd.Series):
    mean_disponivel_by_tarifa = mean_disponivel_by_tarifa.reset_index()

# Left join: every tariff of the response data is kept, matched or not
merged_df = pd.merge(mean_disponivel_by_tarifa, data_lake_prd_314410_cz_hoteis_tarifas, how='left', left_on='TarifaID', right_on='Tarifa_ID')

# Display the result
print(merged_df[['TarifaID', 'Tarifa']])

In [None]:
# Mean of 'Disponivel' per 'TarifaID' in the hotel 1746 response extract.
# NOTE(review): the previous comment mentioned 'Antecedencia', but the code
# groups by 'TarifaID' — confirm which grouping was intended.
disponivel_by_tarifa = data_lake_prd_314410_rz_pull_01_response_28_Jan_2025_Hotel1746.groupby('TarifaID')['Disponivel']
mean_disponivel_by_tarifa = disponivel_by_tarifa.mean()

# Show the per-tariff means
print(mean_disponivel_by_tarifa)

In [None]:
# Load the "pull-02-pesquisas" extract for hotel 5998.
# The path is built from `data_dir` instead of a hard-coded absolute
# 'c:\Users\...' location so the notebook runs on other machines.
# NOTE(review): assumes `data_dir` resolves to the same '...\data\other'
# folder the absolute path pointed at — confirm.
data_lake_prd_314410_rz_pull_02_pesquisas_28_Jan_2025_Hotel5998 = pd.read_csv(
    os.path.join(data_dir, 'data-lake-prd-314410.rz.pull-02-pesquisas_28_Jan_2025_Hotel5998.csv')
)
data_lake_prd_314410_rz_pull_02_pesquisas_28_Jan_2025_Hotel5998

In [None]:
# Print the plain (unweighted) mean of each column of interest for hotel 5998
for column_name in ('Estadia', 'Antecedencia', 'ValorTotal'):
    print(data_lake_prd_314410_rz_pull_02_pesquisas_28_Jan_2025_Hotel5998[column_name].mean())

In [None]:
# Row count per 'Ocupacao_ID' for hotel 5998, largest group first
rows_per_ocupacao = data_lake_prd_314410_rz_pull_02_pesquisas_28_Jan_2025_Hotel5998.groupby('Ocupacao_ID').size()
rows_per_ocupacao.sort_values(ascending=False)


In [None]:
# NOTE(review): hard-coded ratio. The two counts are presumably copied from the
# Ocupacao_ID group sizes shown by the previous cell (largest group / total
# rows) — confirm, and prefer computing it from the DataFrame.
1895102/2444544 

In [None]:
# Row count per 'Ocupacao_ID' for hotel 5998, ordered from most to least frequent
ocupacao_counts = (
    data_lake_prd_314410_rz_pull_02_pesquisas_28_Jan_2025_Hotel5998
    .groupby('Ocupacao_ID')
    .size()
    .sort_values(ascending=False)
)

# Bar chart of those counts
fig, ax = plt.subplots(figsize=(12, 8))
ocupacao_counts.plot(kind='bar', color='lightblue', ax=ax)
ax.set_title('Number of Requests by Ocupacao_ID for \n Hotel 5998 (B&B Hotels Rio de Janeiro Copacabana)')
ax.set_xlabel('Ocupacao_ID')
ax.set_ylabel('Number of Requests')
plt.show()

In [None]:
# Load the "pull-01-response" extract for hotel 5998.
# The path is built from `data_dir` instead of a hard-coded absolute
# 'c:\Users\...' location so the notebook runs on other machines.
# NOTE(review): assumes `data_dir` resolves to the same '...\data\other'
# folder the absolute path pointed at — confirm.
data_lake_prd_314410_rz_pull_01_response_28_Jan_2025_Hotel5998 = pd.read_csv(
    os.path.join(data_dir, 'data-lake-prd-314410.rz.pull-01-response_28_Jan_2025_Hotel5998.csv')
)
data_lake_prd_314410_rz_pull_01_response_28_Jan_2025_Hotel5998

In [None]:
# Load the hotel tariff lookup table.
# The path is built from `parent_dir` instead of a hard-coded absolute
# 'c:\Users\...' location so the notebook runs on other machines.
# NOTE(review): assumes `parent_dir` is the folder that contains
# 'data\lookups', as in the absolute path it replaces — confirm.
data_lake_prd_314410_cz_hoteis_tarifas = pd.read_csv(
    os.path.join(parent_dir, 'data', 'lookups', 'data-lake-prd-314410.cz.hoteis-tarifas.csv')
)
data_lake_prd_314410_cz_hoteis_tarifas

In [None]:
# Row count per 'TarifaID' in the hotel 5998 response extract
responses_by_tarifa = data_lake_prd_314410_rz_pull_01_response_28_Jan_2025_Hotel5998.groupby('TarifaID')
tarifa_id_counts = responses_by_tarifa.size()

# Show the counts
print(tarifa_id_counts)

In [None]:
# Mean of 'Disponivel' per 'TarifaID' in the hotel 5998 response extract
disponivel_by_tarifa = data_lake_prd_314410_rz_pull_01_response_28_Jan_2025_Hotel5998.groupby('TarifaID')['Disponivel']
mean_disponivel_by_tarifa = disponivel_by_tarifa.mean()

# Show the per-tariff means
print(mean_disponivel_by_tarifa)

In [None]:
# Attach the tariff lookup to the per-tariff availability means.

# Turn the grouped Series into a frame with a 'TarifaID' column. The guard
# keeps the cell idempotent: re-running it no longer stacks extra
# 'index'/'level_0' columns onto an already-reset frame (which eventually
# raises on repeated runs).
if isinstance(mean_disponivel_by_tarifa, pd.Series):
    mean_disponivel_by_tarifa = mean_disponivel_by_tarifa.reset_index()

# Left join: every tariff of the response data is kept, matched or not
merged_df = pd.merge(mean_disponivel_by_tarifa, data_lake_prd_314410_cz_hoteis_tarifas, how='left', left_on='TarifaID', right_on='Tarifa_ID')

# Display the result
print(merged_df[['TarifaID', 'Tarifa']])

In [None]:
# Print the number of distinct 'RequestID' values per response extract
# (hotel 1746 first, then hotel 5998)
for hotel_responses in (
    data_lake_prd_314410_rz_pull_01_response_28_Jan_2025_Hotel1746,
    data_lake_prd_314410_rz_pull_01_response_28_Jan_2025_Hotel5998,
):
    print(hotel_responses['RequestID'].nunique())

In [None]:
# Inner join on 'RequestID' to find the request IDs present in both hotels'
# response extracts.
# Each side is de-duplicated first: joining the raw columns is a many-to-many
# merge that emits one row per pair of matching rows, which can inflate the
# result far beyond either input. The set of distinct 'RequestID' values in
# the result is unchanged.
merged_df = pd.merge(
    data_lake_prd_314410_rz_pull_01_response_28_Jan_2025_Hotel1746[['RequestID']].drop_duplicates(),
    data_lake_prd_314410_rz_pull_01_response_28_Jan_2025_Hotel5998[['RequestID']].drop_duplicates(),
    on='RequestID',
)


In [None]:
# Number of distinct 'RequestID' values in the joined frame.
merged_df['RequestID'].nunique()

In [None]:
# Load the "cz.pull-pesquisas" extract for hotel 1746 and preview its first rows.
# The path is built from `data_dir` instead of a hard-coded absolute
# 'c:\Users\...' location so the notebook runs on other machines.
# NOTE(review): assumes `data_dir` resolves to the same '...\data\other'
# folder the absolute path pointed at — confirm.
data_lake_prd_314410_cz_pull_pesquisas_28_01_2025_Hotel1746 = pd.read_csv(
    os.path.join(data_dir, 'data-lake-prd-314410.cz.pull-pesquisas_28_01_2025_Hotel1746.csv')
)
data_lake_prd_314410_cz_pull_pesquisas_28_01_2025_Hotel1746.head()

In [None]:
# Print the column totals for hotel 1746, one per line, in this order
total_columns = ('Reservas', 'Requests', 'Requests_Sem_Response', 'Requests_Com_Disponibilidade')
for total_column in total_columns:
    print(data_lake_prd_314410_cz_pull_pesquisas_28_01_2025_Hotel1746[total_column].sum())

In [None]:
# Keep only the rows of the hotel 1746 extract that have a 'Reservas' value
has_reservas = data_lake_prd_314410_cz_pull_pesquisas_28_01_2025_Hotel1746['Reservas'].notna()
filtered_df = data_lake_prd_314410_cz_pull_pesquisas_28_01_2025_Hotel1746[has_reservas]

# 'Reservas' is the weight of both averages below
reservas_weights = filtered_df['Reservas']
weighted_mean_estadia = np.average(filtered_df['Estadia'], weights=reservas_weights)
weighted_mean_diariamedia = np.average(filtered_df['DiariaMedia'], weights=reservas_weights)

print(weighted_mean_estadia)
print(weighted_mean_diariamedia)

In [None]:
# Load the "cz.pull-pesquisas" extract for hotel 5998 and preview its first rows.
# The path is built from `data_dir` instead of a hard-coded absolute
# 'c:\Users\...' location so the notebook runs on other machines.
# NOTE(review): assumes `data_dir` resolves to the same '...\data\other'
# folder the absolute path pointed at — confirm.
data_lake_prd_314410_cz_pull_pesquisas_28_01_2025_Hotel5998 = pd.read_csv(
    os.path.join(data_dir, 'data-lake-prd-314410.cz.pull-pesquisas_28_01_2025_Hotel5998.csv')
)
data_lake_prd_314410_cz_pull_pesquisas_28_01_2025_Hotel5998.head()

In [None]:
# Print the column totals for hotel 5998, one per line, in this order
total_columns = ('Reservas', 'Requests', 'Requests_Sem_Response', 'Requests_Com_Disponibilidade')
for total_column in total_columns:
    print(data_lake_prd_314410_cz_pull_pesquisas_28_01_2025_Hotel5998[total_column].sum())

In [None]:
# Keep only the rows of the hotel 5998 extract that have a 'Reservas' value
has_reservas = data_lake_prd_314410_cz_pull_pesquisas_28_01_2025_Hotel5998['Reservas'].notna()
filtered_df = data_lake_prd_314410_cz_pull_pesquisas_28_01_2025_Hotel5998[has_reservas]

# 'Reservas' is the weight of both averages below
reservas_weights = filtered_df['Reservas']
weighted_mean_estadia = np.average(filtered_df['Estadia'], weights=reservas_weights)
weighted_mean_diariamedia = np.average(filtered_df['DiariaMedia'], weights=reservas_weights)

print(weighted_mean_estadia)
print(weighted_mean_diariamedia)