In [None]:
# This notebook performs update detection for the Hotel Details dataset.

In [None]:
# Import necessary libraries
import hashlib
import os
from datetime import datetime

import pandas as pd

In [None]:


# def load_data(filepath, input_template_path=None):
#     try:
#         # Try loading the existing data
#         df = pd.read_csv(filepath, parse_dates=['last_updated'])
#     except FileNotFoundError:
#         print(f"No file found at {filepath}. Initializing an empty DataFrame based on the template path.")
#         # If no file is found and a template path is provided, create a DataFrame with the same structure
#         if input_template_path:
#             # Load only headers from the input template
#             temp_df = pd.read_csv(input_template_path, nrows=0)
#             columns = temp_df.columns.tolist()  # Get all column names from the input dataset
#         else:
#             # Fallback to a default set of columns if no input template path is provided
#             columns = ['hotel_id', 'name', 'location', 'description']  # Default columns if no template is available

#         # Initialize an empty DataFrame with the structure derived from the input dataset
#         df = pd.DataFrame(columns=columns)
#         df['last_updated'] = pd.to_datetime(datetime.now())  # Ensure this column is added and set to now
#         df['data_hash'] = pd.NA  # Initialize data_hash as missing
        
#     return df

def load_data(filepath, input_template_path=None):
    """Read a CSV into a DataFrame, or build an empty frame when it is missing.

    Parameters
    ----------
    filepath : path of the CSV file to read.
    input_template_path : optional path of a CSV whose header row supplies the
        column names of the empty frame created when ``filepath`` is missing.

    Returns
    -------
    pandas.DataFrame
        * File found: its contents, with ``last_updated`` parsed as datetimes,
          or filled with the current time when the column is absent.
        * File missing: an empty frame with the template's columns (or the
          default hotel columns) plus ``last_updated`` and ``data_hash``.
    """
    try:
        loaded = pd.read_csv(filepath)
    except FileNotFoundError:
        print(f"No file found at {filepath}. Initializing based on the input template.")
        if input_template_path:
            # nrows=0 reads just the header row, which is all we need here.
            header_only = pd.read_csv(input_template_path, nrows=0)
            column_names = header_only.columns
        else:
            column_names = ['hotel_id', 'name', 'location', 'description']
        placeholder = pd.DataFrame(columns=column_names)
        placeholder['last_updated'] = pd.to_datetime(datetime.now())
        placeholder['data_hash'] = pd.NA
        return placeholder

    if 'last_updated' in loaded.columns:
        stamps = pd.to_datetime(loaded['last_updated'])
    else:
        # No timestamp stored in the file: stamp every row with "now".
        stamps = pd.to_datetime(datetime.now())
    loaded['last_updated'] = stamps
    return loaded

In [None]:
# Function to generate hash
def generate_hash(row, exclude=("last_updated", "data_hash")):
    """Return the SHA-256 hex digest of a row's content columns.

    Parameters
    ----------
    row : pandas.Series
        One record, e.g. as passed by ``DataFrame.apply(..., axis=1)``.
    exclude : iterable of column labels, optional
        Bookkeeping columns left out of the digest. ``last_updated`` is skipped
        so a timestamp refresh alone does not look like a content change;
        ``data_hash`` is skipped so hashing a row that already carries a hash
        gives the same digest as hashing the bare row.

    Returns
    -------
    str
        Hexadecimal SHA-256 digest of the remaining values.
    """
    skipped = set(exclude)
    # NOTE: values are joined without a delimiter, in index order. This is kept
    # as-is so digests stay comparable with hashes that were already stored.
    concatenated_details = ''.join(str(row[col]) for col in row.index if col not in skipped)
    return hashlib.sha256(concatenated_details.encode('utf-8')).hexdigest()


In [None]:
# Function to apply hashes
def apply_hashes(df):
    """Add a content hash and a fresh timestamp to every row of ``df``.

    The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Records to hash.

    Returns
    -------
    pandas.DataFrame
        The same object, with ``data_hash`` set to each row's hash and
        ``last_updated`` set to the current time.
    """
    # Drop any existing 'data_hash' before hashing; otherwise the old hash is
    # fed into the new one and re-running this on the same frame changes every
    # hash even though the data did not change.
    content = df.drop(columns=['data_hash'], errors='ignore')
    df['data_hash'] = content.apply(generate_hash, axis=1)
    df['last_updated'] = pd.to_datetime(datetime.now())
    return df

# def apply_hashes(df):
#     def generate_hash(row):
#         hash_obj = hashlib.sha256()
#         concatenated_details = ''.join(str(row[col]) for col in sorted(row.index) if col != 'last_updated')
#         hash_obj.update(concatenated_details.encode('utf-8'))
#         return hash_obj.hexdigest()
    
#     df['data_hash'] = df.apply(generate_hash, axis=1)
#     df['last_updated'] = pd.to_datetime(datetime.now())
#     return df


In [None]:
# Function to detect changes
# def detect_changes(previous_df, current_df):
#     combined_df = pd.merge(previous_df, current_df, on='hotel_id', suffixes=('_prev', '_curr'))
#     changes = combined_df[(combined_df['data_hash_prev'] != combined_df['data_superseded']) | (combined_df['last_updated_prev'] < combined_df['last_updated_curr'])]
#     return changes



# def detect_changes(previous_df, current_df):
#     combined_df = pd.merge(previous_df, current_df, on='hotel_id', how='outer', suffixes=('_prev', '_curr'))
#     changes = combined_df[(combined_df['data_hash_prev'] != combined_df['data_hash_curr']) | 
#                           (combined_df['last_updated_prev'] < combined_df['last_inputd_curr'])]
#     return changes

# Function to detect changes and update
def detect_changes(previous_df, current_df):
    """Return the records that differ between two snapshots.

    Parameters
    ----------
    previous_df : pandas.DataFrame
        Previously stored snapshot; must contain ``hotel_id`` unless empty.
    current_df : pandas.DataFrame
        Current snapshot; must contain ``hotel_id``.

    Returns
    -------
    pandas.DataFrame
        * ``current_df`` itself when ``previous_df`` is empty (everything is new).
        * Otherwise the rows of the outer merge on ``hotel_id`` (overlapping
          columns carry the default ``_x``/``_y`` suffixes, plus ``_merge``)
          that are present on one side only, or present on both sides with
          different ``data_hash`` values.
    """
    if previous_df.empty:
        return current_df

    combined_df = pd.merge(previous_df, current_df, on='hotel_id', how='outer', indicator=True)

    # Hotels that were added or removed.
    changed = combined_df['_merge'] != 'both'

    # Hotels present in both snapshots whose content hash differs. Without this
    # check an edited record with an unchanged hotel_id is never reported.
    if 'data_hash' in previous_df.columns and 'data_hash' in current_df.columns:
        changed = changed | (combined_df['data_hash_x'] != combined_df['data_hash_y'])

    return combined_df[changed]

In [None]:
def save_changes(df, filepath):
    """Write ``df`` to ``filepath`` as CSV (without the index) and report it.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to persist.
    filepath : str, path-like, or any other target accepted by ``DataFrame.to_csv``
        Destination. When it is a filesystem path, missing parent directories
        are created first so the first run does not fail on a missing folder.
    """
    if isinstance(filepath, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(filepath))
        if parent_dir:  # empty when the path has no directory component
            os.makedirs(parent_dir, exist_ok=True)
    df.to_csv(filepath, index=False)
    print(f"Data saved to {filepath}.")

In [None]:
# Parameters: file locations used by the cells below.
# input_path  - raw hotel-details CSV that is loaded and hashed on every run.
# output_path - CSV holding the hashed, timestamped data; it is loaded as the
#               previous state and written to when changes are detected.
input_path = "../Master Datasets/raw/hotel_detail_dataset_csv.csv"
output_path = "../Master Datasets/Hashed and etc/hashed_timestamped_hotel_details.csv"

In [None]:
# Load the previously stored snapshot, or an empty frame on the first run
previous_df = load_data(output_path, input_template_path=input_path)

# Load current data and apply hashes
current_df = load_data(input_path)
current_df = apply_hashes(current_df)

# Detect changes
changes_df = detect_changes(previous_df, current_df)

# Persist the result. output_path is read back as previous_df on the next run,
# so it must always hold the full hashed snapshot. Writing only changes_df
# (a merged diff) would replace the baseline with a partial frame.
if current_df.empty:
    # load_data returns an empty frame when the input file is missing; do not
    # let that wipe the stored dataset.
    print("Current dataset is empty or missing; leaving the stored dataset untouched.")
elif not changes_df.empty:
    print("Changes detected, updating the dataset.")
    save_changes(current_df, output_path)
else:
    print("No changes detected.")

In [12]:
# if 'df1' in locals():
#     del df1
# if 'df2' in locals():
#     del df2
# if 'df' in locals():
#     del df

In [None]:
# Define the Hash Function

# As discussed, you'll create a dynamic hash function that can adapt to changes in the data
# structure, such as the addition of new columns. This function will hash all the relevant columns
# except the last_updated column to avoid unnecessary updates due to timestamp changes alone.

In [24]:
# Apply the Hash Function to the DataFrame

# Apply this hash function to each row in your DataFrame to create a hash column. This column will
# represent the current state of each row based on its content.