In [1]:
import os
import pandas as pd
import geopandas as gpd

In [2]:
def load_gdfs():
    # Get the current working directory
    cwd = os.getcwd()

    # Define the relative paths to the data files
    weather_path = os.path.join(cwd, '..', 'data', 'raw', 'weather', 'Scotland_2016-01-01_to_2019-12-31_hourly.csv')
    time_series_gdf_path = os.path.join(cwd, '..', 'data', 'interim', 'train_gdf_forward_geocoded.csv')

    # Load the dataframes
    weather = pd.read_csv(weather_path)
    time_series_gdf =  pd.read_csv(time_series_gdf_path)

    return weather, time_series_gdf

weather_df, time_series_gdf = load_gdfs()

In [3]:
def convert_to_datetime(df, column_name):
    print(f"Converting {column_name} to datetime format...")
    df[column_name] = pd.to_datetime(df[column_name])
    print(f"Conversion successful for {column_name}.")
    return df

def extract_hour(df, column_name):
    print(f"Extracting hour from {column_name}...")
    df['Hour'] = df[column_name].dt.hour
    print(f"Extraction successful for {column_name}.")
    return df

def set_index(df, column_name):
    print(f"Setting {column_name} as index...")
    df.set_index(column_name, inplace=True)
    print(f"{column_name} set as index successfully.")
    return df

def merge_dataframes(df1, df2, method, chunk_size=10000):
    print("Merging dataframes in chunks...")
    merged_df = pd.DataFrame()  # Create an empty dataframe to store the merged data
    num_chunks = len(df1) // chunk_size + 1  # Calculate the number of chunks
    for i in range(num_chunks):
        try:
            print(f"Merging chunk {i+1} of {num_chunks}...")
            start = i * chunk_size
            end = (i + 1) * chunk_size
            chunk = df1.iloc[start:end]  # Get a chunk of df1
            merged_chunk = pd.merge(chunk, df2, left_index=True, right_index=True, how=method)
            merged_df = pd.concat([merged_df, merged_chunk])  # Add the merged chunk to the merged_df
            print(f"Chunk {i+1} merged successfully.")
        except Exception as e:
            print(f"Error occurred while merging chunk {i+1}: {e}")
            return None
    print("Merging completed.")
    return merged_df

def fill_missing_values(df, method):
    print("Filling missing values...")
    df.fillna(method=method, inplace=True)
    print("Missing values filled successfully.")
    return df

def convert_to_same_datatype(df1, df2, column_name):
    print(f"Converting {column_name} to the same data type in both dataframes...")
    df1[column_name] = df1[column_name].astype(str)
    df2[column_name] = df2[column_name].astype(str)
    print(f"Conversion successful for {column_name}.")
    return df1, df2

def ensure_matching_indices(df1, df2):
    print("Ensuring matching indices...")
    common_indices = df1.index.intersection(df2.index)
    df1 = df1.loc[common_indices]
    df2 = df2.loc[common_indices]
    print("Indices match successfully.")
    return df1, df2

# Convert 'Start DateTime' and 'datetime' to datetime format
time_series_gdf = convert_to_datetime(time_series_gdf, 'Start DateTime')
weather_df = convert_to_datetime(weather_df, 'datetime')

# Extract hour from 'Start DateTime' and 'datetime'
time_series_gdf = extract_hour(time_series_gdf, 'Start DateTime')
weather_df = extract_hour(weather_df, 'datetime')

# Convert 'Hour' to the same data type in both dataframes
time_series_gdf, weather_df = convert_to_same_datatype(time_series_gdf, weather_df, 'Hour')

# Set 'Hour' as index
time_series_gdf = set_index(time_series_gdf, 'Hour')
weather_df = set_index(weather_df, 'Hour')

# Ensure matching indices
time_series_gdf, weather_df = ensure_matching_indices(time_series_gdf, weather_df)

# Merge the dataframes
merged_df = merge_dataframes(time_series_gdf, weather_df, 'left')
if merged_df is None:
    print("Merging failed. Exiting...")
    exit()

# Fill missing values
merged_df = fill_missing_values(merged_df, 'ffill')

# Reset index
print("Resetting index...")
merged_df.reset_index(inplace=True)
print("Index reset successfully.")

Converting Start DateTime to datetime format...
Conversion successful for Start DateTime.
Converting datetime to datetime format...
Conversion successful for datetime.
Extracting hour from Start DateTime...
Extraction successful for Start DateTime.
Extracting hour from datetime...
Extraction successful for datetime.
Converting Hour to the same data type in both dataframes...
Conversion successful for Hour.
Setting Hour as index...
Hour set as index successfully.
Setting Hour as index...
Hour set as index successfully.
Ensuring matching indices...
Indices match successfully.
Merging dataframes in chunks...
Merging chunk 1 of 7...


: 

: 