### Import Libraries

In [1]:
import pandas as pd
import os

### Define variables: input dataset folders and merged-dataset output path for each country

In [2]:
# Per-country configuration for the merge.
# Each entry maps a country name to:
#   'dataset_folders' - folders whose CSV files are merged, in the listed order
#   'output_path'     - where the merged CSV for that country is written
dataset_info = {
    # UK combines three sources: CAMS, AURN and Sentinel-5P.
    'UK': {
        'dataset_folders': [
            'data/cams/UK/processed_data', 
            'data/aurn/UK/processed_data', 
            'data/sentinel5p/UK/processed_data'
        ],
        'output_path': 'datasets/UK/sentinel_cams_aurn.csv'
    },
    # Ghana lists only CAMS and Sentinel-5P folders.
    # NOTE(review): the output filename still contains "aurn" although no AURN
    # folder is listed for Ghana - confirm whether the name is intentional.
    'Ghana': {
        'dataset_folders': [
            'data/cams/Ghana/processed_data', 
            'data/sentinel5p/Ghana/processed_data'
        ],
        'output_path': 'datasets/Ghana/sentinel_cams_aurn.csv'
    }
}

### 1. Function to load CSV files into a dictionary

In [3]:
def read_csv_files(dataset_folder_path):
    """Load every ``.csv`` file in a folder into a dictionary of DataFrames.

    Parameters
    ----------
    dataset_folder_path : str or path-like
        Folder to scan (non-recursively) for entries whose name ends in
        ``.csv``.

    Returns
    -------
    dict[str, pandas.DataFrame]
        Maps each file name, with only its final ``.csv`` extension removed,
        to the DataFrame read from that file. Entries are inserted in
        alphabetical file-name order.

    Raises
    ------
    FileNotFoundError
        If ``dataset_folder_path`` does not exist (raised by ``os.listdir``).
    """
    # os.listdir returns entries in arbitrary order; sorting makes the load
    # order (and therefore the column order of any later merge) reproducible.
    csv_files = sorted(
        f for f in os.listdir(dataset_folder_path) if f.endswith('.csv')
    )
    data_frames = {}

    for file in csv_files:
        df = pd.read_csv(os.path.join(dataset_folder_path, file))
        # splitext strips only the last extension, so "site.a.csv" and
        # "site.b.csv" get distinct keys instead of both collapsing to "site".
        key = os.path.splitext(file)[0]
        data_frames[key] = df

    return data_frames

### 2. Function to merge the data in each data frame

In [4]:
def merge_data_frames(merged_df, data_frames):
    """Outer-merge a collection of DataFrames into an accumulator frame.

    Each frame has its ``Time`` column parsed to datetimes, is de-duplicated
    on the key columns, and is then outer-joined onto the accumulator using
    those same key columns. The input frames are left unmodified.

    Parameters
    ----------
    merged_df : pandas.DataFrame
        Accumulator holding the result of earlier merges. If it has zero
        rows, it is replaced by the first processed frame rather than merged.
    data_frames : dict[str, pandas.DataFrame]
        Frames to merge, processed in the dictionary's iteration order. Each
        must contain the columns ``Time``, ``SiteName``, ``SiteNumber``,
        ``Longitude`` and ``Latitude``.

    Returns
    -------
    pandas.DataFrame
        The accumulator after all frames have been merged in. If
        ``data_frames`` is empty, ``merged_df`` is returned unchanged.

    Raises
    ------
    KeyError
        If a frame is missing one of the key columns.
    """
    key_columns = ['Time', 'SiteName', 'SiteNumber', 'Longitude', 'Latitude']

    for df in data_frames.values():
        # assign() builds a new frame, so the caller's DataFrame keeps its
        # original 'Time' values. Unparsable timestamps become NaT because of
        # errors='coerce'.
        df = df.assign(
            Time=pd.to_datetime(df['Time'], format='mixed', errors='coerce')
        )
        # Keep one row per key combination so the merge cannot fan out rows.
        df = df.drop_duplicates(subset=key_columns)

        if len(merged_df) == 0:
            # Nothing accumulated yet: start from this frame.
            merged_df = df
        else:
            merged_df = pd.merge(merged_df, df, on=key_columns, how='outer')
    return merged_df

### 3. Function to save the merged data

In [5]:
# Function Definitions
def save_merged_dataframe(merged_df, output_path):
    """Write a DataFrame to CSV, creating the parent directory if needed.

    Parameters
    ----------
    merged_df : pandas.DataFrame
        Frame to write. The index is not written to the file.
    output_path : str
        Destination CSV path. May be a bare file name, in which case the file
        is written to the current working directory.

    Returns
    -------
    None
        The function writes the file and prints a confirmation message.
    """
    output_dir = os.path.dirname(output_path)

    # dirname() is '' for a bare file name, and os.makedirs('') raises
    # FileNotFoundError, so only create a directory when there is one.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    merged_df.to_csv(output_path, index=False)
    print(f'Merged CSV file saved successfully at {output_path}')

### 4. Merging the Data Frames from all the Datasets

In [6]:
# Process and merge data for each country.
# For every country in dataset_info: load the CSV files from each configured
# folder, fold them into a single merged frame, and write that frame to the
# country's output path.
for country, info in dataset_info.items():
    # Start from an empty accumulator so one country's rows never carry over
    # into the next country's merge.
    merged_df = pd.DataFrame()
    print(f'Merging CSV files for {country}')
    
    # Folders are processed in the order they are listed in dataset_info.
    for dataset_folder in info['dataset_folders']:
        print(f'Merging CSV files from: {dataset_folder}')
        data_frames = read_csv_files(dataset_folder)
        # Pass the running result back in so each folder is merged on top of
        # the folders already processed for this country.
        merged_df = merge_data_frames(merged_df, data_frames)
    
    # Save the merged dataframe to the respective country's output path
    save_merged_dataframe(merged_df, info['output_path'])


Merging CSV files for UK
Merging CSV files from: data/cams/UK/processed_data
Merging CSV files from: data/aurn/UK/processed_data
Merging CSV files from: data/sentinel5p/UK/processed_data
Merged CSV file saved successfully at datasets/UK/sentinel_cams_aurn.csv
Merging CSV files for Ghana
Merging CSV files from: data/cams/Ghana/processed_data
Merging CSV files from: data/sentinel5p/Ghana/processed_data
Merged CSV file saved successfully at datasets/Ghana/sentinel_cams_aurn.csv
