In [4]:
import pandas as pd
from datetime import datetime
import os

# Define input and output directories
input_dir = r"C:\Users\User\Desktop\Data Mining\Project\Data\Dataset_in_csv"
output_dir = r"C:\Users\User\Desktop\Data Mining\Project\Data\Dataset_in_csv_&_with_Demand"

# Create the output directory (including missing parents). exist_ok=True makes
# this a no-op when the directory is already there and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(output_dir, exist_ok=True)

# Load the two demand datasets from the input directory
balance_df = pd.read_csv(os.path.join(input_dir, "cleaned_balance_data.csv"))
subregion_df = pd.read_csv(os.path.join(input_dir, "cleaned_subregion_data.csv"))

# Parse 'utc_time' into a full-resolution 'datetime' column (no truncation to
# date), leaving the original 'utc_time' column in place.
# NOTE(review): presumably these are timezone-naive UTC timestamps -- confirm,
# since they are later joined against timestamps parsed from epoch seconds.
balance_df['datetime'] = pd.to_datetime(balance_df['utc_time'])
subregion_df['datetime'] = pd.to_datetime(subregion_df['utc_time'])

# City keys grouped by the demand dataset each one is looked up in.
# The keys double as the weather CSV file stems ("<key>.csv").
_subregion_keys = (
    'la',
    'nyc',
    'dallas',
    'houston',
    'philadelphia',
    'san_antonio',
    'san_diego',
    'san_jose',
)
_balance_keys = ('phoenix', 'seattle')

# city key -> demand data source ('subregion' or 'balance').
# Subregion cities come first, so iteration order is la ... san_jose,
# then phoenix, seattle.
cities = {
    **{key: 'subregion' for key in _subregion_keys},
    **{key: 'balance' for key in _balance_keys},
}

# city key -> city name used when matching against the demand datasets.
# The name is the key with underscores replaced by spaces
# (e.g. 'san_antonio' -> 'san antonio').
city_name_mapping = {
    key: key.replace('_', ' ')
    for key in (*_balance_keys, *_subregion_keys)
}

# Merged weather + demand frame for each processed city, keyed by city key.
merged_data = {}

# For every configured city: load its weather CSV, pick its demand rows from
# the matching demand dataset, inner-join the two on (datetime, city), report
# the match count, and write the result to the output directory.
for city, source in cities.items():
    print(f"\n🔍 Processing: {city.upper()}")

    # Load weather data. Only the read itself is guarded, so a
    # FileNotFoundError here can only mean the weather file is missing.
    try:
        weather_df = pd.read_csv(os.path.join(input_dir, f"{city}.csv"))
    except FileNotFoundError:
        print(f"⚠️ Weather file {city}.csv not found in {input_dir}.")
        continue

    # Name under which this city is matched against the demand datasets.
    demand_city = city_name_mapping[city.lower()]

    # 'time' is interpreted as epoch seconds.
    weather_df['datetime'] = pd.to_datetime(weather_df['time'], unit='s')
    weather_df['city'] = demand_city  # Use mapped city name

    # Get correct demand data for city (case-insensitive match on 'city').
    source_df = balance_df if source == 'balance' else subregion_df
    demand_df = source_df[source_df['city'].str.lower() == demand_city]

    # Check if demand data exists for the city
    if demand_df.empty:
        print(f"⚠️ No demand data found for {demand_city} in {source} dataset.")
        continue

    # Debug: Check row counts and unique timestamps
    print(f"📊 Weather data rows: {len(weather_df)}, Unique timestamps: {weather_df['datetime'].nunique()}")
    print(f"📊 Demand data rows: {len(demand_df)}, Unique timestamps: {demand_df['datetime'].nunique()}")

    # The rows above were selected case-insensitively, so join on the
    # normalized city name rather than the raw 'city' column. Otherwise rows
    # that differ only in letter case would pass the filter and then be
    # silently dropped by the inner join.
    demand_subset = demand_df[['datetime', 'demand']].assign(city=demand_city)

    # Merge on 'datetime' and 'city'
    merged = pd.merge(
        weather_df,
        demand_subset,
        on=['datetime', 'city'],
        how='inner',
    )
    merged_data[city] = merged

    # Report
    matched = merged.shape[0]
    print(f"✅ Records matched on datetime & city: {matched}")
    if matched == 0:
        print(f"⚠️ No matches found. Check datetime and city name consistency (e.g., '{city}' vs. '{demand_city}').")
        continue  # nothing to save for this city

    print("📅 Sample matched timestamps:", merged['datetime'].unique()[:5])

    # Save results to the specified directory. Saving is best-effort: a
    # failure is reported and the loop moves on to the next city.
    output_file = os.path.join(output_dir, f"{city}_with_demand.csv")
    try:
        merged.to_csv(output_file, index=False)
    except Exception as e:
        print(f"⚠️ Error saving {output_file}: {e}")
    else:
        print(f"💾 Saved: {output_file}")


🔍 Processing: LA
📊 Weather data rows: 16574, Unique timestamps: 16574
📊 Demand data rows: 16536, Unique timestamps: 16536
✅ Records matched on datetime & city: 16526
📅 Sample matched timestamps: <DatetimeArray>
['2018-07-01 08:00:00', '2018-07-01 09:00:00', '2018-07-01 10:00:00',
 '2018-07-01 11:00:00', '2018-07-01 12:00:00']
Length: 5, dtype: datetime64[ns]
💾 Saved: C:\Users\User\Desktop\Data Mining\Project\Data\Dataset_in_csv_&_with_Demand\la_with_demand.csv

🔍 Processing: NYC
📊 Weather data rows: 16574, Unique timestamps: 16574
📊 Demand data rows: 16536, Unique timestamps: 16536
✅ Records matched on datetime & city: 16503
📅 Sample matched timestamps: <DatetimeArray>
['2018-07-02 04:00:00', '2018-07-02 05:00:00', '2018-07-02 06:00:00',
 '2018-07-02 07:00:00', '2018-07-02 08:00:00']
Length: 5, dtype: datetime64[ns]
💾 Saved: C:\Users\User\Desktop\Data Mining\Project\Data\Dataset_in_csv_&_with_Demand\nyc_with_demand.csv

🔍 Processing: DALLAS
📊 Weather data rows: 16574, Unique timestamp