In [1]:
import pandas as pd
import numpy as np
import os


In [3]:
#RAM 
def randomize_and_filter_dataset(file_path, target_size, random_seed=42):
    df = pd.read_csv(file_path)
    print('Before:',df.shape)
    np.random.seed(random_seed)
    indices_to_keep = np.random.choice(df.index, size=target_size, replace=False)
    filtered_data = df.loc[indices_to_keep]
    return filtered_data


In [6]:
#Read and merge train csv
#weather_train.csv: site_id
#train.csv : building_id
#building_metadata.csv: site_id building_id

def break_datetime(df):
  df['timestamp_x']= pd.to_datetime(df['timestamp_x'])
  df['hour']= np.uint8(df['timestamp_x'].dt.hour)
  df['dayofweek']= np.uint8(df['timestamp_x'].dt.dayofweek)
  df['month']= np.uint8(df['timestamp_x'].dt.month)
  df['dayofyear']= np.uint16(df['timestamp_x'].dt.dayofyear)
  df['day']= np.uint16(df['timestamp_x'].dt.day)
  df['year']= np.uint16(df['timestamp_x'].dt.year)
  return df

def merge_csv_files(weather_file, df, building_file, output):
    # Read the CSV files
    weather_train = pd.read_csv(weather_file)
    #train = pd.read_csv(train_file)
    building_metadata = pd.read_csv(building_file)
    # Merge the dataframes
    merged_data =pd.merge(pd.merge(building_metadata, df, on='building_id'), weather_train, on='site_id')
    #Timestamp processing
    merged_data= break_datetime(merged_data)
    
    merged_data['timestamp'] = merged_data['timestamp_x']
    merged_data.drop(columns=['timestamp_x', 'timestamp_y'], inplace=True)
    
    # Save the merged dataframe to a new CSV file
    merged_data.to_csv(output, index=False)
    return merged_data


# Train

In [2]:
#params
base_path = r"C:\Users\imate\Documents\24.9.Notebooks_training\Energy-predictor\data\0_raw"

weather_file = os.path.join(base_path, "weather_train.csv")
train_file = os.path.join(base_path, "train.csv")
building_file = os.path.join(base_path, "building_metadata.csv")
output_file = os.path.join(base_path, "0_merged_train.csv")


In [None]:
train = randomize_and_filter_dataset(train_file, target_size=30)
print('After:',train.shape)

In [10]:
train_df = merge_csv_files(weather_file, train, building_file, output_file)
print("Merged data saved:", train_df.shape)

Merged data saved to merged_data.csv (262305, 22)


# Test

In [5]:
base_path = r"C:\Users\imate\Documents\24.9.Notebooks_training\Energy-predictor\data\0_raw"
weather_file = os.path.join(base_path, "weather_train.csv")
test_file = os.path.join(base_path, "test.csv")
building_file = os.path.join(base_path, "building_metadata.csv")
output_test = os.path.join(base_path, "0_merged_test.csv")

In [7]:
test = randomize_and_filter_dataset(test_file, target_size=30)
print('After:',test.shape)

Before: (41697600, 4)
After: (30, 4)


In [None]:
merge_csv_files(weather_file, test, building_file, output_test)