In [4]:
import pandas as pd
import numpy as np
import glob
import os

In [8]:
data_dir = r"C:\DECODER\DECODER-Project\data"

# Read all .txt files in the folder
files = glob.glob(os.path.join(data_dir, "*.txt"))

In [9]:
all_data = []

for file in files:
    meter_type = os.path.basename(file).replace(".txt", "")
    print(f"Reading {meter_type} ...")

    # Read as comma-separated
    df = pd.read_csv(file, sep=',')
    df['meter_type'] = meter_type
    all_data.append(df)

Reading chilledwater ...
Reading electricity ...
Reading gas ...
Reading hotwater ...
Reading irrigation ...
Reading solar ...
Reading steam ...
Reading water ...


In [10]:
# Combine all into one DataFrame
data = pd.concat(all_data, ignore_index=True)

print("‚úÖ Combined shape:", data.shape)
print(data.head())

# Save combined data for reference
output_path = os.path.join(data_dir, "combined_data_preview.csv")
data.to_csv(output_path, index=False)
print(f"‚úÖ Combined file saved at: {output_path}")

‚úÖ Combined shape: (140352, 1638)
             timestamp  Panther_office_Clementine  Panther_lodging_Cora  \
0  2016-01-01 00:00:00                        NaN                   NaN   
1  2016-01-01 01:00:00                        NaN                   NaN   
2  2016-01-01 02:00:00                        NaN                   NaN   
3  2016-01-01 03:00:00                        NaN                   NaN   
4  2016-01-01 04:00:00                        NaN                   NaN   

   Panther_education_Teofila  Panther_education_Jerome  \
0                        NaN                       NaN   
1                        NaN                       NaN   
2                        NaN                       NaN   
3                        NaN                       NaN   
4                        NaN                       NaN   

   Panther_retail_Kristina  Panther_education_Neal  Panther_lodging_Kara  \
0                      NaN                     NaN                   NaN   
1            

In [11]:
# Keep only rows that have a timestamp column
if 'timestamp' not in data.columns:
    raise ValueError("No 'timestamp' column found ‚Äî check your input files.")

# Melt (unpivot) so that each reading is one row
data_long = data.melt(
    id_vars=['timestamp', 'meter_type'],
    var_name='building_id',
    value_name='value'
)

print("‚úÖ Long-format shape:", data_long.shape)
print(data_long.head())

# Save a smaller sample for testing the API
sample = data_long.sample(5000, random_state=42)
sample.to_csv("sample_sensor_data.csv", index=False)
print("üìÅ Saved sample_sensor_data.csv (for API testing)")


‚úÖ Long-format shape: (229615872, 4)
             timestamp    meter_type                building_id  value
0  2016-01-01 00:00:00  chilledwater  Panther_office_Clementine    NaN
1  2016-01-01 01:00:00  chilledwater  Panther_office_Clementine    NaN
2  2016-01-01 02:00:00  chilledwater  Panther_office_Clementine    NaN
3  2016-01-01 03:00:00  chilledwater  Panther_office_Clementine    NaN
4  2016-01-01 04:00:00  chilledwater  Panther_office_Clementine    NaN
üìÅ Saved sample_sensor_data.csv (for API testing)


In [12]:
data_long.info()
data_long.describe()
data_long['meter_type'].value_counts()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 229615872 entries, 0 to 229615871
Data columns (total 4 columns):
 #   Column       Dtype  
---  ------       -----  
 0   timestamp    object 
 1   meter_type   object 
 2   building_id  object 
 3   value        float64
dtypes: float64(1), object(3)
memory usage: 6.8+ GB


meter_type
chilledwater    28701984
electricity     28701984
gas             28701984
hotwater        28701984
irrigation      28701984
solar           28701984
steam           28701984
water           28701984
Name: count, dtype: int64

In [21]:
data_long['timestamp'] = pd.to_datetime(data_long['timestamp'])
print(data_long['timestamp'].min(), data_long['timestamp'].max())


subset = data_long.sample(n=10000, random_state=42)  # random 10,000 rows
subset.to_csv(r"C:\DECODER\DECODER-Project\data\processed\subset_data.csv", index=False)
print("‚úÖ Random subset saved:", subset.shape)


2016-01-01 00:00:00 2017-12-31 23:00:00
‚úÖ Random subset saved: (10000, 4)
