In [None]:
from google.colab import drive

# Directory inside the Colab VM where Google Drive becomes visible.
DRIVE_MOUNT_POINT = '/content/drive'

# Mounting asks for authorization; the confirmation below is only reached
# once mount() has returned.
drive.mount(DRIVE_MOUNT_POINT)
print("Google Drive mounted successfully.")

Mounted at /content/drive
Google Drive mounted successfully.


In [None]:
import zipfile
import os

# --- Paths ---
# Outer archive uploaded to Google Drive.
zip_file_path = "/content/drive/MyDrive/Indoor_air_quality.zip"

# Scratch folder that receives the contents of the outer archive.
temp_unzip_path = "/content/drive/MyDrive/temp_dataset_folder/"

# Location where the nested archive is expected. The search below also looks
# in sub-folders, because the outer archive may wrap its contents in a folder.
inner_zip_path = os.path.join(temp_unzip_path, "aq_dataset.zip")

# Final destination for the extracted JSON files.
final_json_path = "/content/drive/MyDrive/aq_dataset_unzipped/"


def find_inner_zip(search_root, zip_name):
    """Return the path of the first file called `zip_name` under `search_root`.

    The tree is walked top-down, so a match directly inside `search_root`
    wins over one in a sub-folder. Sub-folders are visited in sorted order to
    keep the result deterministic.

    Args:
        search_root: Directory to search (a missing directory yields None).
        zip_name: Exact file name to look for.

    Returns:
        The full path of the match, or None when no such file exists.
    """
    for dirpath, dirnames, filenames in os.walk(search_root):
        dirnames.sort()  # in-place sort steers the order of the walk
        if zip_name in filenames:
            return os.path.join(dirpath, zip_name)
    return None


try:
    # --- First Unzip ---
    print(f"Starting to unzip {zip_file_path}...")
    with zipfile.ZipFile(zip_file_path, 'r') as z:
        # Create the scratch folder only once the archive has been opened,
        # so a wrong path does not leave empty folders behind.
        os.makedirs(temp_unzip_path, exist_ok=True)
        z.extractall(temp_unzip_path)
    print(f"First unzip complete. Files are in {temp_unzip_path}")

    # --- Second Unzip ---
    located_zip = find_inner_zip(temp_unzip_path, os.path.basename(inner_zip_path))

    if located_zip is not None:
        print(f"Found inner zip file: {located_zip}. Unzipping now...")
        os.makedirs(final_json_path, exist_ok=True)
        with zipfile.ZipFile(located_zip, 'r') as z:
            z.extractall(final_json_path)
        print(f"All JSON files are now in: {final_json_path}")

        # Nothing is deleted automatically: removing data from Drive is left
        # to the user, and the message says so instead of claiming a cleanup.
        print(f"Temporary files were kept in {temp_unzip_path}; delete them manually when no longer needed.")

    else:
        print(f"ERROR: Could not find 'aq_dataset.zip' inside the main zip.")
        print("Please check the contents of 'Indoor_air_quality.zip'.")
        # Show what was actually extracted to make the mismatch easy to spot.
        print(f"Top-level entries extracted: {sorted(os.listdir(temp_unzip_path))}")

except FileNotFoundError:
    print(f"ERROR: File not found at {zip_file_path}")
    print("Please make sure the file name is correct and it's in 'My Drive'.")
except Exception as e:
    print(f"An error occurred: {e}")

Starting to unzip /content/drive/MyDrive/Indoor_air_quality.zip...
First unzip complete. Files are in /content/drive/MyDrive/temp_dataset_folder/
ERROR: Could not find 'aq_dataset.zip' inside the main zip.
Please check the contents of 'Indoor_air_quality.zip'.


In [None]:
import pandas as pd
import glob
import os
import json
import gc

# Folder that holds the unzipped JSON exports.
json_folder_path = "/content/drive/MyDrive/aq_dataset_unzipped/"
# Sorted so the files are always processed in the same order.
json_files = sorted(glob.glob(os.path.join(json_folder_path, "*.json")))

# 'time' must be kept: the readings are resampled on it further down.
columns_we_need = [
    'time',
    'temperature',
    'humidity',
    'MQ7_CO_ppm',      # Model 1
    'MQ135_CO',        # Model 1
    'MQ9_CH4_ppm',     # Model 2
    'CH4'              # Model 2
]


def load_sensor_file(path, columns):
    """Load one JSON export and keep only the requested columns.

    The file is read as a JSON object whose 'columns' entry names the fields
    and whose 'values' entry holds the rows.

    Args:
        path: Path of the JSON file.
        columns: Column names to keep, in the desired order.

    Returns:
        A new DataFrame containing just `columns` (a copy, so the full table
        can be released as soon as this function returns).

    Raises:
        KeyError: If 'columns'/'values' or any requested column is missing.
        Other exceptions from opening or parsing the file propagate as well.
    """
    with open(path, 'r') as file:
        data = json.load(file)
    full_table = pd.DataFrame(data=data['values'], columns=data['columns'])
    return full_table[columns].copy()


def resample_to_minute_means(frame):
    """Average every column of a datetime-indexed frame over 1-minute bins.

    Uses the 'min' frequency alias; the older 'T' alias triggers a pandas
    FutureWarning.
    """
    return frame.resample('1min').mean()


if not json_files:
    print(f"ERROR: No .json files found in {json_folder_path}")
else:
    print(f"Found {len(json_files)} JSON files. Loading and filtering...")

    df_list = []
    failed_files = []

    for f in json_files:
        try:
            print(f"Processing {f}...")
            df_list.append(load_sensor_file(f, columns_we_need))
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole load,
            # but it is recorded so the gap in the data is visible afterwards.
            failed_files.append(f)
            print(f"Error loading file {f}: {e}")
        # Release the full per-file table before the next file is read.
        gc.collect()

    if failed_files:
        print(f"WARNING: {len(failed_files)} of {len(json_files)} files could not be loaded and are excluded.")

    if df_list:
        print("\nAll files processed. Concatenating...")

        df_raw = pd.concat(df_list, ignore_index=True)
        del df_list
        gc.collect()

        print("Data combined. Now converting time and resampling...")

        # 1. Convert 'time' column to datetime objects
        print("Converting 'time' column to datetime (this may take a while)...")
        df_raw['time'] = pd.to_datetime(df_raw['time'])

        # 2. Set 'time' as the index (required by resample)
        print("Setting time index...")
        df_raw.set_index('time', inplace=True)

        # 3. Average all readings that fall into the same minute
        print("Resampling data to 1-minute averages...")
        df_resampled = resample_to_minute_means(df_raw)

        print(f"Original rows: {len(df_raw)}, Resampled rows: {len(df_resampled)}")

        # The raw frame is no longer needed once the averages exist.
        del df_raw
        gc.collect()

        # --- Model 1 (CO) DataFrame ---
        print("\n--- Processing CO Model Data ---")
        co_cols = ['MQ7_CO_ppm', 'MQ135_CO', 'temperature', 'humidity']
        print(f"Resampled CO rows: {len(df_resampled)}")
        # Minutes in which any of the four columns is missing are dropped.
        df_co = df_resampled[co_cols].dropna()
        print(f"Clean CO rows for training: {len(df_co)}")
        print(df_co.head())

        # --- Model 2 (VOC) DataFrame ---
        print("\n--- Processing VOC Model Data ---")
        voc_cols = ['MQ9_CH4_ppm', 'CH4', 'temperature', 'humidity']
        print(f"Resampled VOC rows: {len(df_resampled)}")
        df_voc = df_resampled[voc_cols].dropna()
        print(f"Clean VOC rows for training: {len(df_voc)}")
        print(df_voc.head())

        print("\nSuccessfully created 'df_co' and 'df_voc'.")
        print("You can now run Step 5 and 6.")

    else:
        print("\nNo data was loaded. Please check the file paths and structure.")

Found 13 JSON files. Loading and filtering...
Processing /content/drive/MyDrive/aq_dataset_unzipped/2023-12-09_2023-12-31.json...
Processing /content/drive/MyDrive/aq_dataset_unzipped/2024-01-01_2024-01-31.json...
Processing /content/drive/MyDrive/aq_dataset_unzipped/2024-02-01_2024-02-29.json...
Processing /content/drive/MyDrive/aq_dataset_unzipped/2024-03-01_2024-03-31.json...
Processing /content/drive/MyDrive/aq_dataset_unzipped/2024-04-01_2024-04-30.json...
Processing /content/drive/MyDrive/aq_dataset_unzipped/2024-05-01_2024-05-31.json...
Processing /content/drive/MyDrive/aq_dataset_unzipped/2024-06-01_2024-06-30.json...
Processing /content/drive/MyDrive/aq_dataset_unzipped/2024-07-01_2024-07-31.json...
Processing /content/drive/MyDrive/aq_dataset_unzipped/2024-08-01_2024-08-31.json...
Processing /content/drive/MyDrive/aq_dataset_unzipped/2024-09-01_2024-09-30.json...
Processing /content/drive/MyDrive/aq_dataset_unzipped/2024-10-01_2024-10-31.json...
Processing /content/drive/MyDr

  df_resampled = df_raw.resample('1T').mean()


Original rows: 12790707, Resampled rows: 528480

--- Processing CO Model Data ---
Resampled CO rows: 528480
Clean CO rows for training: 519205
                           MQ7_CO_ppm  MQ135_CO  temperature   humidity
time                                                                   
2023-12-09 00:00:00+00:00        69.0  3.927202    23.950000  33.025000
2023-12-09 00:01:00+00:00        69.0  3.822472    24.200000  33.133333
2023-12-09 00:02:00+00:00        64.0  4.219279    24.166667  33.133333
2023-12-09 00:03:00+00:00        69.0  3.653514    24.166667  33.133333
2023-12-09 00:04:00+00:00        63.0  4.219279    24.166667  33.133333

--- Processing VOC Model Data ---
Resampled VOC rows: 528480
Clean VOC rows for training: 493506
                           MQ9_CH4_ppm    CH4  temperature   humidity
time                                                                 
2023-12-09 00:00:00+00:00    10.437220  576.0    23.950000  33.025000
2023-12-09 00:01:00+00:00    10.437220  575.0

In [None]:
from sklearn.linear_model import LinearRegression

print("--- Training Model 1 (CO Calibration) ---")

# Regression target and the three predictors, all columns of 'df_co'
# (which is built by the data-loading cell).
target_co = 'MQ135_CO'
features_co = ['MQ7_CO_ppm', 'temperature', 'humidity']

y_co = df_co[target_co]
X_co = df_co[features_co]

# fit() hands back the estimator, so creation and training are chained.
model_co = LinearRegression().fit(X_co, y_co)

print("\nModel training complete.")
print("The formula is: y = (A * MQ7_CO) + (B * temp) + (C * humid) + Intercept\n")

# coef_ follows the order of 'features_co': index 0 -> A, 1 -> B, 2 -> C.
intercept_co = model_co.intercept_
coeffs_co = model_co.coef_

print("--- üìã COPY THESE VALUES FOR YOUR ESP32 ---")
print(
    f"Coefficient A (for MQ7_CO_ppm): {coeffs_co[0]}",
    f"Coefficient B (for temperature): {coeffs_co[1]}",
    f"Coefficient C (for humidity):    {coeffs_co[2]}",
    f"Intercept:                       {intercept_co}",
    sep="\n",
)
print("------------------------------------------")

--- Training Model 1 (CO Calibration) ---

Model training complete.
The formula is: y = (A * MQ7_CO) + (B * temp) + (C * humid) + Intercept

--- üìã COPY THESE VALUES FOR YOUR ESP32 ---
Coefficient A (for MQ7_CO_ppm): 0.2066745367379103
Coefficient B (for temperature): -0.08439702297944282
Coefficient C (for humidity):    0.004321370524230999
Intercept:                       -8.055104846752645
------------------------------------------


In [None]:
from sklearn.linear_model import LinearRegression

print("--- Training Model 2 (VOC/CH4 Calibration) ---")

# Regression target and the three predictors, all columns of 'df_voc'
# (which is built by the data-loading cell).
target_voc = 'CH4'
features_voc = ['MQ9_CH4_ppm', 'temperature', 'humidity']

y_voc = df_voc[target_voc]
X_voc = df_voc[features_voc]

# fit() hands back the estimator, so creation and training are chained.
model_voc = LinearRegression().fit(X_voc, y_voc)

print("\nModel training complete.")
print("The formula is: y = (A * MQ9_CH4) + (B * temp) + (C * humid) + Intercept\n")

# coef_ follows the order of 'features_voc': index 0 -> A, 1 -> B, 2 -> C.
intercept_voc = model_voc.intercept_
coeffs_voc = model_voc.coef_

print("--- üìã COPY THESE VALUES FOR YOUR ESP32 ---")
print(
    f"Coefficient A (for MQ9_CH4_ppm): {coeffs_voc[0]}",
    f"Coefficient B (for temperature): {coeffs_voc[1]}",
    f"Coefficient C (for humidity):    {coeffs_voc[2]}",
    f"Intercept:                       {intercept_voc}",
    sep="\n",
)
print("------------------------------------------")

--- Training Model 2 (VOC/CH4 Calibration) ---

Model training complete.
The formula is: y = (A * MQ9_CH4) + (B * temp) + (C * humid) + Intercept

--- üìã COPY THESE VALUES FOR YOUR ESP32 ---
Coefficient A (for MQ9_CH4_ppm): 22.68115496453869
Coefficient B (for temperature): -40.81730199744914
Coefficient C (for humidity):    -0.65953960607782
Intercept:                       1811.870146951751
------------------------------------------


In [None]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

print("--- Performance Report: Model 1 (CO Calibration) ---")

# Predictions are made on X_co, the same rows model_co was fitted on, so the
# figures below are in-sample scores rather than held-out test scores.
y_pred_co = model_co.predict(X_co)

# RMSE is the square root of the mean squared error.
mse_co = mean_squared_error(y_co, y_pred_co)
rmse_co = np.sqrt(mse_co)
mae_co = mean_absolute_error(y_co, y_pred_co)
r2_co = r2_score(y_co, y_pred_co)

print(f"\nSuccessfully evaluated {len(y_co)} data points.")
print("\n--- üìã COPY THESE METRICS ---")
print(
    f"R-squared (R¬≤):   {r2_co:.4f}",
    f"Mean Absolute Error (MAE): {mae_co:.4f} ppm",
    f"Root Mean Squared Error (RMSE): {rmse_co:.4f} ppm",
    sep="\n",
)
print("---------------------------------")

--- Performance Report: Model 1 (CO Calibration) ---

Successfully evaluated 519205 data points.

--- üìã COPY THESE METRICS ---
R-squared (R¬≤):   0.3777
Mean Absolute Error (MAE): 2.2279 ppm
Root Mean Squared Error (RMSE): 3.2616 ppm
---------------------------------


In [None]:
# NOTE: r2_score, mean_absolute_error, mean_squared_error and np are not
# imported here; this cell relies on the imports of the CO report cell.
print("--- Performance Report: Model 2 (VOC/CH4 Calibration) ---")

# Predictions are made on X_voc, the same rows model_voc was fitted on, so the
# figures below are in-sample scores rather than held-out test scores.
y_pred_voc = model_voc.predict(X_voc)

# RMSE is the square root of the mean squared error.
mse_voc = mean_squared_error(y_voc, y_pred_voc)
rmse_voc = np.sqrt(mse_voc)
mae_voc = mean_absolute_error(y_voc, y_pred_voc)
r2_voc = r2_score(y_voc, y_pred_voc)

print(f"\nSuccessfully evaluated {len(y_voc)} data points.")
print("\n--- üìã COPY THESE METRICS ---")
print(
    f"R-squared (R¬≤):   {r2_voc:.4f}",
    f"Mean Absolute Error (MAE): {mae_voc:.4f} ppm",
    f"Root Mean Squared Error (RMSE): {rmse_voc:.4f} ppm",
    sep="\n",
)
print("---------------------------------")

--- Performance Report: Model 2 (VOC/CH4 Calibration) ---

Successfully evaluated 493506 data points.

--- üìã COPY THESE METRICS ---
R-squared (R¬≤):   0.4114
Mean Absolute Error (MAE): 183.4077 ppm
Root Mean Squared Error (RMSE): 231.7250 ppm
---------------------------------


In [None]:
# --- STEP 1: Install Firebase Admin ---
!pip install firebase-admin

import firebase_admin
from firebase_admin import credentials, firestore
import json
import os
import zipfile
from datetime import datetime

# --- CONFIGURATION ---
# Make sure this matches the filename you uploaded to Colab!
KEY_FILE = 'service-account-key.json'
COLLECTION_NAME = 'logs'
ZIP_FILENAME = 'SCCAQM_Dataset_Final.zip'
EXPORT_DIR = 'temp_logs_export'

# --- STEP 2: Initialize & Authenticate ---
if not firebase_admin._apps:
    cred = credentials.Certificate(KEY_FILE)
    firebase_admin.initialize_app(cred)
db = firestore.client()

print("‚úÖ Authentication Successful.")

# --- STEP 3: Download & Zip ---
if not os.path.exists(EXPORT_DIR):
    os.makedirs(EXPORT_DIR)

print(f"‚è≥ Downloading documents from '{COLLECTION_NAME}'...")
docs = db.collection(COLLECTION_NAME).stream()

count = 0
with zipfile.ZipFile(ZIP_FILENAME, 'w', zipfile.ZIP_DEFLATED) as zipf:
    for doc in docs:
        data = doc.to_dict()

        # Convert timestamps to strings so they are readable in JSON
        for key, value in data.items():
            if hasattr(value, 'isoformat'):  # Check if it's a datetime object
                data[key] = value.isoformat()

        # Create a filename for this log (using the document ID)
        json_filename = f"{doc.id}.json"

        # Save to the zip file directly (writing data as a string)
        zipf.writestr(json_filename, json.dumps(data, indent=4))

        count += 1
        if count % 1000 == 0:
            print(f"   Processed {count} logs...")

print(f"\nüéâ SUCCESS! Download complete.")
print(f"üìä Total Records: {count}")
print(f"üì¶ File created: {ZIP_FILENAME}")

‚úÖ Authentication Successful.
‚è≥ Downloading documents from 'logs'...
   Processed 1000 logs...

üéâ SUCCESS! Download complete.
üìä Total Records: 1312
üì¶ File created: SCCAQM_Dataset_Final.zip


In [None]:
import zipfile
import json
import pandas as pd
import numpy as np
import os

# --- CONFIGURATION ---
ZIP_FILE_NAME = 'SCCAQM_Dataset_Final.zip'

# Numeric attributes to report when present. The first group uses the field
# names found in the exported logs; the remaining names are kept so datasets
# using the older naming are still covered.
target_cols = [
    'T_C', 'RH_Pct', 'CO_PPM_ML', 'VOC_PPM_ML', 'Comfort_Metric',
    'Calibrated_CO', 'Calibrated_VOC', 'Heat_Index', 'MQ7_Raw', 'MQ9_Raw',
]


def numeric_stats(frame, cols=None):
    """Return min/max/mean/std (rounded to 2 decimals) for numeric columns.

    Args:
        frame: DataFrame to summarize.
        cols: Optional column names to restrict the summary to. Names that
            are absent from `frame` or not numeric are ignored. None means
            every numeric column.

    Returns:
        A DataFrame with one row per column and the columns
        ['min', 'max', 'mean', 'std'], or None when there is nothing numeric
        to summarize (describe() would raise or lack these fields then).
    """
    numeric = frame.select_dtypes(include='number')
    if cols is None:
        chosen = list(numeric.columns)
    else:
        chosen = [c for c in cols if c in numeric.columns]
    if not chosen:
        return None
    return numeric[chosen].describe().T[['min', 'max', 'mean', 'std']].round(2)


# --- EXTRACT AND PROCESS ---
data_list = []

print("‚è≥ Reading dataset... this may take a moment.")

try:
    with zipfile.ZipFile(ZIP_FILE_NAME, 'r') as z:
        file_list = z.namelist()

        # Filter only JSON files
        json_files = [f for f in file_list if f.endswith('.json')]

        print(f"üìÑ Found {len(json_files)} JSON logs.")

        for filename in json_files:
            with z.open(filename) as f:
                try:
                    entry = json.load(f)
                    data_list.append(entry)
                except Exception as e:
                    # A single unreadable log is reported and skipped.
                    print(f"Skipping bad file {filename}: {e}")

    # Convert to DataFrame (one row per log document)
    df = pd.DataFrame(data_list)

    # --- CLEANING ---
    # format='mixed' lets pandas parse timestamps that do not all share one
    # string layout.
    if 'timestamp' in df.columns:
        df['timestamp'] = pd.to_datetime(df['timestamp'], format='mixed')
        df = df.sort_values('timestamp')

    # --- GENERATE REPORT ---
    print("\n" + "="*40)
    print("      DATASET STATISTICAL REPORT      ")
    print("="*40)

    # 1. Temporal Coverage
    print(f"\n--- 1. TEMPORAL COVERAGE ---")
    if 'timestamp' in df.columns:
        start_date = df['timestamp'].min()
        end_date = df['timestamp'].max()
        duration = end_date - start_date
        print(f"Start Date: {start_date}")
        print(f"End Date:   {end_date}")
        print(f"Duration:   {duration}")
    else:
        print("No 'timestamp' column found.")

    # 2. Volume
    print(f"\n--- 2. DATA VOLUME ---")
    print(f"Total Records (Instances): {len(df)}")
    print(f"Total Columns (Attributes): {len(df.columns)}")
    print(f"Attribute Names: {list(df.columns)}")

    # 3. Sensor Statistics (Numerical)
    print(f"\n--- 3. ATTRIBUTE STATISTICS (Numerical) ---")

    existing_cols = [c for c in target_cols if c in df.columns]
    stats = numeric_stats(df, existing_cols) if existing_cols else None

    if stats is None:
        # None of the expected columns is present as a numeric column:
        # fall back to whatever numeric columns the dataset has.
        print("Could not find standard sensor columns. Printing all numeric stats:")
        stats = numeric_stats(df)

    if stats is None:
        print("No numeric columns available.")
    else:
        print(stats)

    # 4. Categorical Distribution
    print(f"\n--- 4. CATEGORICAL DISTRIBUTIONS ---")
    if 'ML_Diagnosis' in df.columns:
        print("\nDiagnosis Counts:")
        print(df['ML_Diagnosis'].value_counts())

    if 'systemMode' in df.columns:
        print("\nSystem Mode Counts:")
        print(df['systemMode'].value_counts())

    print("\n" + "="*40)
    print("PLEASE COPY THE CONTENT ABOVE THIS LINE")
    print("="*40)

except FileNotFoundError:
    print(f"‚ùå ERROR: Could not find '{ZIP_FILE_NAME}'. Please upload it to Colab.")
except Exception as e:
    print(f"‚ùå An error occurred: {e}")

‚è≥ Reading dataset... this may take a moment.
üìÑ Found 1312 JSON logs.

      DATASET STATISTICAL REPORT      

--- 1. TEMPORAL COVERAGE ---
Start Date: 2025-11-14 05:12:47.599000+00:00
End Date:   2025-11-19 10:13:01.500000+00:00
Duration:   5 days 05:00:13.901000

--- 2. DATA VOLUME ---
Total Records (Instances): 1312
Total Columns (Attributes): 11
Attribute Names: ['Comfort_Metric', 'CO_PPM_ML', 'Flame_Alert', 'ML_Diagnosis', 'RH_Pct', 'T_C', 'VOC_PPM_ML', 'Shock_Mode', 'timestamp', 'Shock_Event', 'Actuator_State']

--- 3. ATTRIBUTE STATISTICS (Numerical) ---
         min   max   mean   std
T_C     25.1  28.3  26.89  0.86
RH_Pct  63.3  95.6  83.51  9.73

--- 4. CATEGORICAL DISTRIBUTIONS ---

Diagnosis Counts:
ML_Diagnosis
Normal             1268
Flame_Alert          43
Intrusion_Alert       1
Name: count, dtype: int64

PLEASE COPY THE CONTENT ABOVE THIS LINE
