In [None]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem. The call below opens an
# authorization prompt before the mount can proceed.
drive.mount(mountpoint='/content/drive')

# Only reached when drive.mount() returned without raising.
print("Google Drive mounted successfully.")

Mounted at /content/drive
Google Drive mounted successfully.


In [None]:
import zipfile
import os

# --- Paths ---------------------------------------------------------------
# Outer archive that was uploaded to Google Drive.
zip_file_path = "/content/drive/MyDrive/Indoor_air_quality.zip"

# Scratch folder that receives the contents of the outer archive.
temp_unzip_path = "/content/drive/MyDrive/temp_dataset_folder/"

# Name and expected (top-level) location of the nested archive.
inner_zip_name = "aq_dataset.zip"
inner_zip_path = os.path.join(temp_unzip_path, inner_zip_name)

# Final destination for the extracted JSON files.
final_json_path = "/content/drive/MyDrive/aq_dataset_unzipped/"

# Make sure both target folders exist before extracting into them.
os.makedirs(temp_unzip_path, exist_ok=True)
os.makedirs(final_json_path, exist_ok=True)

try:
    # --- First unzip: outer archive -> scratch folder ---
    print(f"Starting to unzip {zip_file_path}...")
    with zipfile.ZipFile(zip_file_path, 'r') as outer_zip:
        outer_zip.extractall(temp_unzip_path)
    print(f"First unzip complete. Files are in {temp_unzip_path}")

    # The nested archive may sit inside a sub-folder of the outer zip rather
    # than at its top level, so fall back to searching the extracted tree.
    if not os.path.exists(inner_zip_path):
        for root, _dirs, files in os.walk(temp_unzip_path):
            if inner_zip_name in files:
                inner_zip_path = os.path.join(root, inner_zip_name)
                break

    # --- Second unzip: nested archive -> final JSON folder ---
    if os.path.exists(inner_zip_path):
        print(f"Found inner zip file: {inner_zip_path}. Unzipping now...")
        with zipfile.ZipFile(inner_zip_path, 'r') as inner_zip:
            inner_zip.extractall(final_json_path)
        print(f"All JSON files are now in: {final_json_path}")

        # Nothing is deleted here, so report that honestly instead of
        # claiming a cleanup that never ran.
        print(f"Temporary files were kept in: {temp_unzip_path}")

    else:
        print(f"ERROR: Could not find '{inner_zip_name}' inside the main zip.")
        print("Please check the contents of 'Indoor_air_quality.zip'.")

except FileNotFoundError:
    print(f"ERROR: File not found at {zip_file_path}")
    print("Please make sure the file name is correct and it's in 'My Drive'.")
except zipfile.BadZipFile as e:
    # Raised when either archive is corrupt or is not a zip file at all.
    print(f"ERROR: Invalid zip archive: {e}")
except Exception as e:
    print(f"An error occurred: {e}")

Starting to unzip /content/drive/MyDrive/Indoor_air_quality.zip...
First unzip complete. Files are in /content/drive/MyDrive/temp_dataset_folder/
ERROR: Could not find 'aq_dataset.zip' inside the main zip.
Please check the contents of 'Indoor_air_quality.zip'.


In [None]:
import pandas as pd
import glob
import os
import json
import gc

# Folder that holds the extracted JSON exports.
json_folder_path = "/content/drive/MyDrive/aq_dataset_unzipped/"
# glob() returns files in an OS-dependent order; sort for a deterministic run.
json_files = sorted(glob.glob(os.path.join(json_folder_path, "*.json")))

# Columns kept from every file. 'time' is required because the data is
# resampled on it; the remaining columns feed the two regression models.
columns_we_need = [
    'time',            # <-- needed for resampling
    'temperature',
    'humidity',
    'MQ7_CO_ppm',      # Model 1
    'MQ135_CO',        # Model 1
    'MQ9_CH4_ppm',     # Model 2
    'CH4'              # Model 2
]


def load_filtered_json(path, columns):
    """Load one JSON export and keep only the requested columns.

    The file must contain a JSON object with a 'columns' list (column names)
    and a 'values' list (one row per entry).

    Args:
        path: Path of the JSON file to read.
        columns: Column names to keep, in the order they should appear.

    Returns:
        A new DataFrame containing only `columns`.

    Raises:
        KeyError: If 'columns'/'values' or any requested column is missing.
    """
    with open(path, 'r') as file:
        data = json.load(file)
    df_full = pd.DataFrame(data=data['values'], columns=data['columns'])
    # Copy so the full-width frame can be released when this function returns.
    return df_full[columns].copy()


def resample_minute_means(df, time_col='time'):
    """Average all rows of `df` that fall within the same minute.

    Note: `df[time_col]` is converted to datetime in place so the (large)
    frame does not have to be copied.

    Args:
        df: Frame with a timestamp column and the columns to average.
        time_col: Name of the timestamp column.

    Returns:
        A DataFrame indexed by minute (index named `time_col`) holding the mean
        of every other column; minutes without any rows contain NaN.
    """
    df[time_col] = pd.to_datetime(df[time_col])
    # '1min' replaces the deprecated '1T' alias; `on=` resamples on the column
    # directly, so no separate set_index() step is needed.
    return df.resample('1min', on=time_col).mean()


if not json_files:
    print(f"ERROR: No .json files found in {json_folder_path}")
else:
    print(f"Found {len(json_files)} JSON files. Loading and filtering...")

    df_list = []

    for f in json_files:
        try:
            print(f"Processing {f}...")
            df_list.append(load_filtered_json(f, columns_we_need))
        except Exception as e:
            # A bad file is reported and skipped so the other files still load.
            print(f"Error loading file {f}: {e}")
        # Release the per-file intermediates before reading the next file.
        gc.collect()

    if df_list:
        print("\nAll files processed. Concatenating...")

        df_raw = pd.concat(df_list, ignore_index=True)
        del df_list
        gc.collect()

        print("Data combined. Now converting time and resampling...")
        print("Converting 'time' column to datetime (this may take a while)...")
        print("Resampling data to 1-minute averages...")
        df_resampled = resample_minute_means(df_raw)

        print(f"Original rows: {len(df_raw)}, Resampled rows: {len(df_resampled)}")

        # The raw frame is no longer needed once the resampled one exists.
        del df_raw
        gc.collect()

        # --- Model 1 (CO) DataFrame ---
        print("\n--- Processing CO Model Data ---")
        co_cols = ['MQ7_CO_ppm', 'MQ135_CO', 'temperature', 'humidity']
        df_co = df_resampled[co_cols]
        print(f"Resampled CO rows: {len(df_co)}")
        df_co = df_co.dropna()  # Drop minutes missing any CO-model column
        print(f"Clean CO rows for training: {len(df_co)}")
        print(df_co.head())

        # --- Model 2 (VOC) DataFrame ---
        print("\n--- Processing VOC Model Data ---")
        voc_cols = ['MQ9_CH4_ppm', 'CH4', 'temperature', 'humidity']
        df_voc = df_resampled[voc_cols]
        print(f"Resampled VOC rows: {len(df_voc)}")
        df_voc = df_voc.dropna()  # Drop minutes missing any VOC-model column
        print(f"Clean VOC rows for training: {len(df_voc)}")
        print(df_voc.head())

        print("\nSuccessfully created 'df_co' and 'df_voc'.")
        print("You can now run Step 5 and 6.")

    else:
        print("\nNo data was loaded. Please check the file paths and structure.")

Found 13 JSON files. Loading and filtering...
Processing /content/drive/MyDrive/aq_dataset_unzipped/2023-12-09_2023-12-31.json...
Processing /content/drive/MyDrive/aq_dataset_unzipped/2024-01-01_2024-01-31.json...
Processing /content/drive/MyDrive/aq_dataset_unzipped/2024-02-01_2024-02-29.json...
Processing /content/drive/MyDrive/aq_dataset_unzipped/2024-03-01_2024-03-31.json...
Processing /content/drive/MyDrive/aq_dataset_unzipped/2024-04-01_2024-04-30.json...
Processing /content/drive/MyDrive/aq_dataset_unzipped/2024-05-01_2024-05-31.json...
Processing /content/drive/MyDrive/aq_dataset_unzipped/2024-06-01_2024-06-30.json...
Processing /content/drive/MyDrive/aq_dataset_unzipped/2024-07-01_2024-07-31.json...
Processing /content/drive/MyDrive/aq_dataset_unzipped/2024-08-01_2024-08-31.json...
Processing /content/drive/MyDrive/aq_dataset_unzipped/2024-09-01_2024-09-30.json...
Processing /content/drive/MyDrive/aq_dataset_unzipped/2024-10-01_2024-10-31.json...
Processing /content/drive/MyDr

  df_resampled = df_raw.resample('1T').mean()


Original rows: 12790707, Resampled rows: 528480

--- Processing CO Model Data ---
Resampled CO rows: 528480
Clean CO rows for training: 519205
                           MQ7_CO_ppm  MQ135_CO  temperature   humidity
time                                                                   
2023-12-09 00:00:00+00:00        69.0  3.927202    23.950000  33.025000
2023-12-09 00:01:00+00:00        69.0  3.822472    24.200000  33.133333
2023-12-09 00:02:00+00:00        64.0  4.219279    24.166667  33.133333
2023-12-09 00:03:00+00:00        69.0  3.653514    24.166667  33.133333
2023-12-09 00:04:00+00:00        63.0  4.219279    24.166667  33.133333

--- Processing VOC Model Data ---
Resampled VOC rows: 528480
Clean VOC rows for training: 493506
                           MQ9_CH4_ppm    CH4  temperature   humidity
time                                                                 
2023-12-09 00:00:00+00:00    10.437220  576.0    23.950000  33.025000
2023-12-09 00:01:00+00:00    10.437220  575.0

In [None]:
from sklearn.linear_model import LinearRegression

print("--- Training Model 1 (CO Calibration) ---")

# Fit a linear model that predicts the 'MQ135_CO' column from the
# 'MQ7_CO_ppm' reading plus temperature and humidity.
# Requires 'df_co' from the data-preparation cell.
features_co = ['MQ7_CO_ppm', 'temperature', 'humidity']
target_co = 'MQ135_CO'

X_co = df_co[features_co]
y_co = df_co[target_co]

# Ordinary least-squares fit.
model_co = LinearRegression()
model_co.fit(X_co, y_co)

print("\nModel training complete.")
print("The formula is: y = (A * MQ7_CO) + (B * temp) + (C * humid) + Intercept\n")

# coef_ holds one coefficient per feature, in the same order as features_co.
coeffs_co = model_co.coef_
intercept_co = model_co.intercept_

# The banner previously contained a mis-encoded clipboard emoji (mojibake).
print("--- 📋 COPY THESE VALUES FOR YOUR ESP32 ---")
print(f"Coefficient A (for MQ7_CO_ppm): {coeffs_co[0]}")
print(f"Coefficient B (for temperature): {coeffs_co[1]}")
print(f"Coefficient C (for humidity):    {coeffs_co[2]}")
print(f"Intercept:                       {intercept_co}")
print("------------------------------------------")

--- Training Model 1 (CO Calibration) ---

Model training complete.
The formula is: y = (A * MQ7_CO) + (B * temp) + (C * humid) + Intercept

--- 📋 COPY THESE VALUES FOR YOUR ESP32 ---
Coefficient A (for MQ7_CO_ppm): 0.20667453673791028
Coefficient B (for temperature): -0.0843970229794428
Coefficient C (for humidity):    0.004321370524231025
Intercept:                       -8.055104846752645
------------------------------------------


In [None]:
from sklearn.linear_model import LinearRegression

print("--- Training Model 2 (VOC/CH4 Calibration) ---")

# Fit a linear model that predicts the 'CH4' column from the
# 'MQ9_CH4_ppm' reading plus temperature and humidity.
# Requires 'df_voc' from the data-preparation cell.
features_voc = ['MQ9_CH4_ppm', 'temperature', 'humidity']
target_voc = 'CH4'

X_voc = df_voc[features_voc]
y_voc = df_voc[target_voc]

# Ordinary least-squares fit.
model_voc = LinearRegression()
model_voc.fit(X_voc, y_voc)

print("\nModel training complete.")
print("The formula is: y = (A * MQ9_CH4) + (B * temp) + (C * humid) + Intercept\n")

# coef_ holds one coefficient per feature, in the same order as features_voc.
coeffs_voc = model_voc.coef_
intercept_voc = model_voc.intercept_

# The banner previously contained a mis-encoded clipboard emoji (mojibake).
print("--- 📋 COPY THESE VALUES FOR YOUR ESP32 ---")
print(f"Coefficient A (for MQ9_CH4_ppm): {coeffs_voc[0]}")
print(f"Coefficient B (for temperature): {coeffs_voc[1]}")
print(f"Coefficient C (for humidity):    {coeffs_voc[2]}")
print(f"Intercept:                       {intercept_voc}")
print("------------------------------------------")

--- Training Model 2 (VOC/CH4 Calibration) ---

Model training complete.
The formula is: y = (A * MQ9_CH4) + (B * temp) + (C * humid) + Intercept

--- 📋 COPY THESE VALUES FOR YOUR ESP32 ---
Coefficient A (for MQ9_CH4_ppm): 22.681154964538678
Coefficient B (for temperature): -40.81730199744915
Coefficient C (for humidity):    -0.6595396060778214
Intercept:                       1811.8701469517512
------------------------------------------
