<a href="https://colab.research.google.com/github/Krishnadev-cmd/House-Energy/blob/main/training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [20]:
import argparse
import os
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib
import json

In [27]:
def load_data(data_dir):
    """Load parquet files from directory"""
    print(f"Loading data from {data_dir}")

    files = [f for f in os.listdir(data_dir) if f.endswith('.parquet')]
    print(f"Found {len(files)} parquet files")
    dfs = []
    for file in files:
        filepath = os.path.join(data_dir, file)
        df = pd.read_parquet(filepath)
        dfs.append(df)

    df = pd.concat(dfs, ignore_index=True)
    print(f"Total rows loaded: {len(df)}")

    return df
df=load_data('/content/drive/MyDrive/DataSets/parquet/')

print(load_data('/content/drive/MyDrive/DataSets/parquet/'))

Loading data from /content/drive/MyDrive/DataSets/parquet/
Found 1 parquet files
Total rows loaded: 33418
Loading data from /content/drive/MyDrive/DataSets/parquet/
Found 1 parquet files
Total rows loaded: 33418
                 datetime  hour  day_of_week  day  month  year  is_weekend  \
0     2006-12-23 17:00:00    17            7   23     12  2006           1   
1     2006-12-23 18:00:00    18            7   23     12  2006           1   
2     2006-12-23 19:00:00    19            7   23     12  2006           1   
3     2006-12-23 20:00:00    20            7   23     12  2006           1   
4     2006-12-23 21:00:00    21            7   23     12  2006           1   
...                   ...   ...          ...  ...    ...   ...         ...   
33413 2010-11-26 17:00:00    17            6   26     11  2010           0   
33414 2010-11-26 18:00:00    18            6   26     11  2010           0   
33415 2010-11-26 19:00:00    19            6   26     11  2010           0   
33416 20

In [28]:
feature_cols=['hour', 'day_of_week', 'day', 'month', 'is_weekend',
        'Global_active_power_max', 'Global_active_power_min',
        'Global_active_power_std', 'Voltage_mean',
        'power_lag_1h', 'power_lag_24h', 'power_lag_168h',
        'power_rolling_mean_7d', 'power_rolling_std_7d']
target_cols='target_power'

In [29]:
X, y = df[feature_cols], df[target_cols]

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [34]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(26734, 14)
(26734,)
(6684, 14)
(6684,)


In [35]:
model=RandomForestRegressor(n_estimators=100,
        max_depth=20,
        min_samples_split=10,
        min_samples_leaf=5,
        random_state=42,
        n_jobs=-1,
        verbose=1)
model.fit(X_train,y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   13.5s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   29.1s finished


In [36]:
y_train_pred=model.predict(X_train)
y_test_pred=model.predict(X_test)

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.2s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    0.3s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    0.1s finished


In [40]:
train_metrics = {
        'train_rmse': np.sqrt(mean_squared_error(y_train, y_train_pred)),
        'train_mae': mean_absolute_error(y_train, y_train_pred),
        'train_r2': r2_score(y_train, y_train_pred)
    }

test_metrics = {
        'test_rmse': np.sqrt(mean_squared_error(y_test,y_test_pred)),
        'test_mae': mean_absolute_error(y_test,y_test_pred),
        'test_r2': r2_score(y_test, y_test_pred)
    }

print("\nTraining Metrics:")
print(json.dumps(train_metrics, indent=2))

print("\nTest Metrics:")
print(json.dumps(test_metrics, indent=2))


Training Metrics:
{
  "train_rmse": 0.1378240692238027,
  "train_mae": 0.07706410376700092,
  "train_r2": 0.9765201782739339
}

Test Metrics:
{
  "test_rmse": 0.20528608809427282,
  "test_mae": 0.11902636800683324,
  "test_r2": 0.9460570295825881
}


In [41]:
feature_importance = pd.DataFrame({
        'feature': feature_cols,
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False)

print("\nTop 10 Important Features:")
print(feature_importance.head(10))



Top 10 Important Features:
                    feature  importance
6   Global_active_power_min    0.646039
5   Global_active_power_max    0.291309
7   Global_active_power_std    0.032746
8              Voltage_mean    0.006098
9              power_lag_1h    0.004580
0                      hour    0.003411
11           power_lag_168h    0.002806
10            power_lag_24h    0.002734
12    power_rolling_mean_7d    0.002722
13     power_rolling_std_7d    0.002544
