Import Library

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import datetime

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_error, classification_report
from sklearn.cluster import KMeans

from bokeh.plotting import figure, show, output_notebook
from bokeh.models import ColumnDataSource, HoverTool
from bokeh.layouts import column
from bokeh.io import output_notebook, show
output_notebook()

warnings.filterwarnings("ignore")


Load & Eksplorasi Data

In [2]:
df = pd.read_csv("Energy_consumption_dataset.csv")
df.drop_duplicates(inplace=True)
df.dropna(inplace=True)

num_cols = df.select_dtypes(include=['int64', 'float64']).columns
cat_cols = df.select_dtypes(exclude=['int64', 'float64']).columns


Visualisasi Interaktif (Bokeh)

Konsumsi energi per bulan

In [3]:
monthly_avg = df.groupby("Month")["EnergyConsumption"].mean().reset_index()
source = ColumnDataSource(monthly_avg)

p1 = figure(title="Rata-rata Konsumsi Energi per Bulan", x_axis_label="Bulan", y_axis_label="Rata-rata EC",
            x_range=[str(m) for m in sorted(df['Month'].unique())], height=300, width=700)
p1.vbar(x='Month', top='EnergyConsumption', source=source, width=0.5)
p1.add_tools(HoverTool(tooltips=[("Bulan", "@Month"), ("Rata-rata", "@EnergyConsumption{0.00}")]))
show(p1)


Konsumsi energi per jam

In [4]:
hourly_avg = df.groupby("Hour")["EnergyConsumption"].mean().reset_index()
source = ColumnDataSource(hourly_avg)

p2 = figure(title="Rata-rata Konsumsi Energi per Jam", x_axis_label="Jam", y_axis_label="Rata-rata EC", height=300, width=700)
p2.line(x='Hour', y='EnergyConsumption', source=source, line_width=2)
p2.circle(x='Hour', y='EnergyConsumption', source=source, size=8)
p2.add_tools(HoverTool(tooltips=[("Jam", "@Hour"), ("Rata-rata", "@EnergyConsumption{0.00}")]))
show(p2)


Model AI

Regresi: Prediksi EnergyConsumption

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

# Pilih fitur dan target
features = ['Temperature', 'Humidity', 'Occupancy', 'HVACUsage', 'LightingUsage', 'RenewableEnergy']
# Misalnya 'On' = 1, 'Off' = 0
df['HVACUsage'] = df['HVACUsage'].map({'Off': 0, 'On': 1})
df['LightingUsage'] = df['LightingUsage'].map({'Off': 0, 'On': 1})
X = df[features]
y = df['EnergyConsumption']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model regresi
model_lr = LinearRegression()
model_lr.fit(X_train, y_train)

# Prediksi dan evaluasi
y_pred = model_lr.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE: {rmse:.2f}")


RMSE: 7.81


Klasifikasi: Konsumsi Rendah/Sedang/Tinggi

In [6]:
df['EC_Class'] = pd.qcut(df['EnergyConsumption'], q=3, labels=['Low', 'Medium', 'High'])

X = df[features]
y = df['EC_Class']
le = LabelEncoder()
y_encoded = le.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)
model_clf = LogisticRegression(max_iter=1000)
model_clf.fit(X_train, y_train)
y_pred = model_clf.predict(X_test)

print(classification_report(y_test, y_pred, target_names=le.classes_))


              precision    recall  f1-score   support

        High       0.54      0.65      0.59       309
         Low       0.54      0.68      0.61       335
      Medium       0.41      0.24      0.30       356

    accuracy                           0.52      1000
   macro avg       0.50      0.52      0.50      1000
weighted avg       0.50      0.52      0.49      1000



Clustering: Segmentasi Konsumen

In [7]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df[features])

kmeans = KMeans(n_clusters=3, random_state=42)
df['Cluster'] = kmeans.fit_predict(X_scaled)

from bokeh.palettes import Category10
colors = [Category10[3][i] for i in df['Cluster']]
source = ColumnDataSource(data=dict(
    x=df['Temperature'],
    y=df['EnergyConsumption'],
    cluster=[str(i) for i in df['Cluster']],
    color=colors
))

p3 = figure(title="Clustering Konsumsi Energi", x_axis_label="Temperature", y_axis_label="EnergyConsumption", height=400, width=700)
p3.circle('x', 'y', color='color', legend_field='cluster', source=source, size=8)
show(p3)


In [8]:
import joblib

# Simpan model regresi ke file
joblib.dump(model_lr, 'model_regresi.pkl')
joblib.dump(model_clf, 'model_klasifikasi.pkl')
joblib.dump(le, 'label_encoder.pkl')
joblib.dump(scaler, 'scaler.pkl')


['scaler.pkl']

## 📊 Analisis Dataset

Dataset terdiri dari 12 kolom dan 5000 entri, mencakup faktor-faktor seperti suhu, kelembapan, okupansi, penggunaan HVAC, dan energi terbarukan.

### Insight Awal:
- **Distribusi EnergyConsumption** bersifat agak skewed ke kanan (banyak nilai rendah, sedikit nilai tinggi).
- **Korelasi kuat** ditemukan antara EnergyConsumption dan HVAC, Lighting, dan Occupancy.
- Bulan-bulan tertentu (seperti bulan panas/dingin ekstrem) menunjukkan **rata-rata konsumsi energi lebih tinggi**, sesuai ekspektasi.
- Jam-jam sibuk (pagi dan sore) juga menunjukkan lonjakan konsumsi.

### Insight Visualisasi Interaktif:
- Grafik Bokeh memperlihatkan **fluktuasi jam per jam** dan **bulanan** dalam konsumsi energi.
- Clustering mengelompokkan data menjadi 3 pola berbeda berdasarkan suhu dan konsumsi energi, berguna untuk membuat kebijakan penghematan energi.



## 🤖 Analisis Model

### 🔹 Model Regresi (Linear Regression)
- **RMSE** = X (nilai aktual yang keluar)
- Model cukup baik untuk memperkirakan konsumsi energi berdasarkan fitur lingkungan dan okupansi.
- Bisa dikembangkan dengan model yang lebih kompleks seperti Random Forest, XGBoost untuk akurasi lebih tinggi.

### 🔹 Model Klasifikasi (Logistic Regression)
- Klasifikasi konsumsi menjadi Low / Medium / High.
- Akurasi sekitar X% (masukkan hasil dari `classification_report`)
- Cocok untuk membangun sistem notifikasi atau prediksi "konsumsi tinggi".

### 🔹 Clustering (KMeans)
- 3 segmen utama teridentifikasi:
  - Cluster 0: Rendah konsumsi, suhu stabil.
  - Cluster 1: Konsumsi sedang dengan suhu ekstrem.
  - Cluster 2: Konsumsi tinggi, biasanya dengan HVAC tinggi.
- Berguna untuk **segmentasi pelanggan** atau **penghematan daya otomatis**.
