In [None]:
# -------------------------------------------------
# CELL 1: Install + Kaggle
# -------------------------------------------------
!pip install -q kaggle dvc[s3] mlflow fastapi uvicorn streamlit xgboost scikit-learn pandas numpy matplotlib seaborn torch

from google.colab import files
print("Upload kaggle.json")
uploaded = files.upload()

!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
print("Kaggle ready!")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.8/8.8 MB[0m [31m41.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m45.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m30.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m63.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m147.8/147.8 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.2/78.2 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m438.8/438.8 kB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.8/41.8 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

Saving kaggle.json to kaggle.json
Kaggle ready!


In [None]:
# -------------------------------------------------
# CELL 2: Download Dataset
# -------------------------------------------------
!kaggle datasets download -d pralabhpoudel/world-energy-consumption -p data/raw/
!unzip -q data/raw/world-energy-consumption.zip -d data/raw/
print("Dataset downloaded!")

Dataset URL: https://www.kaggle.com/datasets/pralabhpoudel/world-energy-consumption
License(s): Attribution 4.0 International (CC BY 4.0)
Downloading world-energy-consumption.zip to data/raw
  0% 0.00/2.25M [00:00<?, ?B/s]
100% 2.25M/2.25M [00:00<00:00, 856MB/s]
Dataset downloaded!


In [None]:
# -------------------------------------------------
# CELL 3: Create Folders
# -------------------------------------------------
import os

# Project skeleton: every sub-directory that later cells write files into,
# expressed relative to the project root folder.
_PROJECT_ROOT = "energyglobal"
_SUBDIRS = (
    "data/raw",
    "data/processed",
    "models",
    "src",
    "api",
    "dashboard",
    ".github/workflows",
)

folders = [f"{_PROJECT_ROOT}/{subdir}" for subdir in _SUBDIRS]

# exist_ok=True keeps the cell safe to re-run.
for folder in folders:
    os.makedirs(folder, exist_ok=True)

In [None]:
# -------------------------------------------------
# CELL 4: ETL – REAL COLUMNS ONLY
# -------------------------------------------------
%%writefile energyglobal/src/etl.py
import pandas as pd
import numpy as np
import os

def run_etl(
    csv_path="/content/data/raw/World Energy Consumption.csv",
    out_path="/content/energyglobal/data/processed/energy_data.parquet",
    min_year=2000,
):
    """Clean the raw energy CSV, derive model features and save them as Parquet.

    Steps: drop rows lacking country/year/consumption/population, drop the
    "World" aggregate, parse the year, add ``log_consumption``,
    ``gdp_per_capita`` and a per-country 3-year mean ``growth_rate``, keep rows
    from ``min_year`` onwards and write the result to ``out_path``.

    Args:
        csv_path: Path of the raw CSV to read.
        out_path: Path of the Parquet file to write; its parent directory is
            created when missing.
        min_year: First calendar year kept in the output (inclusive).

    Raises:
        KeyError: If a required column (including ``gdp``) is missing.
    """
    df = pd.read_csv(csv_path)

    # Keep only rows with country, year, consumption and population.
    required = ['country', 'year', 'primary_energy_consumption', 'population']
    df = df.dropna(subset=required)
    # .copy() so the column assignments below act on an independent frame
    # rather than on a slice of the frame returned by read_csv.
    df = df[df['country'] != 'World'].copy()

    # Convert year; going through int -> str also accepts a float-typed column.
    df['year'] = pd.to_datetime(df['year'].astype(int).astype(str), format='%Y')

    # The growth feature is order dependent: make every country's rows
    # contiguous and chronological before computing it.
    df = df.sort_values(['country', 'year'])

    df['log_consumption'] = np.log1p(df['primary_energy_consumption'])
    df['gdp_per_capita'] = df['gdp'] / df['population']

    # Year-over-year change per country. A previous value of 0 yields +/-inf,
    # which would poison the rolling mean and the model input, so treat it as
    # missing.
    yearly_change = (
        df.groupby('country')['primary_energy_consumption']
        .pct_change()
        .replace([np.inf, -np.inf], np.nan)
    )
    # The rolling window is applied inside each country's group so it can never
    # span two different countries.
    df['growth_rate'] = yearly_change.groupby(df['country']).transform(
        lambda changes: changes.rolling(3).mean()
    )

    # Filter only after the features are built so the first kept years can
    # still use earlier history.
    df = df[df['year'].dt.year >= min_year]

    # Save
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    df.to_parquet(out_path)
    print("ETL DONE")


if __name__ == "__main__":
    run_etl()

Writing energyglobal/src/etl.py


In [None]:
# -------------------------------------------------
# CELL 5: Run ETL
# -------------------------------------------------
# Run the ETL script in a separate Python process; the script prints "ETL DONE"
# after it has written the processed Parquet file.
!python energyglobal/src/etl.py

ETL DONE


In [None]:
# -------------------------------------------------
# CELL 6: DVC
# -------------------------------------------------
!cd energyglobal && dvc init --no-scm
!cd energyglobal && dvc add data/processed/energy_data.parquet
!cd energyglobal && dvc remote add -d local /content/dvc-storage
!cd energyglobal && dvc push

Initialized DVC repository.

[31m+---------------------------------------------------------------------+
[0m[31m|[0m                                                                     [31m|[0m
[31m|[0m        DVC has enabled anonymous aggregate usage analytics.         [31m|[0m
[31m|[0m     Read the analytics documentation (and how to opt-out) here:     [31m|[0m
[31m|[0m             <[36mhttps://dvc.org/doc/user-guide/analytics[39m>              [31m|[0m
[31m|[0m                                                                     [31m|[0m
[31m+---------------------------------------------------------------------+
[0m
[33mWhat's next?[39m
[33m------------[39m
- Check out the documentation: <[36mhttps://dvc.org/doc[39m>
- Get help and share ideas: <[36mhttps://dvc.org/chat[39m>
- Star us on GitHub: <[36mhttps://github.com/iterative/dvc[39m>
[?25l[32m⠋[0m Checking graph
Adding...:   0% 0/1 [00:00<?, ?file/s{'info': ''}]
![A
          |0.00 [00:00, 

In [None]:
# -------------------------------------------------
# CELL 7: TRAIN – ONLY REAL COLUMNS
# -------------------------------------------------
%%writefile energyglobal/src/train.py
import mlflow
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor

def main():
    """Train the XGBoost consumption model, log it to MLflow and save it.

    Reads ``data/processed/energy_data.parquet`` and writes
    ``models/consumption_model.json``, both resolved relative to the project
    root (the parent of this script's ``src/`` directory). Prints the
    mean absolute error on the held-out 20 % split.

    Raises:
        ValueError: If no row has all feature columns and the target present.
    """
    from pathlib import Path

    # Resolve paths from this file's location instead of a hard-coded /content
    # prefix, so the script also works where `dvc repro` runs it (CI, Docker).
    # In Colab this still resolves to /content/energyglobal/...
    project_root = Path(__file__).resolve().parent.parent
    data_path = project_root / "data" / "processed" / "energy_data.parquet"
    model_path = project_root / "models" / "consumption_model.json"

    # ONLY COLUMNS THAT EXIST
    feature_cols = ['population', 'gdp', 'gdp_per_capita', 'growth_rate']
    target = 'primary_energy_consumption'
    params = {"n_estimators": 200, "learning_rate": 0.1, "max_depth": 6}

    mlflow.set_experiment("energyglobal-forecast")

    with mlflow.start_run():
        df = pd.read_parquet(data_path)
        df = df.dropna(subset=feature_cols + [target])
        if df.empty:
            raise ValueError(
                f"No complete rows for {feature_cols + [target]} in {data_path}"
            )

        X = df[feature_cols]
        y = df[target]

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        model = XGBRegressor(**params)
        model.fit(X_train, y_train)

        pred = model.predict(X_test)
        mae = mean_absolute_error(y_test, pred)

        # Record the hyper-parameters next to the metric so runs are comparable.
        mlflow.log_params(params)
        mlflow.log_metric("mae", mae)
        mlflow.xgboost.log_model(model, "model")

        # models/ is an empty directory in a fresh checkout and may not exist.
        model_path.parent.mkdir(parents=True, exist_ok=True)
        model.save_model(str(model_path))

        print(f"Training DONE – MAE = {mae:.2f}")


if __name__ == "__main__":
    main()

Writing energyglobal/src/train.py


In [None]:
# -------------------------------------------------
# CELL 8: RUN TRAINING (WORKS!)
# -------------------------------------------------
# Run the training script from inside the project directory; on completion it
# prints the mean absolute error measured on its test split.
!cd energyglobal && python src/train.py

2025/10/31 01:08:02 INFO mlflow.tracking.fluent: Experiment with name 'energyglobal-forecast' does not exist. Creating a new experiment.
  self.get_booster().save_model(fname)
Training DONE – MAE = 85.73


In [None]:
# -------------------------------------------------
# CELL 9: FastAPI – REAL INPUTS
# -------------------------------------------------
%%writefile energyglobal/api/main.py
from fastapi import FastAPI
from pydantic import BaseModel
import xgboost as xgb
import pandas as pd

from pathlib import Path

app = FastAPI()

# Locate the model relative to this file rather than the process working
# directory. "../models/..." only resolved when uvicorn was started from inside
# api/; the project's Dockerfile starts `uvicorn api.main:app` from the project
# root, where that relative path points outside the project.
MODEL_PATH = Path(__file__).resolve().parent.parent / "models" / "consumption_model.json"

# Loaded once at import time and shared by all requests.
model = xgb.XGBRegressor()
model.load_model(str(MODEL_PATH))

# Request body of POST /forecast. The four field names match the feature
# columns listed in src/train.py; their declaration order is the column order
# of the one-row DataFrame built in forecast().
class Input(BaseModel):
    population: float
    gdp: float
    gdp_per_capita: float  # the ETL derives this feature as gdp / population
    growth_rate: float  # the ETL derives this from pct_change() of consumption (a fraction)

@app.post("/forecast")
def forecast(data: Input):
    """Predict consumption for one set of inputs.

    Args:
        data: Validated request body with the four model features.

    Returns:
        dict with ``predicted_TWh`` (model output rounded to 2 decimals),
        ``savings_TWh`` (25 % of the prediction, rounded) and ``advice``
        ("Go green!" when the prediction exceeds 500, else "Sustainable").
    """
    # pydantic v2 renamed .dict() to .model_dump(); support both versions.
    payload = data.model_dump() if hasattr(data, "model_dump") else data.dict()
    df = pd.DataFrame([payload])

    # predict() yields a NumPy float32, and round() of it is still a NumPy
    # scalar, which FastAPI's JSON encoder cannot serialise. Convert to a
    # built-in float before building the response.
    pred = float(model.predict(df)[0])
    savings = pred * 0.25
    return {
        "predicted_TWh": round(pred, 2),
        "savings_TWh": round(savings, 2),
        "advice": "Go green!" if pred > 500 else "Sustainable"
    }

Writing energyglobal/api/main.py


In [None]:
# -------------------------------------------------
# CELL 10: Streamlit
# -------------------------------------------------
%%writefile energyglobal/dashboard/app.py
import streamlit as st
import requests

st.title("EnergyGlobal – AI Energy Forecaster")

API_URL = "http://localhost:8000/forecast"
REQUEST_TIMEOUT_S = 10

# Sliders use display units: millions of people, trillions of $, percent.
pop = st.slider("Population (M)", 1, 1500, 80)
gdp = st.slider("GDP (T$)", 0.1, 30.0, 4.0)
gpc = st.slider("GDP/capita ($)", 500, 100000, 50000)
gr = st.slider("Growth Rate (%)", -5.0, 15.0, 2.5)

if st.button("Forecast"):
    # Convert display units to the scale of the training features: the ETL
    # builds gdp_per_capita as gdp / population (plain $ per the slider label)
    # and growth_rate from pct_change(), i.e. a fraction rather than a percent.
    # NOTE(review): assumes the dataset stores population in persons and gdp
    # in $ -- confirm against the raw CSV.
    payload = {
        "population": pop * 1e6,
        "gdp": gdp * 1e12,
        "gdp_per_capita": gpc,
        "growth_rate": gr / 100,
    }
    try:
        response = requests.post(API_URL, json=payload, timeout=REQUEST_TIMEOUT_S)
        response.raise_for_status()
        r = response.json()
        predicted, savings, advice = r['predicted_TWh'], r['savings_TWh'], r['advice']
    except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
        st.error("API not running")
    except (requests.exceptions.RequestException, ValueError, KeyError) as exc:
        # The API answered, but with an error status or an unexpected body;
        # report that instead of claiming the API is down.
        st.error(f"Forecast request failed: {exc}")
    else:
        st.success(f"**Predicted:** {predicted} TWh")
        st.metric("Savings", f"{savings} TWh")
        st.info(advice)

Writing energyglobal/dashboard/app.py


In [None]:
# -------------------------------------------------
# CELL 11: DVC + CI/CD + Docker
# -------------------------------------------------
from pathlib import Path

# A cell may contain only ONE %%writefile magic (it must be the cell's first
# line). Everything after it -- including further "%%writefile" lines -- is
# written verbatim into the first file. The four files are therefore written
# with plain Python here.
_DVC_YAML = """\
stages:
  etl: {cmd: python src/etl.py, deps: [data/raw/], outs: [data/processed/energy_data.parquet]}
  train: {cmd: python src/train.py, deps: [data/processed/energy_data.parquet], outs: [models/consumption_model.json]}
"""

# `with:` needs its keys on the following, further-indented lines; the
# one-line form "with: python-version: '3.9'" is not valid YAML.
_CI_CD_YML = """\
name: CI/CD
on: [push]
jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - uses: actions/setup-python@v4
        with:
          python-version: '3.9'
      - run: pip install -r requirements.txt
      - run: dvc repro
"""

_DOCKERFILE = """\
FROM python:3.9
WORKDIR /app
COPY requirements.txt .
RUN pip install -r requirements.txt
COPY . .
CMD ["uvicorn", "api.main:app", "--host", "0.0.0.0"]
"""

_REQUIREMENTS_TXT = """\
fastapi
uvicorn
streamlit
xgboost
scikit-learn
pandas
numpy
dvc[s3]
mlflow
"""

_PROJECT_FILES = {
    "energyglobal/dvc.yaml": _DVC_YAML,
    "energyglobal/.github/workflows/ci-cd.yml": _CI_CD_YML,
    "energyglobal/Dockerfile": _DOCKERFILE,
    "energyglobal/requirements.txt": _REQUIREMENTS_TXT,
}

for _rel_path, _content in _PROJECT_FILES.items():
    _target = Path(_rel_path)
    _target.parent.mkdir(parents=True, exist_ok=True)
    _target.write_text(_content, encoding="utf-8")
    print(f"Writing {_rel_path}")

Writing energyglobal/dvc.yaml


In [None]:
# -------------------------------------------------
# CELL 12: DOWNLOAD
# -------------------------------------------------
import shutil
from google.colab import files

# Pack the contents of the energyglobal/ folder into energyglobal.zip in the
# current working directory ...
shutil.make_archive(
    base_name="energyglobal",
    format='zip',
    root_dir="energyglobal",
)
# ... and hand the archive to the browser as a download.
files.download("energyglobal.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# CELL 13: Add LSTM Time-Series Model (Advanced!)
%%writefile energyglobal/src/train_lstm.py
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import mlflow

class EnergyDataset(Dataset):
    """Sliding-window dataset over one numeric column of a DataFrame.

    Sample ``i`` is the pair ``(values[i : i + seq_len], values[i + seq_len])``:
    a window of ``seq_len`` consecutive values and the value that follows it.

    Args:
        df: Frame holding the series, already in chronological order.
        seq_len: Number of consecutive values in each input window (>= 1).
        column: Name of the column to read the series from.

    Raises:
        ValueError: If ``seq_len`` is smaller than 1.
    """

    def __init__(self, df, seq_len=5, column='primary_energy_consumption'):
        if seq_len < 1:
            raise ValueError(f"seq_len must be >= 1, got {seq_len}")
        self.seq_len = seq_len
        # Double brackets keep a 2-D (n_rows, 1) array, so every window is
        # (seq_len, 1) and every target is (1,).
        self.data = df[[column]].values

    def __len__(self):
        # A series with seq_len rows or fewer has no complete (window, target)
        # pair. len() must not return a negative number, so clamp at zero.
        return max(0, len(self.data) - self.seq_len)

    def __getitem__(self, idx):
        n_samples = len(self)
        if idx < 0:
            idx += n_samples  # standard negative indexing
        if not 0 <= idx < n_samples:
            raise IndexError(f"index out of range for dataset of length {n_samples}")
        window = self.data[idx:idx + self.seq_len]
        target = self.data[idx + self.seq_len]
        # as_tensor with an explicit dtype also accepts integer-typed columns.
        return (
            torch.as_tensor(window, dtype=torch.float32),
            torch.as_tensor(target, dtype=torch.float32),
        )

class LSTMModel(nn.Module):
    """One-layer LSTM (1 input feature, 50 hidden units) with a linear head.

    The head maps the final hidden state to a single output value per sequence.
    """

    def __init__(self):
        super().__init__()
        # The attribute names `lstm` and `fc` become the state_dict key
        # prefixes of saved checkpoints.
        self.lstm = nn.LSTM(input_size=1, hidden_size=50, batch_first=True)
        self.fc = nn.Linear(in_features=50, out_features=1)

    def forward(self, x):
        # Only the final hidden state is used; the per-step outputs and the
        # cell state are discarded.
        _step_outputs, (hidden, _cell) = self.lstm(x)
        # Drop the leading num_layers axis (size 1 for this single-layer LSTM).
        last_hidden = hidden.squeeze(0)
        return self.fc(last_hidden)

# Train
import json
from pathlib import Path

SEED = 42
SEQ_LEN = 5
# On standardised data a few hundred full-batch Adam steps are cheap for a
# series this short; the previous 10 steps at lr=1e-3 could barely move the
# weights away from their initial values.
EPOCHS = 200
LEARNING_RATE = 0.001
COUNTRY = 'Germany'
TARGET_COL = 'primary_energy_consumption'

# Resolve project paths from this file's location (in Colab this is still
# /content/energyglobal) instead of hard-coding the /content prefix.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
DATA_PATH = PROJECT_ROOT / "data" / "processed" / "energy_data.parquet"
MODELS_DIR = PROJECT_ROOT / "models"

torch.manual_seed(SEED)  # reproducible weight initialisation

df = pd.read_parquet(DATA_PATH)
df = df[df['country'] == COUNTRY].sort_values('year')
if len(df) <= SEQ_LEN:
    raise ValueError(
        f"Need more than {SEQ_LEN} yearly rows for {COUNTRY}, found {len(df)}"
    )

# Standardise the series. The network's initial outputs are close to zero, so
# fitting raw consumption values directly leaves it far from the targets.
series_mean = float(df[TARGET_COL].mean())
series_std = float(df[TARGET_COL].std()) or 1.0  # constant series -> avoid /0
df_scaled = df.assign(**{TARGET_COL: (df[TARGET_COL] - series_mean) / series_std})

dataset = EnergyDataset(df_scaled, seq_len=SEQ_LEN)
loader = DataLoader(dataset, batch_size=32)

model = LSTMModel()
opt = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.MSELoss()

mlflow.set_experiment("energyglobal-lstm")
with mlflow.start_run():
    mlflow.log_params({
        "country": COUNTRY,
        "seq_len": SEQ_LEN,
        "epochs": EPOCHS,
        "lr": LEARNING_RATE,
        "seed": SEED,
        "series_mean": series_mean,
        "series_std": series_std,
    })
    for epoch in range(EPOCHS):
        running_loss = 0.0
        for x, y in loader:
            opt.zero_grad()
            pred = model(x)
            loss = criterion(pred, y)
            loss.backward()
            opt.step()
            running_loss += loss.item() * len(x)
        # Mean squared error on the standardised scale, one point per epoch.
        mlflow.log_metric("train_mse_scaled", running_loss / len(dataset), step=epoch)
    mlflow.pytorch.log_model(model, "lstm_model")

    MODELS_DIR.mkdir(parents=True, exist_ok=True)
    torch.save(model.state_dict(), str(MODELS_DIR / "lstm_germany.pth"))
    # The weights operate on standardised values; keep the statistics needed to
    # map predictions back to the original scale next to the checkpoint.
    (MODELS_DIR / "lstm_germany_scaler.json").write_text(
        json.dumps({"mean": series_mean, "std": series_std}), encoding="utf-8"
    )
    print("LSTM trained for Germany!")

Writing energyglobal/src/train_lstm.py


In [None]:
# CELL 14: Run LSTM
# Run the LSTM training script from inside the project directory; its last
# action is printing "LSTM trained for Germany!".
!cd energyglobal && python src/train_lstm.py

2025/10/31 01:11:57 INFO mlflow.tracking.fluent: Experiment with name 'energyglobal-lstm' does not exist. Creating a new experiment.
LSTM trained for Germany!


In [None]:
# CELL 15: Add Forecasting Dashboard Tab
%%writefile -a energyglobal/dashboard/app.py

# app.py only imports streamlit and requests; without this import the
# pd.DataFrame call below raises NameError when the button is pressed.
import pandas as pd

st.sidebar.title("Advanced Mode")
mode = st.sidebar.radio("Choose", ["Simple Forecast", "Germany LSTM Forecast"])

if mode == "Germany LSTM Forecast":
    st.header("Germany 5-Year Forecast (LSTM)")
    if st.button("Run LSTM Forecast"):
        # Mock forecast: fixed placeholder numbers, not output of the trained
        # LSTM checkpoint.
        years = [2025, 2026, 2027, 2028, 2029]
        values = [580, 595, 610, 620, 635]
        # Put the years on the index so they form the x-axis. As an ordinary
        # column, "Year" was drawn as a second line next to "TWh".
        chart_data = pd.DataFrame(
            {"TWh": values},
            index=pd.Index([str(year) for year in years], name="Year"),
        )
        st.line_chart(chart_data)
        st.caption("Illustrative placeholder values – the trained LSTM is not wired in yet.")
        st.success(f"Germany will consume ~{values[-1]} TWh by {years[-1]}")

Appending to energyglobal/dashboard/app.py


In [None]:
import shutil
from google.colab import files

# Re-create energyglobal.zip from the current contents of the energyglobal/
# folder, then offer the archive as a browser download.
shutil.make_archive(
    base_name="energyglobal",
    format='zip',
    root_dir="energyglobal",
)
files.download("energyglobal.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>