# CatBoost model

In [9]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error


# --- Data preparation --------------------------------------------------------
# Read the raw consumption records and parse the timestamp column.
df = pd.read_csv('electricity_consumption_data2.csv')
df['datetime'] = pd.to_datetime(df['datetime'])

# Tag every record with its calendar month, total the consumption per
# (month, guri_num) pair, and expose year / month as plain columns.
df['year_month'] = df['datetime'].dt.to_period('M')
monthly_df = (
    df.groupby(['year_month', 'guri_num'])['total_KW']
    .sum()
    .reset_index()
    .assign(
        year=lambda frame: frame['year_month'].dt.year,
        month=lambda frame: frame['year_month'].dt.month,
    )
)

# --- Features / target -------------------------------------------------------
target = 'total_KW'
features = ['guri_num', 'month', 'year']
categorical_features = ['guri_num']  # handled natively by CatBoost

y = monthly_df[target]
X = monthly_df[features]

# Hold out 20% of the monthly rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# --- CatBoost pools ----------------------------------------------------------
train_pool = Pool(
    data=X_train,
    label=y_train,
    cat_features=categorical_features,
)
test_pool = Pool(
    data=X_test,
    label=y_test,
    cat_features=categorical_features,
)

# --- Training ----------------------------------------------------------------
model = CatBoostRegressor(
    iterations=1000,
    depth=6,
    learning_rate=0.1,
    loss_function='RMSE',
    random_seed=42,
)
model.fit(train_pool, verbose=100)

# --- Evaluation on the held-out rows -----------------------------------------
y_pred = model.predict(test_pool)

mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"Root Mean Squared Error: {rmse}")
print(f"R² Score: {r2}")
print(f"Mean Absolute Error: {mae}")

0:	learn: 39.6420363	total: 72.2ms	remaining: 1m 12s
100:	learn: 16.2305317	total: 6.05s	remaining: 53.9s
200:	learn: 15.8459366	total: 11.3s	remaining: 45s
300:	learn: 15.6447248	total: 17s	remaining: 39.5s
400:	learn: 15.4985489	total: 22.6s	remaining: 33.7s
500:	learn: 15.3671827	total: 28.3s	remaining: 28.2s
600:	learn: 15.2719006	total: 34.3s	remaining: 22.8s
700:	learn: 15.1884664	total: 40.2s	remaining: 17.1s
800:	learn: 15.0988905	total: 45.2s	remaining: 11.2s
900:	learn: 15.0267898	total: 50.5s	remaining: 5.55s
999:	learn: 14.9520342	total: 55.4s	remaining: 0us
Root Mean Squared Error: 14.384885005151625
R² Score: 0.8866880635798848
Mean Absolute Error: 11.147805408634705


In [2]:
# Re-score the hold-out predictions.
# Relies on y_test / y_pred (and the sklearn metric functions) already being
# present in the kernel from the training cell.
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(
    f"Root Mean Squared Error: {rmse}",
    f"R² Score: {r2}",
    f"Mean Absolute Error: {mae}",
    sep="\n",
)

Root Mean Squared Error: 14.499966672337042
R² Score: 0.8087025895453765
Mean Absolute Error: 11.30461049255248


In [16]:


# Prepare for future predictions
# Scores every known guri_num for every month-end date in the forecast window
# with the CatBoost `model`. Requires `monthly_df`, `categorical_features`,
# `Pool` and `model` from the training cell to be present in the kernel.
# NOTE(review): the window starts at 2018-01, so it may overlap months that are
# already part of the training data — confirm that this is intended.

# Extract unique house numbers (guri_num) from your dataset
valid_guri_nums = monthly_df['guri_num'].unique()

# Generate the month-end dates for prediction.
# An explicit MonthEnd offset is used instead of the 'M' string alias: the
# alias is deprecated for date_range in recent pandas releases (renamed 'ME'),
# while the offset object produces the same dates on old and new versions.
future_dates = pd.date_range(
    start='2018-01-01',
    end='2026-12-31',
    freq=pd.offsets.MonthEnd(),
)

# Create a DataFrame for future predictions: one row per (guri_num, date).
# np.repeat keeps each guri_num contiguous while np.tile cycles through the
# full date sequence, so the two columns line up as a cartesian product.
future_df = pd.DataFrame({
    'year_month': np.tile(future_dates, len(valid_guri_nums)),
    'guri_num': np.repeat(valid_guri_nums, len(future_dates))
})

# Extract month and year for future dates
future_df['year'] = future_df['year_month'].dt.year
future_df['month'] = future_df['year_month'].dt.month

# Select the features for future prediction (same columns, same order as the
# feature list used for training)
X_future = future_df[['guri_num', 'month', 'year']]

# Create the Pool object for future data
future_pool = Pool(data=X_future, cat_features=categorical_features)

# Predict future electricity consumption using the trained CatBoost model
future_df['predicted_total_KW'] = model.predict(future_pool)

# Display the future predictions
print(future_df.head(12))

# Save the future predictions to CSV
# future_df.to_csv('future_predictions.csv', index=False)

   year_month  guri_num  year  month  predicted_total_KW
0  2018-01-31  BOO13096  2018      1           97.108518
1  2018-02-28  BOO13096  2018      2          138.580204
2  2018-03-31  BOO13096  2018      3          168.485307
3  2018-04-30  BOO13096  2018      4          159.302909
4  2018-05-31  BOO13096  2018      5          156.454073
5  2018-06-30  BOO13096  2018      6          100.847673
6  2018-07-31  BOO13096  2018      7           88.779460
7  2018-08-31  BOO13096  2018      8           75.012261
8  2018-09-30  BOO13096  2018      9           64.989473
9  2018-10-31  BOO13096  2018     10           67.650657
10 2018-11-30  BOO13096  2018     11           72.752346
11 2018-12-31  BOO13096  2018     12           86.733387


In [17]:
# Show the last 42 rows of the prediction frame (end of the final guri_num block).
print(future_df.iloc[-42:])

      year_month  guri_num  year  month  predicted_total_KW
53958 2023-07-31  YAA97684  2023      7           87.322408
53959 2023-08-31  YAA97684  2023      8           75.757715
53960 2023-09-30  YAA97684  2023      9           65.796797
53961 2023-10-31  YAA97684  2023     10           67.617787
53962 2023-11-30  YAA97684  2023     11           72.737616
53963 2023-12-31  YAA97684  2023     12           87.058458
53964 2024-01-31  YAA97684  2024      1          103.588907
53965 2024-02-29  YAA97684  2024      2          145.011609
53966 2024-03-31  YAA97684  2024      3          171.665325
53967 2024-04-30  YAA97684  2024      4          160.643493
53968 2024-05-31  YAA97684  2024      5          157.788812
53969 2024-06-30  YAA97684  2024      6           98.982543
53970 2024-07-31  YAA97684  2024      7           90.131328
53971 2024-08-31  YAA97684  2024      8           78.436217
53972 2024-09-30  YAA97684  2024      9           68.666085
53973 2024-10-31  YAA97684  2024     10 

In [5]:
import joblib

# Serialise the trained CatBoost model into the working directory.
# The call stays the last expression so the cell still echoes the list of
# written file names.
joblib.dump(value=model, filename='catboost_model.pkl')

['catboost_model.pkl']

# XGBoost model 

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
# Load the dataset
df = pd.read_csv('electricity_consumption_data.csv')
# Convert 'datetime' to datetime data type
df['datetime'] = pd.to_datetime(df['datetime'])

# Aggregate daily data to monthly data
df['year_month'] = df['datetime'].dt.to_period('M')
monthly_df = df.groupby(['year_month', 'guri_num', 'deg_num'])['total_KW'].sum().reset_index()

# Extract month and year from the year_month column
monthly_df['year'] = monthly_df['year_month'].dt.year
monthly_df['month'] = monthly_df['year_month'].dt.month

# Select the features and target variable
features = ['guri_num', 'deg_num', 'month', 'year']
target = 'total_KW'

# XGBoost refuses object-dtype (string) columns. Identifier columns that are
# not numeric are cast to pandas 'category' so they can be used through
# XGBoost's native categorical support; numeric identifiers are left untouched.
# NOTE(review): the CatBoost cell's printed guri_num values look like string
# IDs — presumably the same holds for this CSV; confirm the dtypes of
# guri_num / deg_num.
xgb_categorical_cols = [
    name for name in ('guri_num', 'deg_num')
    if not pd.api.types.is_numeric_dtype(monthly_df[name])
]
monthly_df = monthly_df.astype({name: 'category' for name in xgb_categorical_cols})

X = monthly_df[features]
y = monthly_df[target]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the XGBoost model
xgb_params = {'objective': 'reg:squarederror'}
if xgb_categorical_cols:
    # Categorical columns need enable_categorical plus a histogram-based tree
    # method; these are only set when actually required so that the purely
    # numeric case keeps the original default configuration.
    xgb_params.update(enable_categorical=True, tree_method='hist')
xgb_model = xgb.XGBRegressor(**xgb_params)
xgb_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = xgb_model.predict(X_test)

# Evaluate the model on the held-out rows
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print(f"Root Mean Squared Error: {rmse}")
print(f"R² Score: {r2}")
print(f"Mean Absolute Error: {mae}")

# Generate the month-end dates for prediction.
# An explicit MonthEnd offset replaces the 'M' string alias, which is
# deprecated for date_range in recent pandas releases (renamed 'ME'); the
# offset object yields the same dates on old and new versions.
future_dates = pd.date_range(
    start='2023-01-01',
    end='2028-12-31',
    freq=pd.offsets.MonthEnd(),
)

# Extract unique guri_num and deg_num combinations from the original dataset
unique_combinations = df[['guri_num', 'deg_num']].drop_duplicates()

# Create a DataFrame for future predictions: one row per (combination, date).
# np.repeat keeps each combination contiguous while np.tile cycles the dates.
future_df = pd.DataFrame({
    'year_month': np.tile(future_dates, len(unique_combinations)),
    'guri_num': np.repeat(unique_combinations['guri_num'].values, len(future_dates)),
    'deg_num': np.repeat(unique_combinations['deg_num'].values, len(future_dates))
})

# Re-encode categorical columns with the exact categories seen during training
# so that the category codes XGBoost receives match between fit and predict.
future_df = future_df.astype(
    {name: monthly_df[name].dtype for name in xgb_categorical_cols}
)

# Extract month and year for future dates
future_df['year'] = future_df['year_month'].dt.year
future_df['month'] = future_df['year_month'].dt.month

# Select the features for future prediction
X_future = future_df[['guri_num', 'deg_num', 'month', 'year']]

# Predict future electricity consumption
future_df['predicted_total_KW'] = xgb_model.predict(X_future)


# Output the future predictions
print(future_df.head(12))