In [2]:
# Report the version of the interpreter this kernel is actually running on.
# (`!python --version` asks whichever `python` is first on PATH, which can be
# a different installation than the one executing the notebook.)
import platform

print("Python", platform.python_version())

Python 3.10.11


<h3>Phase 1: Imports, the usual ones in neural network applications</h3>

In [3]:
!pip install pandas
!pip install numpy
!pip install seaborn
!pip install scikit-learn
!pip install tensorflow




[notice] A new release of pip is available: 23.0.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 23.0.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 23.0.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 23.0.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 23.0.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
# pip install scikit-learn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import metrics

# pip install tensorflow
import tensorflow as tf
import keras
from keras import layers




<b>Load the data in pandas, and inspect its features</b>

In [5]:
# path of the dataset, relative to the notebook's working directory
data_file = "Household energy bill data.csv"

# read the whole CSV into a DataFrame
df = pd.read_csv(data_file)

In [6]:
# summary statistics for every numeric column;
# scan the min/max rows for values that cannot be real (e.g. negative counts)
df.describe()

Unnamed: 0,num_rooms,num_people,housearea,is_ac,is_tv,is_flat,ave_monthly_income,num_children,is_urban,amount_paid
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,1.962,4.897,794.70342,0.376,0.798,0.477,24684.98655,1.078,0.608,600.396359
std,1.030348,2.007591,147.771736,0.484622,0.401693,0.499721,9678.228224,0.934232,0.488441,181.40621
min,-1.0,-1.0,244.4,0.0,0.0,0.0,-1576.44,0.0,0.0,87.85185
25%,1.0,4.0,691.0375,0.0,1.0,0.0,18036.7625,0.0,0.0,475.065141
50%,2.0,5.0,789.97,0.0,1.0,0.0,24742.575,1.0,1.0,598.331726
75%,3.0,6.0,892.955,1.0,1.0,1.0,31402.3575,2.0,1.0,729.930489
max,5.0,11.0,1189.12,1.0,1.0,1.0,56531.08,4.0,1.0,1102.994109


In [7]:
# preview the first rows to see the raw values and column layout
df.head()

Unnamed: 0,num_rooms,num_people,housearea,is_ac,is_tv,is_flat,ave_monthly_income,num_children,is_urban,amount_paid
0,3,3,742.57,1,1,1,9675.93,2,0,560.481447
1,1,5,952.99,0,1,0,35064.79,1,1,633.283679
2,3,1,761.44,1,1,1,22292.44,0,0,511.879157
3,0,5,861.32,1,1,0,12139.08,0,0,332.992035
4,1,8,731.61,0,1,0,17230.1,2,1,658.285625


<b>Always check first: do we have duplicates and do we have missing values?</b>

In [None]:
# count rows that are exact duplicates of an earlier row
# (the original author observed 0 for this dataset)
df.duplicated().sum()

In [None]:
# count missing values per column, so we can see which columns are affected
# (the original author observed no missing values for this dataset)
df.isna().sum()

In [None]:
# the target looks roughly normally distributed (bell-shaped);
# note it is not a *standard* normal distribution: describe() reports
# a mean of about 600 and a standard deviation of about 181, not 0 and 1
df['amount_paid'].hist()

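<h3>Typically the data processing phase is a lot longer; in this case there is not much to fix.</h3>
<b>The data is mostly good to go. Note, however, that the describe() output above shows negative minimums for num_rooms, num_people and ave_monthly_income; such values are not physically meaningful and should be reviewed before relying on the model.</b>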

In [None]:
# list the column names (handy for copy-pasting into the X/y division later)
df.columns

In [None]:
# look at the first rows again before choosing the feature and target columns
df.head()

In [None]:
# independent variables: every column the model is allowed to see
# (the target must NOT appear in this list)
feature_columns = [
    'num_rooms',
    'num_people',
    'housearea',
    'is_ac',
    'is_tv',
    'is_flat',
    'ave_monthly_income',
    'num_children',
    'is_urban',
]

# dependent variable: the single column we want to predict
target_column = 'amount_paid'

X = df[feature_columns]
y = df[target_column]

In [None]:
# Two-stage split producing the six arrays the network needs:
#   stage 1: 70% training / 30% held out
#   stage 2: the held-out 30% is halved -> 15% validation / 15% test
# Result: 70% train + 15% validation + 15% test = 100%
split_seed = 101

X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=split_seed
)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=split_seed
)

In [None]:
# summary statistics once more, for reference; df has not been modified
# since it was loaded, so the numbers match the first describe() call
df.describe()

In [None]:
# column names once more, for reference before building the network
df.columns

<b>Build our neural network</b>

In [None]:
# number of input features (9 in this case), derived from the feature frame
training_variable_amount = len(X.columns)

# seed Python, NumPy and the backend so that weight initialisation and batch
# shuffling are repeatable; without this every run trains a different model
keras.utils.set_random_seed(101)

# create the structure of our neural network:
# an explicit Input declares the feature count (passing input_shape= to the
# first Dense layer is deprecated in current Keras), followed by two hidden
# ReLU layers and a single linear output neuron for the regression target
# NOTE(review): the features are fed in unscaled although their ranges differ
# by orders of magnitude (0/1 flags vs. income in the tens of thousands);
# consider normalising the inputs if training is slow or unstable
model = keras.Sequential(
    [
        keras.Input(shape=(training_variable_amount,)),
        layers.Dense(12, activation="relu"),
        layers.Dense(8, activation="relu"),
        layers.Dense(1)
    ]
)

# compile the neural network, use adam (most common one) as the optimizer
# and mean squared error (mse) as the loss function (most common for regression problems)
model.compile(optimizer='adam', loss='mse')

# show a quick recap of our structure
model.summary()

# start training; the validation set is evaluated after every epoch.
# Keeping the returned History object in a variable avoids echoing its repr
# as the cell output (model.history is still populated by fit as before).
history = model.fit(x=X_train, y=y_train, epochs=1500, validation_data=(X_val, y_val))

<b>Training metrics, comparing losses</b>

In [None]:
# plot training loss against validation loss per epoch to check for overfitting:
# a validation curve that pulls away above the training curve is the warning sign
training_log = model.history.history
loss_df = pd.DataFrame(training_log)
loss_df.plot()

In [None]:
# compute the loss (MSE) on the unseen test data and on the training data;
# the two numbers should be relatively close to each other
# (in the original author's run they were roughly 4600 vs. 4450, which is quite close)
test_loss = model.evaluate(X_test, y_test, verbose=0)
train_loss = model.evaluate(X_train, y_train, verbose=0)

print("Test data evaluation:")
print(test_loss)
print("\nTrain data evaluation:")
print(train_loss)

In [None]:
# predictions for the metrics below must come from the TEST DATA SET
raw_predictions = model.predict(X_test)

# flatten the (n, 1) model output into a 1-D Series with a fresh 0..n-1 index
test_predictions = pd.Series(raw_predictions.reshape(len(y_test),))

# side-by-side table: true target values vs. what the model predicted
# (y_test is converted to a plain array so both columns share the 0..n-1 index)
pred_df = pd.DataFrame(
    {
        'Test True Y': np.asarray(y_test),
        'Model Predictions': test_predictions,
    }
)

# display the comparison table - it shows how far off the model is in some cases
pred_df

In [None]:
# plot the true test values against the model's predictions;
# the closer the points lie to the diagonal (y = x), the better the predictions
# match the real values
sns.scatterplot(x='Test True Y', y='Model Predictions', data=pred_df)

In [None]:
# compute every metric once, then report them

# MAE - mean absolute error: average size of the error, in the target's unit
mae = metrics.mean_absolute_error(y_test, test_predictions)

# MSE - mean squared error: penalises large errors more, unit is squared
mse = metrics.mean_squared_error(y_test, test_predictions)

# RMSE - root mean squared error: MSE brought back to the target's unit
rmse = np.sqrt(mse)

# R-squared: 0 = the model describes the dataset poorly,
# 1 = the model describes the dataset perfectly
r_squared = metrics.r2_score(y_test, test_predictions)

# Explained variance score: same 0..1 reading as R-squared;
# the higher the score, the more of the variation in the data the model explains.
# A low score suggests we might need more and better data.
explained_variance = metrics.explained_variance_score(y_test, test_predictions)

print("MAE")
print(round(mae, 2), "$")

print("\nMSE")
print(round(mse, 2), "$^2")

print('\nRMSE:')
print(round(rmse, 2), "$")

print('\nR-squared:')
print(round(r_squared, 2))

print("\nExplained variance score:")
print(round(explained_variance, 2))

In [None]:
# residual histogram: do the prediction errors follow a normal distribution?
#
# The subtraction is done on plain NumPy arrays on purpose. y_test still carries
# the original row labels from df (shuffled by train_test_split), whereas
# test_predictions has a fresh 0..n-1 index. Subtracting the two Series directly
# would align them on those labels, pairing unrelated rows and yielding NaN for
# every label that exists on only one side.
residuals = np.asarray(y_test) - np.asarray(test_predictions).reshape(-1)

# histplot with a KDE overlay replaces the deprecated sns.distplot;
# stat="density" keeps the y-axis on the same scale distplot used
sns.histplot(residuals, kde=True, stat="density")
plt.show()
plt.close()

In [None]:
# column names, for reference when writing a new sample row by hand
df.columns

In [None]:
# show three real rows as a reference for plausible feature values
df.head(3)

In [None]:
# One made-up household to try the model on. It contains every feature column
# but no price, because the price is what the model will predict.
# Adjust the keys/values to match your own dataset.
sample_household = dict(
    num_rooms=4,
    num_people=3,
    housearea=829,
    is_ac=1,
    is_tv=1,
    is_flat=0,
    ave_monthly_income=18500,
    num_children=1,
    is_urban=0,
)

# wrap the single record in a one-row DataFrame, the format the model is fed with
tester_row = pd.DataFrame([sample_household])

In [None]:
# finally get the prediction from our model, based on the imaginary values above;
# predict returns one row per input row, so [0] is the output row of our single household
result = model.predict(tester_row)[0]

# that row holds exactly one value (the network has a single output neuron).
# Pick it out explicitly: calling float() on a whole array, even a one-element
# one, is deprecated in recent NumPy versions.
predicted_bill = float(result[0])

print()
print("Estimated electricity bill for this household $:")
print(f"{round(predicted_bill, 2)}")
print("----------------")