In [1]:
# Dataset source
# https://archive.ics.uci.edu/ml/datasets/Combined+Cycle+Power+Plant

In [2]:
# Problem statement: predict the net hourly electrical energy output (PE) of the plant from Temperature (AT), Exhaust Vacuum (V), Ambient Pressure (AP) and Relative Humidity (RH).

In [3]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import numpy as np
import os

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

In [4]:
# Load the power-plant spreadsheet into a dataframe, then show its shape and first rows

import pandas as pd
# Raise the display limits so wide/long frames are not truncated in cell output
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)
ccpp_df = pd.read_excel('CCPP/Folds5x2_pp.xlsx')
print(ccpp_df.shape)
ccpp_df.head(5)

(9568, 5)


Unnamed: 0,AT,V,AP,RH,PE
0,14.96,41.76,1024.07,73.17,463.26
1,25.18,62.96,1020.04,59.08,444.37
2,5.11,39.4,1012.16,92.14,488.56
3,20.86,57.32,1010.24,76.64,446.48
4,10.82,37.5,1009.23,96.62,473.9


In [5]:
# Total number of missing (NaN) cells across every column of the dataframe

ccpp_df.isna().sum().sum()

0

In [6]:
# Print a concise summary of ccpp_df: index range, column names,
# non-null counts per column, dtypes and memory usage

ccpp_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9568 entries, 0 to 9567
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   AT      9568 non-null   float64
 1   V       9568 non-null   float64
 2   AP      9568 non-null   float64
 3   RH      9568 non-null   float64
 4   PE      9568 non-null   float64
dtypes: float64(5)
memory usage: 373.9 KB


In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column

ccpp_df.describe()

Unnamed: 0,AT,V,AP,RH,PE
count,9568.0,9568.0,9568.0,9568.0,9568.0
mean,19.651231,54.305804,1013.259078,73.308978,454.365009
std,7.452473,12.707893,5.938784,14.600269,17.066995
min,1.81,25.36,992.89,25.56,420.26
25%,13.51,41.74,1009.1,63.3275,439.75
50%,20.345,52.08,1012.94,74.975,451.55
75%,25.72,66.54,1017.26,84.83,468.43
max,37.11,81.56,1033.3,100.16,495.76


In [8]:
# Seed NumPy's global random number generator so the notebook's output is identical at every run.
# NOTE(review): the train_test_split calls in this notebook pass random_state=2 explicitly,
# so they do not appear to depend on this global seed - confirm whether it is still needed.

np.random.seed(2)

In [9]:
# Pairwise correlation of every column with the target PE, largest value first
# (nothing is plotted here; the sorted coefficients are displayed as a Series)

corr_matrix = ccpp_df.corr()
corr_matrix.loc[:, 'PE'].sort_values(ascending=False)

PE    1.000000
AP    0.518429
RH    0.389794
V    -0.869780
AT   -0.948128
Name: PE, dtype: float64

In [10]:
# Separate the predictors (every column except PE) from the target column PE,
# converting both to NumPy arrays

X = ccpp_df.drop(columns='PE').to_numpy()
y = ccpp_df['PE'].to_numpy()
print("X shape: ", X.shape, "y shape: ", y.shape)
print("Sample X values: ", X[:5], "\n", "Sample y values: ", y[:5])

X shape:  (9568, 4) y shape:  (9568,)
Sample X values:  [[  14.96   41.76 1024.07   73.17]
 [  25.18   62.96 1020.04   59.08]
 [   5.11   39.4  1012.16   92.14]
 [  20.86   57.32 1010.24   76.64]
 [  10.82   37.5  1009.23   96.62]] 
 Sample y values:  [463.26 444.37 488.56 446.48 473.9 ]


In [11]:
# Hold out 5% of the data as the test set, then hold out 5% of the remainder
# as the validation set; both splits use a fixed random_state for repeatability

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.05, random_state=2
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.05, random_state=2
)
print(
    " X_train shape: ", X_train.shape, "\n",
    "y_train shape: ", y_train.shape, "\n",
    "X_val shape: ", X_val.shape, "\n",
    "y_val shape: ", y_val.shape, "\n",
    "X_test shape: ", X_test.shape, "\n",
    "y_test shape: ", y_test.shape, "\n",
)

 X_train shape:  (8634, 4) 
 y_train shape:  (8634,) 
 X_val shape:  (455, 4) 
 y_val shape:  (455,) 
 X_test shape:  (479, 4) 
 y_test shape:  (479,) 



In [12]:
# Model 1
# scikit-learn LinearRegression with default parameters, fitted on the training set

from sklearn.linear_model import LinearRegression
lr_model_1 = LinearRegression().fit(X_train, y_train)
# score() reports the coefficient of determination (R^2) on each split
print("Train set score: ", lr_model_1.score(X=X_train, y=y_train))
print("Validation set score: ", lr_model_1.score(X=X_val, y=y_val))
print("Test set score: ", lr_model_1.score(X=X_test, y=y_test))

Train set score:  0.9280737562687583
Validation set score:  0.9404015383445053
Test set score:  0.9282067529660801


In [13]:
# Mean squared error of lr_model_1's predictions on the train, validation and test sets

from sklearn.metrics import mean_squared_error
print("Train set mse: ", mean_squared_error(y_true=y_train, y_pred=lr_model_1.predict(X_train)))
print("Validation set mse: ", mean_squared_error(y_true=y_val, y_pred=lr_model_1.predict(X_val)))
print("Test set mse: ", mean_squared_error(y_true=y_test, y_pred=lr_model_1.predict(X_test)))

Train set mse:  20.956017779730466
Validation set mse:  18.051001736831182
Test set mse:  19.971756679011065


In [14]:
# Mean absolute error of lr_model_1's predictions on the train, validation and test sets.
# The printed labels say "mae" so these values are not mistaken for the
# mean squared errors reported by the previous cell.

from sklearn.metrics import mean_absolute_error
print("Train set mae: ", mean_absolute_error(y_train, lr_model_1.predict(X_train)))
print("Validation set mae: ", mean_absolute_error(y_val, lr_model_1.predict(X_val)))
print("Test set mae: ", mean_absolute_error(y_test, lr_model_1.predict(X_test)))

Train set mse:  3.631047939737698
Validation set mse:  3.4686948028862408
Test set mse:  3.690398134822012


In [15]:
# Since the R^2 scores on the validation and test sets are close to 1, and the mean squared and mean absolute errors are low, no more complex models are developed.