In [2]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

# Loading dataset

In [3]:
# Load the hourly records into a DataFrame (relative path: resolved against the working directory).
data = pd.read_csv('global_heat_index.csv')

In [4]:
# Preview the first three rows to check that the columns parsed as expected.
data.head(3)

Unnamed: 0,Year,Month,Day,Hour,Dew Point,Temperature,Pressure,Relative Humidity,Wind Direction,Wind Speed,Solar Radiation (GHI)
0,2011,1,1,0,8,13.522659,986.761841,72.295858,37.288387,3.011042,0
1,2011,1,1,1,8,12.835814,986.441406,75.376186,37.686718,3.091243,0
2,2011,1,1,2,8,12.198058,985.736511,78.405198,35.053905,3.007649,0


In [5]:
# Column dtypes and non-null counts for every column.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17520 entries, 0 to 17519
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Year                   17520 non-null  int64  
 1   Month                  17520 non-null  int64  
 2   Day                    17520 non-null  int64  
 3   Hour                   17520 non-null  int64  
 4   Dew Point              17520 non-null  int64  
 5   Temperature            17520 non-null  float64
 6   Pressure               17520 non-null  float64
 7   Relative Humidity      17520 non-null  float64
 8   Wind Direction         17520 non-null  float64
 9   Wind Speed             17520 non-null  float64
 10  Solar Radiation (GHI)  17520 non-null  int64  
dtypes: float64(5), int64(6)
memory usage: 1.5 MB


In [6]:
# (rows, columns) of the full, unfiltered dataset.
data.shape

(17520, 11)

# Cleaning Data

In [7]:
data.isna().sum()  # number of missing values in each column

Year                     0
Month                    0
Day                      0
Hour                     0
Dew Point                0
Temperature              0
Pressure                 0
Relative Humidity        0
Wind Direction           0
Wind Speed               0
Solar Radiation (GHI)    0
dtype: int64

In [8]:
# Number of distinct values taken by the Solar Radiation (GHI) column.
data['Solar Radiation (GHI)'].nunique()

992

For the purpose of predicting Solar Radiation, there is no point in studying rows where Hour < 7 (before 7 am) or Hour > 19 (after 7 pm), because there is no solar radiation between sunset and sunrise (as exploration of the data confirms).

In [9]:
# Keep only the rows recorded at hour 7 or later.
from_hour_7 = data['Hour'].ge(7)
data = data.loc[from_hour_7]

In [10]:
# Row count after dropping the hours before 7.
data.shape

(12410, 11)

In [11]:
# Keep only the rows recorded at hour 19 or earlier.
up_to_hour_19 = data['Hour'].le(19)
data = data.loc[up_to_hour_19]

In [12]:
# Row count after also dropping the hours after 19.
data.shape

(9490, 11)

In [13]:
# Summary statistics of the rows that survived the Hour filters above.
data.describe()

Unnamed: 0,Year,Month,Day,Hour,Dew Point,Temperature,Pressure,Relative Humidity,Wind Direction,Wind Speed,Solar Radiation (GHI)
count,9490.0,9490.0,9490.0,9490.0,9490.0,9490.0,9490.0,9490.0,9490.0,9490.0,9490.0
mean,2011.5,6.526027,15.720548,13.0,11.135722,30.275155,979.764586,40.148897,193.659705,2.418787,433.066386
std,0.500026,3.448033,8.79671,3.741855,11.219832,7.907448,6.206627,25.004457,100.563939,1.235776,308.524214
min,2011.0,1.0,1.0,7.0,-28.0,6.457841,964.755859,0.921771,0.03466,0.030113,0.0
25%,2011.0,4.0,8.0,10.0,2.0,25.474942,974.348267,18.042185,95.632074,1.504627,148.0
50%,2011.5,7.0,16.0,13.0,11.0,30.3548,980.541962,38.279008,231.0215,2.268829,446.0
75%,2012.0,10.0,23.0,16.0,22.0,35.255237,984.994659,59.969883,261.233154,3.195885,693.0
max,2012.0,12.0,31.0,19.0,27.0,52.157927,993.35321,97.663827,359.821167,7.287084,1001.0


# Input and Output Data

In [14]:
# Select inputs and target by column name rather than by position, so the
# selection does not silently shift if the column order of the CSV changes.
# These are the same six columns the positional slice 4:10 picked, and the
# same target column as position 10.
feature_columns = [
    'Dew Point',
    'Temperature',
    'Pressure',
    'Relative Humidity',
    'Wind Direction',
    'Wind Speed',
]
target_column = 'Solar Radiation (GHI)'

x = data[feature_columns].to_numpy()  # 2-D array: one row per record, one column per feature
y = data[target_column].to_numpy()    # 1-D array: one target value per record

In [15]:
# y is 1-D at this point: one target value per row.
y.shape

(9490,)

In [16]:
# ndarray.reshape returns a new array and leaves `y` untouched, so the result
# must be assigned back. A column vector (n_samples, 1) is the shape that
# scikit-learn transformers such as StandardScaler accept.
y = y.reshape(-1, 1)
y

array([[  0],
       [159],
       [363],
       ...,
       [ 16],
       [  0],
       [  0]], dtype=int64)

In [17]:
from sklearn.model_selection import train_test_split

# train_test_split returns the splits grouped per input array, i.e.
# (x_train, x_test, y_train, y_test) -- NOT (x_train, y_train, x_test, y_test).
# The target is passed as a column vector because the scaler applied to
# y_train afterwards requires 2-D input; reshaping an array that is already
# (n, 1) is a no-op. random_state pins the shuffle so that re-running the
# notebook reproduces the same split.
x_train, x_test, y_train, y_test = train_test_split(
    x, y.reshape(-1, 1), test_size=0.3, random_state=42
)

In [18]:
from sklearn.preprocessing import StandardScaler

# Standardise x_train and y_train, each with its own scaler whose statistics
# are learned from that array only. StandardScaler expects 2-D input.
scaler_X = StandardScaler().fit(x_train)
x_train = scaler_X.transform(x_train)

scaler_y = StandardScaler().fit(y_train)
y_train = scaler_y.transform(y_train)

In [19]:
from sklearn.decomposition import PCA

In [20]:
# Keep only the first two principal components.
pca = PCA(n_components=2)

In [21]:
# Learn the principal components from the scaled training features.
pca.fit(x_train)

In [22]:
# Project the training features onto the two fitted components.
data_after_PCA = pca.transform(x_train)

In [23]:
# Display the projected training data (NumPy abbreviates the printout of large arrays).
data_after_PCA

array([[ 1.57756543,  1.0226408 ],
       [-0.30721093,  2.53748617],
       [ 0.70249661,  0.17783929],
       ...,
       [ 2.614763  , -1.31021334],
       [-1.57790344, -1.88368939],
       [-1.21172138,  1.51080932]])

In [24]:
data_after_PCA.shape # the 6 scaled input features have been reduced to their first 2 principal components

(6643, 2)

In [25]:
pd.DataFrame(data_after_PCA).head() # columns 0 and 1 hold each sample's coordinates along the first and second principal components, not original features

Unnamed: 0,0,1
0,1.577565,1.022641
1,-0.307211,2.537486
2,0.702497,0.177839
3,3.042173,-0.540205
4,1.841118,-0.358823
