In [1]:
import datetime as dt
import math

import numpy as np
import pandas as pd
import tensorflow as tf
from matplotlib import pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from tensorflow import keras

import helper
from helper import *

In [2]:
# Load the preprocessed frame and replace its index with a fresh 0..n-1 range.
df = load_and_preprocess().reset_index(drop=True)

In [3]:
# Add the engineered columns via the project helper.
# NOTE(review): the output shown below this cell has 'datetime' as the index name,
# so the helper appears to move 'datetime' from a column to the index - confirm.
df = feature_engineer(df)

  df['week'] = df['datetime'].dt.weekofyear


Unnamed: 0_level_0,Description,Humidity,Wind Direction,Temperature,Pressure,Wind Speed,clouds,rain,mist,snow,...,rolling_min_temp,rolling_min_pressure,rolling_min_wind_dir,rolling_min_wind_speed,rolling_min_humidity,rolling_max_temp,rolling_max_pressure,rolling_max_wind_dir,rolling_max_wind_speed,rolling_max_humidity
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-10-01 13:00:00,heavy clouds,93.0,1001.0,285.830000,230.0,4.0,1,0,0,0,...,,,,,,,,,,
2012-10-01 14:00:00,sky is clear,91.0,986.0,285.834650,230.0,4.0,0,0,0,0,...,285.83,230.0,986.0,4.0,91.0,285.834650,230.0,1001.0,4.0,93.0
2012-10-01 15:00:00,sky is clear,87.0,945.0,285.847790,231.0,4.0,0,0,0,0,...,285.83,230.0,945.0,4.0,87.0,285.847790,231.0,1001.0,4.0,93.0
2012-10-01 16:00:00,sky is clear,84.0,904.0,285.860929,233.0,4.0,0,0,0,0,...,285.83,230.0,904.0,4.0,84.0,285.860929,233.0,1001.0,4.0,93.0
2012-10-01 17:00:00,sky is clear,80.0,863.0,285.874069,234.0,3.0,0,0,0,0,...,285.83,230.0,863.0,3.0,80.0,285.874069,234.0,1001.0,4.0,93.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-11-29 20:00:00,moderate clouds,64.0,1021.0,275.130000,300.0,10.0,1,0,0,0,...,243.30,0.0,822.0,0.0,7.0,307.880000,360.0,1054.0,24.0,100.0
2017-11-29 21:00:00,moderate clouds,59.0,1023.0,274.130000,300.0,7.0,1,0,0,0,...,243.30,0.0,822.0,0.0,7.0,307.880000,360.0,1054.0,24.0,100.0
2017-11-29 22:00:00,light clouds,66.0,1024.0,273.480000,290.0,7.0,1,0,0,0,...,243.30,0.0,822.0,0.0,7.0,307.880000,360.0,1054.0,24.0,100.0
2017-11-29 23:00:00,light clouds,58.0,1026.0,272.480000,290.0,4.0,1,0,0,0,...,243.30,0.0,822.0,0.0,7.0,307.880000,360.0,1054.0,24.0,100.0


In [None]:
# Row position of the train/test boundary: the first 75% of rows are training data.
cutoff = int(len(df) * 0.75)

#### Baseline: Using Only the Previous 10 Hours of Temperature

In [None]:
# Preview the first rows to check the column layout before slicing.
df.head()

In [None]:
# Univariate series: temperature is modelled from its own history only.
# The column is selected by name rather than by position (previously iloc[:, 4:5]),
# because the position of 'Temperature' shifts depending on whether 'datetime' is a
# column or the index; this also matches how the multivariate section picks its target.
# The double brackets keep a 2-D (n_rows, 1) array, which StandardScaler requires.
train = df[['Temperature']].iloc[:cutoff].values
test = df[['Temperature']].iloc[cutoff:].values

In [None]:
# Fit the scaler on the training split only, then apply the same statistics to
# both splits so no information from the test rows enters the scaling.
sc = StandardScaler().fit(train)
train_std = sc.transform(train)
test_std = sc.transform(test)

In [None]:
def split(sequence, n_timestamp):
    """Cut a series into sliding windows and their next-step targets.

    Each sample is `sequence[i:i + n_timestamp]`, and its target is the element
    immediately after the window, `sequence[i + n_timestamp]`.

    Parameters
    ----------
    sequence : indexable sequence (e.g. a NumPy array)
        Observations in chronological order.
    n_timestamp : int
        Number of consecutive observations in each input window.

    Returns
    -------
    tuple of np.ndarray
        `(X, y)` holding the windows and the matching targets. Both are empty
        arrays when the sequence is too short to produce a single window.
    """
    total = len(sequence)
    # A window is only kept when the element right after it still exists.
    n_windows = max(0, min(total, total - n_timestamp))
    starts = range(n_windows)

    windows = [sequence[start:start + n_timestamp] for start in starts]
    targets = [sequence[start + n_timestamp] for start in starts]
    return np.array(windows), np.array(targets)

In [None]:
# Build 10-step windows (and their next-step targets) for both splits.
(X_train, y_train), (X_test, y_test) = (
    split(sequence=series, n_timestamp=10) for series in (train_std, test_std)
)

In [None]:
# Window array shapes: training first, test second.
print(X_train.shape, X_test.shape, sep='\n')

In [None]:
# Target array shapes: training first, test second.
print(y_train.shape, y_test.shape, sep='\n')

In [None]:
# Seed TensorFlow's global generator so the randomly initialised weights are
# repeatable from one run of the notebook to the next.
tf.random.set_seed(42)

# Two stacked LSTM layers followed by a single-unit regression head.
model = keras.models.Sequential([
    # return_sequences=True passes the full sequence on to the second LSTM layer.
    keras.layers.LSTM(units=50, return_sequences=True, input_shape=X_train.shape[1:]),
    keras.layers.LSTM(units=50),
    keras.layers.Dense(1),
])

In [None]:
# Train with the Adam optimizer against mean squared error.
model.compile(optimizer='adam', loss='mse')

In [None]:
# Three passes over the training windows; `history` keeps the per-epoch loss.
history = model.fit(x=X_train, y=y_train, epochs=3)

In [None]:
# Predict the (still standardized) next-step value for every test window.
y_pred = model.predict(x=X_test)

In [None]:
# Undo the standardization so actuals and predictions are on the original scale.
y_test_inv, y_pred_inv = (
    sc.inverse_transform(scaled) for scaled in (y_test, y_pred)
)

In [None]:
# Overlay predicted and actual temperature for the start of the test period.
fig, ax = plt.subplots(figsize=(20, 10))
ax.plot(y_test_inv, label='Actual')
ax.plot(y_pred_inv, label='Predicted')
# pyplot has no `ylab`/`xlab`; the previous calls raised AttributeError.
ax.set_ylabel('Temperature')
# Each sample is one hourly observation, so the x axis counts hours, not days.
ax.set_xlabel('Hour')
ax.set_xlim([0, 365])
ax.set_ylim([278, 305])
ax.legend()
plt.show()

In [None]:
# Report both scores on the original (un-standardized) scale.
for label, metric in (('MSE:', mean_squared_error), ('R-Squared:', r2_score)):
    print(label, metric(y_test_inv, y_pred_inv))

In [None]:
# Root mean squared error: the square root of the MSE printed above.
math.sqrt(mean_squared_error(y_test_inv, y_pred_inv))

#### Trying with multiple predictors

In [None]:
def split_multiple(sequence, n_timestamp, target):
    """Cut a feature series into sliding windows paired with next-step targets.

    Each sample is `sequence[i:i + n_timestamp]`; its label is read from the
    separate `target` series at the position right after the window,
    `target[i + n_timestamp]`.

    Parameters
    ----------
    sequence : indexable sequence (e.g. a 2-D NumPy array of features)
        Observations in chronological order.
    n_timestamp : int
        Number of consecutive observations in each input window.
    target : indexable sequence
        Values to predict, indexed the same way as `sequence`.

    Returns
    -------
    tuple of np.ndarray
        `(X, y)` holding the windows and the matching targets.
    """
    windows, labels = [], []
    # `end` is the position of the label; stop once it would run past the data.
    stop = min(len(sequence), len(sequence) + n_timestamp)
    for end in range(n_timestamp, stop):
        begin = end - n_timestamp
        windows.append(sequence[begin:end])
        labels.append(target[end])
    return np.array(windows), np.array(labels)

In [None]:
# Chronological split: rows before `cutoff` train the model, the rest are held out.
train = df.iloc[:cutoff].reset_index(drop=True)
target_train = train[['Temperature']].values
# errors='ignore': 'datetime' is absent here when it was the index, because
# reset_index(drop=True) discards it; without this the drop raises KeyError.
train = train.drop(columns=['datetime', 'Description'], errors='ignore').values

# The test set must come from the rows AFTER the cutoff. It was previously sliced
# with df[:cutoff], which re-used the training rows and made the scores meaningless.
test = df.iloc[cutoff:].reset_index(drop=True)
target_test = test[['Temperature']].values
test = test.drop(columns=['datetime', 'Description'], errors='ignore').values

In [None]:
# Separate scalers: `sc` for the feature matrix, `sc2` for the temperature target,
# so predictions can be inverse-transformed on their own.
# Both are fitted on the training split only.
sc = StandardScaler().fit(train)
sc2 = StandardScaler().fit(target_train)

train_std, test_std = sc.transform(train), sc.transform(test)
target_train_std = sc2.transform(target_train)
target_test_std = sc2.transform(target_test)

In [None]:
# 10-step feature windows, each labelled with the temperature that follows it.
X_train, y_train = split_multiple(
    sequence=train_std, n_timestamp=10, target=target_train_std
)

In [None]:
# Shape of the training windows built by split_multiple.
X_train.shape

In [None]:
# Shape of the training targets built by split_multiple.
y_train.shape

In [None]:
# Seed TensorFlow's global generator so the randomly initialised weights are
# repeatable from one run of the notebook to the next.
tf.random.set_seed(42)

# Same architecture as the univariate model; only the input width differs,
# and it is taken from the shape of the multivariate windows.
model = keras.models.Sequential([
    # return_sequences=True passes the full sequence on to the second LSTM layer.
    keras.layers.LSTM(units=50, return_sequences=True, input_shape=X_train.shape[1:]),
    keras.layers.LSTM(units=50),
    keras.layers.Dense(1),
])

In [None]:
# Train with the Adam optimizer against mean squared error.
model.compile(optimizer='adam', loss='mse')

In [None]:
# Three passes over the training windows; `history` keeps the per-epoch loss.
history = model.fit(x=X_train, y=y_train, epochs=3)

In [None]:
# Window the held-out features the same way as the training data.
X_test, y_test = split_multiple(
    sequence=test_std, n_timestamp=10, target=target_test_std
)

In [None]:
# Predict on the test windows, then use the target scaler (sc2) to bring both
# the predictions and the true values back to the original scale.
y_pred = model.predict(x=X_test)

y_test_inv, y_pred_inv = (
    sc2.inverse_transform(scaled) for scaled in (y_test, y_pred)
)

In [None]:
# Overlay predicted and actual temperature for the start of the test period.
fig, ax = plt.subplots(figsize=(20, 10))
ax.plot(y_test_inv, label='Actual')
ax.plot(y_pred_inv, label='Predicted')
# pyplot has no `ylab`/`xlab`; the previous calls raised AttributeError.
ax.set_ylabel('Temperature')
# Each sample is one hourly observation, so the x axis counts hours, not days.
ax.set_xlabel('Hour')
ax.set_xlim([0, 365])
ax.set_ylim([270, 295])
ax.legend()
plt.show()

In [None]:
# mean_squared_error and r2_score are already imported in the first cell,
# so the duplicate import that used to sit here was removed.
# Both scores are computed on the original (un-standardized) scale.
print('MSE:', mean_squared_error(y_test_inv, y_pred_inv))
print('R-Squared:', r2_score(y_test_inv, y_pred_inv))

In [None]:
# Root mean squared error: the square root of the MSE printed above.
math.sqrt(mean_squared_error(y_test_inv, y_pred_inv))

### Incorporating Engineered Features

In [None]:
# ---- Calendar features ------------------------------------------------------
# NOTE(review): the feature_engineer() output shown earlier has 'datetime' as the
# index; restore it as a column so this cell also runs after that helper.
if 'datetime' not in df.columns and df.index.name == 'datetime':
    df = df.reset_index()

df['datetime'] = pd.to_datetime(df['datetime'])
df['year'] = df['datetime'].dt.year
df['month'] = df['datetime'].dt.month
df['day'] = df['datetime'].dt.day
df['hour'] = df['datetime'].dt.hour
df['minute'] = df['datetime'].dt.minute
df['weekday'] = df['datetime'].dt.weekday
# Series.dt.weekofyear is deprecated; isocalendar().week is its replacement.
# Cast from the nullable UInt32 it returns to a plain integer column
# (assumes there are no missing timestamps - TODO confirm).
df['week'] = df['datetime'].dt.isocalendar().week.astype('int64')
df['quarter'] = df['datetime'].dt.quarter
df['month start'] = df['datetime'].dt.is_month_start
df['month end'] = df['datetime'].dt.is_month_end
df['quarter start'] = df['datetime'].dt.is_quarter_start
df['quarter end'] = df['datetime'].dt.is_quarter_end

df = df.set_index('datetime')

# ---- Previous-period aggregates ---------------------------------------------
# Feature label -> source column, in the order the columns are created.
period_sources = {
    'temp': 'Temperature',
    'hum': 'Humidity',
    'wind speed': 'Wind Speed',
    'wind direction': 'Wind Direction',
    'pressure': 'Pressure',
}
# Period label -> (resample rule, number of rows to shift by).
# The shift moves each period's aggregate forward so a row sees an earlier
# period's value; 24 / 168 rows equal one day / week only for gap-free hourly
# data (assumed - TODO confirm).
periods = {'daily': ('D', 24), 'weekly': ('W', 168)}

# Fixes two copy-paste defects in the unrolled version of this block:
#   * every 'min daily <x>' column was overwritten with shifted 'min daily temp';
#   * the 'mean daily <x>' columns were computed with 'max' instead of 'mean'.
for stat in ('max', 'min', 'mean'):
    for period, (rule, lag) in periods.items():
        for label, column in period_sources.items():
            df[f'{stat} {period} {label}'] = (
                df.resample(rule)[column].transform(stat).shift(lag)
            )

# ---- Rolling (24-row) and expanding statistics ------------------------------
window_sources = {
    'temp': 'Temperature',
    'pressure': 'Pressure',
    'wind_dir': 'Wind Direction',
    'wind_speed': 'Wind Speed',
    'humidity': 'Humidity',
}

for stat in ('mean', 'min', 'max'):
    for label, column in window_sources.items():
        df[f'rolling_{stat}_{label}'] = df[column].rolling(window=24).agg(stat)

# The expanding statistics used to be stored under the same 'rolling_*' names,
# silently replacing the 24-row rolling values above. They now get their own
# 'expanding_*' columns so both feature families survive.
for stat in ('mean', 'min', 'max'):
    for label, column in window_sources.items():
        df[f'expanding_{stat}_{label}'] = df[column].expanding(2).agg(stat)

In [None]:
# Preview the first rows after the engineered columns were added.
df.head()

##### Recursive Feature Elimination (RFE)

In [None]:
# Load a second, fresh copy of the preprocessed data with a 0..n-1 index.
df2 = load_and_preprocess().reset_index(drop=True)

In [None]:
# Apply the project helper's feature engineering to the fresh copy.
df2 = feature_engineer(df2)

In [None]:
# Discard rows with any missing value, then split into input and output:
# `y` is the temperature, `X` is every remaining column except the text description.
df = df.dropna()
y = df['Temperature']
X = df.drop(columns=['Temperature', 'Description'])

