## **Using a weather dataset for the city of Basel, Switzerland. The dataset is described as 10 years of hourly weather data (2006–2016); note that the file loaded below contains only 192 hourly rows starting 2020-10-29.**

## Importing necessary libraries

In [42]:
import warnings
warnings.filterwarnings('ignore')

In [43]:
import os
import numpy as np
import pandas as pd

In [44]:
from matplotlib import pyplot as plt
%matplotlib inline

In [45]:
import sklearn
from sklearn.model_selection  import train_test_split
from sklearn.metrics import accuracy_score



In [46]:
from sklearn.linear_model import LinearRegression

In [47]:
import tensorflow as tf
from tensorflow import keras

In [48]:
from  sklearn import preprocessing 

## Importing the dataframe

In [49]:
# Location of the Basel weather CSV. It can be overridden through the
# WEATHER_CSV environment variable so the notebook runs on other machines;
# the default is the original author's local path.
WEATHER_CSV = os.environ.get(
    "WEATHER_CSV",
    "/home/eric/Documents/github_projects/WeatherPrediction/Basel.csv",
)
weather_df = pd.read_csv(WEATHER_CSV)


In [50]:
# Preview the first rows of the loaded frame.
weather_df.head()

Unnamed: 0,timestamp,Basel Temperature [2 m elevation corrected],Basel Precipitation Total,Basel Wind Speed [10 m],Basel Wind Direction [10 m]
0,20201029T0000,11.315029,0.0,26.674032,238.24052
1,20201029T0100,10.845029,0.0,26.865265,237.5877
2,20201029T0200,10.855029,0.0,25.762016,236.97614
3,20201029T0300,11.015029,0.0,25.77962,234.09029
4,20201029T0400,10.70503,0.1,24.192429,233.47115


In [51]:
# List the column labels of the loaded frame.
weather_df.columns

Index(['timestamp', 'Basel Temperature [2 m elevation corrected]',
       'Basel Precipitation Total', 'Basel Wind Speed [10 m]',
       'Basel Wind Direction [10 m]'],
      dtype='object')

In [52]:
# (rows, columns) of the loaded frame.
weather_df.shape

(192, 5)

In [53]:
# Summary statistics (count, mean, std, min/max, quartiles) of the loaded frame.
weather_df.describe()

Unnamed: 0,Basel Temperature [2 m elevation corrected],Basel Precipitation Total,Basel Wind Speed [10 m],Basel Wind Direction [10 m]
count,192.0,192.0,192.0,192.0
mean,12.231696,0.040625,11.380468,180.892226
std,3.534925,0.142217,5.120395,84.807062
min,4.815029,0.0,3.545589,7.594635
25%,9.672529,0.0,7.901569,135.0
50%,11.930029,0.0,10.721701,196.017075
75%,14.080029,0.0,13.898518,235.561
max,21.745028,1.1,26.865265,360.0


In [54]:
# Per-column flag: True if the column contains at least one missing value.
weather_df.isna().any()

timestamp                                      False
Basel Temperature [2 m elevation corrected]    False
Basel Precipitation Total                      False
Basel Wind Speed [10 m]                        False
Basel Wind Direction [10 m]                    False
dtype: bool

In [55]:
# Keep only the columns whose dtype is not `object` (this excludes the string
# `timestamp` column and keeps the measurement columns).
weather_df_num = weather_df.loc[:, weather_df.dtypes != 'object']

In [56]:
# Split the numeric frame into the regression target (temperature) and the
# feature matrix. Selecting/dropping instead of `pop` leaves `weather_df_num`
# untouched, so this cell can be re-run without raising a KeyError.
TARGET_COLUMN = 'Basel Temperature [2 m elevation corrected]'
weather_y = weather_df_num[TARGET_COLUMN]
weather_x = weather_df_num.drop(columns=[TARGET_COLUMN])

In [57]:
# Hold out 20% of the rows for evaluation; the seed is fixed at 4.
train_x, test_x, train_y, test_y = train_test_split(
    weather_x,
    weather_y,
    test_size=0.2,
    random_state=4,
)

In [58]:
# Preview the first rows of the training features produced by the split.
train_x.head()

Unnamed: 0,Basel Precipitation Total,Basel Wind Speed [10 m],Basel Wind Direction [10 m]
26,0.0,15.913465,232.35239
80,0.0,9.779817,186.3402
190,0.0,7.771331,76.6075
113,0.0,10.883676,214.2157
105,0.0,11.159999,180.0


## **Implementation of Models**

## **Linear Regression**

In [59]:
# Ordinary least-squares baseline fitted on the training split.
model = LinearRegression()
model.fit(X=train_x, y=train_y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [60]:
# Predict temperatures for the held-out rows with the linear model.
prediction = model.predict(X=test_x)

In [61]:
# Mean squared error of the linear model on the held-out rows.
linear_regression_error = ((prediction - test_y) ** 2).mean()
linear_regression_error

7.5770207963850025

In [62]:
# Actual temperature, linear-model prediction, and residual (actual - predicted).
pd.DataFrame(
    {'actual': test_y, 'prediction': prediction}
).assign(diff=test_y - prediction)

Unnamed: 0,actual,prediction,diff
11,12.595029,11.648674,0.946355
142,10.235029,14.055137,-3.820108
34,12.315029,13.494959,-1.17993
169,7.905028,9.474662,-1.569633
1,10.845029,16.061806,-5.216777
69,11.375029,12.12425,-0.749221
152,5.845029,8.71586,-2.870831
188,9.595029,10.244431,-0.649402
48,9.915029,10.392667,-0.477638
178,8.715029,9.690444,-0.975415


## **Polynomial Regression**

In [63]:
from sklearn.preprocessing import PolynomialFeatures

In [64]:
# Expand the training features into polynomial terms up to degree 4.
poly = PolynomialFeatures(degree=4)
x_poly = poly.fit_transform(train_x)

# Ordinary least squares on the expanded features. The transformer is
# deliberately NOT re-fitted on `x_poly`: doing so would leave it expecting
# the expanded column count instead of the raw feature count.
lin2 = LinearRegression()
lin2.fit(x_poly, train_y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [65]:
# Expand the held-out features with a transformer fitted on the training
# features only: `transform`, not `fit_transform`, is applied to test data.
# `poly.fit(train_x)` is repeated here so this cell does not rely on whatever
# state an earlier cell left the transformer in.
x_poly_test = poly.fit(train_x).transform(test_x)
prediction2 = lin2.predict(x_poly_test)
# Mean squared error of the polynomial model on the held-out rows.
polynomical_regression_error = np.mean((prediction2-test_y)**2)
polynomical_regression_error

144.06419390642378

In [66]:
# Actual temperature, polynomial-model prediction, and residual (actual - predicted).
pd.DataFrame(
    {'actual': test_y, 'prediction': prediction2}
).assign(diff=test_y - prediction2)

Unnamed: 0,actual,prediction,diff
11,12.595029,85.676164,-73.081135
142,10.235029,12.666774,-2.431745
34,12.315029,14.582501,-2.267472
169,7.905028,8.031607,-0.126578
1,10.845029,9.246128,1.598901
69,11.375029,13.304405,-1.929376
152,5.845029,6.547222,-0.702193
188,9.595029,10.167893,-0.572864
48,9.915029,10.218702,-0.303673
178,8.715029,8.565643,0.149386


## **Decision Tree Regression — CART**

In [67]:
from sklearn.tree import DecisionTreeRegressor

# Decision tree regressor with a fixed seed, fitted on the training split.
regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(X=train_x, y=train_y)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=0, splitter='best')

In [68]:
# Predict the held-out rows with the decision tree and compute the mean squared error.
prediction3 = regressor.predict(X=test_x)
decisiontree_regression_error = ((prediction3 - test_y) ** 2).mean()
decisiontree_regression_error

5.847035267436357

In [69]:
# Actual temperature, decision-tree prediction, and residual (actual - predicted).
pd.DataFrame(
    {'actual': test_y, 'prediction': prediction3}
).assign(diff=test_y - prediction3)

Unnamed: 0,actual,prediction,diff
11,12.595029,10.445029,2.15
142,10.235029,16.115028,-5.879999
34,12.315029,13.155029,-0.84
169,7.905028,8.875029,-0.970001
1,10.845029,11.015029,-0.17
69,11.375029,11.745029,-0.37
152,5.845029,6.745029,-0.899999
188,9.595029,7.995029,1.6
48,9.915029,9.685029,0.23
178,8.715029,7.995029,0.72


## **Random Forest with maximum depth — 10**

In [70]:
from sklearn.ensemble import RandomForestRegressor

# Forest of 100 trees, each limited to depth 10, with a fixed seed.
regr = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    random_state=0,
)
regr.fit(X=train_x, y=train_y)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=10, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=0, verbose=0, warm_start=False)

In [71]:
# Predict the held-out rows with the depth-10 forest and compute the mean squared error.
prediction4 = regr.predict(X=test_x)
randomforest_dept10_error = ((prediction4 - test_y) ** 2).mean()
randomforest_dept10_error

5.396570018009168

In [72]:
# Actual temperature, depth-10 forest prediction, and residual (actual - predicted).
pd.DataFrame(
    {'actual': test_y, 'prediction': prediction4}
).assign(diff=test_y - prediction4)

Unnamed: 0,actual,prediction,diff
11,12.595029,11.684349,0.91068
142,10.235029,14.948647,-4.713618
34,12.315029,12.495907,-0.180878
169,7.905028,8.442886,-0.537857
1,10.845029,11.21741,-0.372381
69,11.375029,11.704106,-0.329077
152,5.845029,7.018029,-1.173
188,9.595029,8.802162,0.792867
48,9.915029,9.708129,0.2069
178,8.715029,8.862296,-0.147267


## **Random Forest with maximum depth - 50**

In [73]:
# Forest of 100 trees, each limited to depth 50, with a fixed seed.
# This rebinds `regr`, replacing the previously fitted forest.
regr = RandomForestRegressor(
    n_estimators=100,
    max_depth=50,
    random_state=0,
)
regr.fit(X=train_x, y=train_y)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=50, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=0, verbose=0, warm_start=False)

In [74]:
# Predict the held-out rows with the depth-50 forest and compute the mean squared error.
prediction5 = regr.predict(X=test_x)
randomforest_dept50_error = ((prediction5 - test_y) ** 2).mean()
randomforest_dept50_error

5.429224822158827

In [75]:
# Actual temperature, depth-50 forest prediction, and residual (actual - predicted).
pd.DataFrame(
    {'actual': test_y, 'prediction': prediction5}
).assign(diff=test_y - prediction5)

Unnamed: 0,actual,prediction,diff
11,12.595029,11.696129,0.8989
142,10.235029,14.937062,-4.702033
34,12.315029,12.487529,-0.1725
169,7.905028,8.441186,-0.536157
1,10.845029,11.222929,-0.3779
69,11.375029,11.583929,-0.2089
152,5.845029,7.018029,-1.173
188,9.595029,8.801796,0.793233
48,9.915029,9.708129,0.2069
178,8.715029,8.849596,-0.134567


## **Error Comparison between different models**

In [76]:
# Print the held-out mean squared error of every model, one per line,
# in the order the models were fitted above.
error_report = [
    ("linear_regression_error :", linear_regression_error),
    ("polynomical_regression_error :", polynomical_regression_error),
    ("decisiontree_regression_error :", decisiontree_regression_error),
    ("randomforest_dept10_error :", randomforest_dept10_error),
    ("randomforest_dept50_error :", randomforest_dept50_error),
]
for label, mse in error_report:
    print(label, mse)

linear_regression_error : 7.5770207963850025
polynomical_regression_error : 144.06419390642378
decisiontree_regression_error : 5.847035267436357
randomforest_dept10_error : 5.396570018009168
randomforest_dept50_error : 5.429224822158827
