# Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Loading CSV File

In [2]:
data = pd.read_csv("WeatherHistory.csv")

In [3]:
data

Unnamed: 0,Formatted Date,Summary,Precip Type,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars),Daily Summary
0,2006-04-01 00:00:00.000 +0200,Partly Cloudy,rain,9.472222,7.388889,0.89,14.1197,251.0,15.8263,0.0,1015.13,Partly cloudy throughout the day.
1,2006-04-01 01:00:00.000 +0200,Partly Cloudy,rain,9.355556,7.227778,0.86,14.2646,259.0,15.8263,0.0,1015.63,Partly cloudy throughout the day.
2,2006-04-01 02:00:00.000 +0200,Mostly Cloudy,rain,9.377778,9.377778,0.89,3.9284,204.0,14.9569,0.0,1015.94,Partly cloudy throughout the day.
3,2006-04-01 03:00:00.000 +0200,Partly Cloudy,rain,8.288889,5.944444,0.83,14.1036,269.0,15.8263,0.0,1016.41,Partly cloudy throughout the day.
4,2006-04-01 04:00:00.000 +0200,Mostly Cloudy,rain,8.755556,6.977778,0.83,11.0446,259.0,15.8263,0.0,1016.51,Partly cloudy throughout the day.
...,...,...,...,...,...,...,...,...,...,...,...,...
96448,2016-09-09 19:00:00.000 +0200,Partly Cloudy,rain,26.016667,26.016667,0.43,10.9963,31.0,16.1000,0.0,1014.36,Partly cloudy starting in the morning.
96449,2016-09-09 20:00:00.000 +0200,Partly Cloudy,rain,24.583333,24.583333,0.48,10.0947,20.0,15.5526,0.0,1015.16,Partly cloudy starting in the morning.
96450,2016-09-09 21:00:00.000 +0200,Partly Cloudy,rain,22.038889,22.038889,0.56,8.9838,30.0,16.1000,0.0,1015.66,Partly cloudy starting in the morning.
96451,2016-09-09 22:00:00.000 +0200,Partly Cloudy,rain,21.522222,21.522222,0.60,10.5294,20.0,16.1000,0.0,1015.95,Partly cloudy starting in the morning.


In [4]:
data.shape

(96453, 12)

In [5]:
data.isnull().any()

Formatted Date              False
Summary                     False
Precip Type                  True
Temperature (C)             False
Apparent Temperature (C)    False
Humidity                    False
Wind Speed (km/h)           False
Wind Bearing (degrees)      False
Visibility (km)             False
Loud Cover                  False
Pressure (millibars)        False
Daily Summary               False
dtype: bool

# Data Pre-Processing

In [6]:
data['Precip Type'].value_counts()

rain    85224
snow    10712
Name: Precip Type, dtype: int64

In [7]:
data['Precip Type'] = data['Precip Type'].fillna(data['Precip Type'].mode()[0])

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96453 entries, 0 to 96452
Data columns (total 12 columns):
Formatted Date              96453 non-null object
Summary                     96453 non-null object
Precip Type                 96453 non-null object
Temperature (C)             96453 non-null float64
Apparent Temperature (C)    96453 non-null float64
Humidity                    96453 non-null float64
Wind Speed (km/h)           96453 non-null float64
Wind Bearing (degrees)      96453 non-null float64
Visibility (km)             96453 non-null float64
Loud Cover                  96453 non-null float64
Pressure (millibars)        96453 non-null float64
Daily Summary               96453 non-null object
dtypes: float64(8), object(4)
memory usage: 8.8+ MB


In [9]:
data.describe()

Unnamed: 0,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars)
count,96453.0,96453.0,96453.0,96453.0,96453.0,96453.0,96453.0,96453.0
mean,11.932678,10.855029,0.734899,10.81064,187.509232,10.347325,0.0,1003.235956
std,9.551546,10.696847,0.195473,6.913571,107.383428,4.192123,0.0,116.969906
min,-21.822222,-27.716667,0.0,0.0,0.0,0.0,0.0,0.0
25%,4.688889,2.311111,0.6,5.8282,116.0,8.3398,0.0,1011.9
50%,12.0,12.0,0.78,9.9659,180.0,10.0464,0.0,1016.45
75%,18.838889,18.838889,0.89,14.1358,290.0,14.812,0.0,1021.09
max,39.905556,39.344444,1.0,63.8526,359.0,16.1,0.0,1046.38


In [10]:
Precip_Type = pd.get_dummies(data['Precip Type'], drop_first = True)

In [11]:
Precip_Type

Unnamed: 0,snow
0,0
1,0
2,0
3,0
4,0
...,...
96448,0
96449,0
96450,0
96451,0


In [12]:
data = pd.concat([data, Precip_Type], axis = 1)

In [13]:
data.columns

Index(['Formatted Date', 'Summary', 'Precip Type', 'Temperature (C)',
       'Apparent Temperature (C)', 'Humidity', 'Wind Speed (km/h)',
       'Wind Bearing (degrees)', 'Visibility (km)', 'Loud Cover',
       'Pressure (millibars)', 'Daily Summary', 'snow'],
      dtype='object')

In [14]:
data.drop(['Formatted Date', 'Summary', 'Precip Type', 'Daily Summary',], axis = 1, inplace = True)

In [15]:
data.head()

Unnamed: 0,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars),snow
0,9.472222,7.388889,0.89,14.1197,251.0,15.8263,0.0,1015.13,0
1,9.355556,7.227778,0.86,14.2646,259.0,15.8263,0.0,1015.63,0
2,9.377778,9.377778,0.89,3.9284,204.0,14.9569,0.0,1015.94,0
3,8.288889,5.944444,0.83,14.1036,269.0,15.8263,0.0,1016.41,0
4,8.755556,6.977778,0.83,11.0446,259.0,15.8263,0.0,1016.51,0


In [16]:
data.columns

Index(['Temperature (C)', 'Apparent Temperature (C)', 'Humidity',
       'Wind Speed (km/h)', 'Wind Bearing (degrees)', 'Visibility (km)',
       'Loud Cover', 'Pressure (millibars)', 'snow'],
      dtype='object')

In [17]:
X = data.iloc[:, 1:]
y = data.iloc[:, 0]

In [18]:
X.shape

(96453, 8)

# Dividing Data into Train and Test

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 5)

In [21]:
X_train.head()

Unnamed: 0,Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars),snow
17232,8.772222,0.8,3.4293,33.0,14.9086,0.0,1019.03,0
55631,-4.177778,0.69,28.98,290.0,16.1,0.0,1016.0,0
3339,-15.711111,0.87,6.3917,48.0,11.6725,0.0,1031.04,1
84619,2.05,0.73,16.7923,1.0,16.1,0.0,1024.01,0
26376,13.333333,0.55,11.27,150.0,16.1,0.0,1013.2,0


In [22]:
X_test.head()

Unnamed: 0,Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars),snow
31418,2.222222,0.96,3.22,40.0,9.982,0.0,1018.2,0
52873,-2.822222,0.61,6.44,200.0,16.1,0.0,1016.5,1
32723,17.222222,0.52,8.05,180.0,11.27,0.0,1020.2,0
2194,10.95,0.71,20.6402,149.0,9.982,0.0,1015.17,0
7586,1.394444,0.83,6.8103,140.0,15.8263,0.0,1014.19,0


# Linear Regression

In [23]:
from sklearn.linear_model import LinearRegression

linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [24]:
linear_prediction = linear_reg.predict(X_test)

In [25]:
linear_prediction[:5]

array([ 3.51124243, -0.08552297, 17.57538123, 12.84233745,  3.27259053])

In [26]:
np.mean((y_test - linear_prediction) ** 2)

0.8947929505133739

In [27]:
pd.DataFrame({'Actual': y_test,
              'Predicted': linear_prediction,
              'Difference': (y_test - linear_prediction)})

Unnamed: 0,Actual,Predicted,Difference
31418,2.222222,3.511242,-1.289020
52873,-0.555556,-0.085523,-0.470033
32723,17.222222,17.575381,-0.353159
2194,10.950000,12.842337,-1.892337
7586,3.200000,3.272591,-0.072591
...,...,...,...
76328,27.244444,26.855017,0.389427
42855,8.888889,8.712931,0.175958
91229,-2.177778,-3.741665,1.563887
8325,17.222222,16.875002,0.347220


# Polynomial Regression

In [28]:
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree = 4)
X_poly = poly.fit_transform(X_train)

In [29]:
polynomial_reg = LinearRegression()
polynomial_reg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [30]:
polynomial_prediction = polynomial_reg.predict(X_test)

In [31]:
polynomial_prediction[:5]

array([ 3.51124243, -0.08552297, 17.57538123, 12.84233745,  3.27259053])

In [32]:
np.mean((y_test - polynomial_prediction) ** 2)

0.8947929505133739

In [33]:
pd.DataFrame({'Actual': y_test,
              'Predicted': polynomial_prediction,
              'Difference': y_test - polynomial_prediction})

Unnamed: 0,Actual,Predicted,Difference
31418,2.222222,3.511242,-1.289020
52873,-0.555556,-0.085523,-0.470033
32723,17.222222,17.575381,-0.353159
2194,10.950000,12.842337,-1.892337
7586,3.200000,3.272591,-0.072591
...,...,...,...
76328,27.244444,26.855017,0.389427
42855,8.888889,8.712931,0.175958
91229,-2.177778,-3.741665,1.563887
8325,17.222222,16.875002,0.347220


# Decision Tree Regressor

In [34]:
from sklearn.tree import DecisionTreeRegressor

In [35]:
dtree_reg = DecisionTreeRegressor(random_state = 3)

In [36]:
dtree_reg.fit(X_train, y_train)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=3, splitter='best')

In [37]:
dtree_prediction = dtree_reg.predict(X_test)

In [38]:
dtree_prediction[:5]

array([ 2.22222222, -0.55555556, 17.22222222, 10.95      ,  3.13333333])

In [39]:
np.mean((y_test - dtree_prediction) ** 2)

0.0060247966332408415

In [40]:
pd.DataFrame({'Actual': y_test,
              'Predicted': dtree_prediction,
              'Difference': y_test - dtree_prediction})

Unnamed: 0,Actual,Predicted,Difference
31418,2.222222,2.222222,-8.881784e-16
52873,-0.555556,-0.555556,-1.110223e-16
32723,17.222222,17.222222,-1.776357e-14
2194,10.950000,10.950000,1.776357e-15
7586,3.200000,3.133333,6.666667e-02
...,...,...,...
76328,27.244444,27.205556,3.888889e-02
42855,8.888889,8.888889,0.000000e+00
91229,-2.177778,-2.200000,2.222222e-02
8325,17.222222,17.222222,-1.776357e-14


# Random Forest Regressor

In [41]:
from sklearn.ensemble import RandomForestRegressor

In [42]:
rf_reg = RandomForestRegressor(max_depth = 50, random_state = 3)

In [43]:
rf_reg.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=50,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=3, verbose=0,
                      warm_start=False)

In [44]:
rf_prediction = rf_reg.predict(X_test)

In [45]:
rf_prediction[:5]

array([ 2.22222222, -0.55555556, 17.22222222, 10.95      ,  3.19166667])

In [46]:
np.mean((y_test - rf_prediction) ** 2)

0.003119216534800628

In [47]:
pd.DataFrame({'Actual': y_test,
              'Predicted': rf_prediction,
              'Difference': y_test - rf_prediction})

Unnamed: 0,Actual,Predicted,Difference
31418,2.222222,2.222222,0.000000e+00
52873,-0.555556,-0.555556,-1.110223e-16
32723,17.222222,17.222222,1.065814e-14
2194,10.950000,10.950000,-1.776357e-15
7586,3.200000,3.191667,8.333333e-03
...,...,...,...
76328,27.244444,27.228889,1.555556e-02
42855,8.888889,8.894444,-5.555556e-03
91229,-2.177778,-2.163333,-1.444444e-02
8325,17.222222,17.222222,1.065814e-14
