In [1]:
import warnings
warnings.filterwarnings('ignore')

import os
import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
%matplotlib inline

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LinearRegression

import tensorflow as tf
from tensorflow import keras

from sklearn import preprocessing

In [2]:
os.chdir("/content/drive/My Drive/")
weather_df = pd.read_csv('weatherHistory.csv')

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
weather_df.head()

Unnamed: 0,Formatted Date,Summary,Precip Type,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars),Daily Summary
0,2006-04-01 00:00:00.000 +0200,Partly Cloudy,rain,9.472222,7.388889,0.89,14.1197,251.0,15.8263,0.0,1015.13,Partly cloudy throughout the day.
1,2006-04-01 01:00:00.000 +0200,Partly Cloudy,rain,9.355556,7.227778,0.86,14.2646,259.0,15.8263,0.0,1015.63,Partly cloudy throughout the day.
2,2006-04-01 02:00:00.000 +0200,Mostly Cloudy,rain,9.377778,9.377778,0.89,3.9284,204.0,14.9569,0.0,1015.94,Partly cloudy throughout the day.
3,2006-04-01 03:00:00.000 +0200,Partly Cloudy,rain,8.288889,5.944444,0.83,14.1036,269.0,15.8263,0.0,1016.41,Partly cloudy throughout the day.
4,2006-04-01 04:00:00.000 +0200,Mostly Cloudy,rain,8.755556,6.977778,0.83,11.0446,259.0,15.8263,0.0,1016.51,Partly cloudy throughout the day.


In [7]:
weather_df['Precip Type'].value_counts()

rain    85224
snow    10712
Name: Precip Type, dtype: int64

In [8]:
weather_df.loc[weather_df['Precip Type'].isnull(),'Precip Type']='rain'

In [9]:
round(100*(weather_df.isnull().sum()/len(weather_df.index)),2)

Formatted Date              0.0
Summary                     0.0
Precip Type                 0.0
Temperature (C)             0.0
Apparent Temperature (C)    0.0
Humidity                    0.0
Wind Speed (km/h)           0.0
Wind Bearing (degrees)      0.0
Visibility (km)             0.0
Loud Cover                  0.0
Pressure (millibars)        0.0
Daily Summary               0.0
dtype: float64

In [10]:
weather_df.loc[weather_df['Precip Type']=='rain','Precip Type'] = 1
weather_df.loc[weather_df['Precip Type']=='snow','Precip Type'] = 0

In [11]:
weather_df_num = weather_df[list(weather_df.dtypes[weather_df.dtypes!='object'].index)]

In [14]:
weather_y = weather_df_num.pop('Temperature (C)')
weather_x = weather_df_num

In [15]:
train_x,test_x,train_y,test_y = train_test_split(weather_x,weather_y,test_size = 0.2,random_state=4)

In [16]:
train_x.head()

Unnamed: 0,Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars)
70626,21.061111,0.31,12.558,110.0,16.1,0.0,1005.87
52457,25.016667,0.36,18.4989,352.0,10.3523,0.0,1025.36
90690,0.738889,0.89,17.1304,270.0,15.8263,0.0,1014.75
69528,13.772222,0.78,14.49,300.0,15.8263,0.0,1014.56
92419,23.288889,0.82,6.3917,357.0,16.1,0.0,1022.05


In [17]:
model = LinearRegression()
model.fit(train_x,train_y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [18]:
prediction = model.predict(test_x)

In [19]:
np.mean((prediction-test_y)**2)

0.902274371188337

In [24]:
pd.DataFrame({'actual':test_y,
              'prediction':prediction,
              'diff':(test_y-prediction)})

Unnamed: 0,actual,prediction,diff
37443,-2.288889,-3.355714,1.066825
86534,8.861111,9.418530,-0.557419
2082,9.805556,9.701321,0.104235
53130,27.222222,27.096837,0.125385
45196,17.705556,17.302053,0.403503
...,...,...,...
55387,-10.066667,-11.960987,1.894320
25976,9.972222,9.831699,0.140523
32423,12.777778,15.436904,-2.659127
26570,15.277778,16.307788,-1.030010


In [40]:
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree = 4)
x_poly = poly.fit_transform(train_x)

poly.fit(x_poly, train_y)
lin2 = LinearRegression()
lin2.fit(x_poly, train_y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [41]:
prediction2 = lin2.predict(poly.fit_transform(test_x))
np.mean((prediction2-test_y)**2)

0.14602317508410093

In [42]:
pd.DataFrame({'actual':test_y,
              'prediction':prediction2,
              'diff':(test_y-prediction2)})

Unnamed: 0,actual,prediction,diff
37443,-2.288889,-2.188155,-0.100734
86534,8.861111,9.113336,-0.252225
2082,9.805556,9.602271,0.203285
53130,27.222222,27.130414,0.091808
45196,17.705556,17.770751,-0.065196
...,...,...,...
55387,-10.066667,-10.172420,0.105753
25976,9.972222,9.773758,0.198464
32423,12.777778,13.294539,-0.516761
26570,15.277778,15.414607,-0.136829


In [43]:
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor(random_state = 0)
regressor.fit(train_x,train_y)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=0, splitter='best')

In [45]:
prediction3 = regressor.predict(test_x)
np.mean((prediction3-test_y)**2)

0.006723574480775507

In [46]:
pd.DataFrame({'actual':test_y,
              'prediction':prediction3,
              'diff':(test_y-prediction3)})

Unnamed: 0,actual,prediction,diff
37443,-2.288889,-2.272222,-1.666667e-02
86534,8.861111,8.855556,5.555556e-03
2082,9.805556,9.788889,1.666667e-02
53130,27.222222,27.222222,-3.552714e-15
45196,17.705556,17.705556,-7.105427e-15
...,...,...,...
55387,-10.066667,-10.000000,-6.666667e-02
25976,9.972222,9.972222,0.000000e+00
32423,12.777778,12.777778,-4.263256e-14
26570,15.277778,15.277778,3.552714e-15


In [47]:
from sklearn.ensemble import RandomForestRegressor
regr = RandomForestRegressor(max_depth = 10 , random_state = 0 , n_estimators = 100)
regr.fit(train_x,train_y)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=10, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=0, verbose=0, warm_start=False)

In [49]:
prediction4 = regr.predict(test_x)
np.mean((prediction4 - test_y)**2)

0.0071767589236793015

In [50]:
pd.DataFrame({'actual':test_y,
              'prediction':prediction4,
              'diff':(test_y-prediction4)})

Unnamed: 0,actual,prediction,diff
37443,-2.288889,-2.366102,0.077213
86534,8.861111,8.892260,-0.031149
2082,9.805556,9.847246,-0.041690
53130,27.222222,27.202118,0.020104
45196,17.705556,17.713716,-0.008160
...,...,...,...
55387,-10.066667,-10.100922,0.034255
25976,9.972222,9.913953,0.058269
32423,12.777778,12.781718,-0.003940
26570,15.277778,15.275648,0.002130


In [52]:
from sklearn.ensemble import RandomForestRegressor
regr1 = RandomForestRegressor(max_depth = 50 , random_state = 0 , n_estimators = 100)
regr1.fit(train_x,train_y)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=50, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=0, verbose=0, warm_start=False)

In [53]:
prediction5 = regr1.predict(test_x)
np.mean((prediction5 - test_y)**2)

0.001618068523446305

In [54]:
pd.DataFrame({'actual':test_y,
              'prediction':prediction4,
              'diff':(test_y-prediction4)})

Unnamed: 0,actual,prediction,diff
37443,-2.288889,-2.366102,0.077213
86534,8.861111,8.892260,-0.031149
2082,9.805556,9.847246,-0.041690
53130,27.222222,27.202118,0.020104
45196,17.705556,17.713716,-0.008160
...,...,...,...
55387,-10.066667,-10.100922,0.034255
25976,9.972222,9.913953,0.058269
32423,12.777778,12.781718,-0.003940
26570,15.277778,15.275648,0.002130
