In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import scipy.stats
from os.path import exists


In [2]:
# Load pre processed data (run explore.py if it doesn't exist)
processed_df = pd.read_csv("processed.csv")

In [3]:
processed_df.head(15)

Unnamed: 0.1,Unnamed: 0,index,Date,Time,Temperature,Dew Point,Humidity,Wind,Wind Speed,Wind Gust,Pressure,Precip.,Condition,TemperaturePrev,target
0,0,24,2014-01-01,12:00 PM,7.222222,36,71,S,6,0,29.57,0.0,Fair,2.222222,-4.444444
1,1,25,2014-01-01,12:30 PM,7.777778,36,66,S,6,0,29.57,0.0,Fair,2.777778,-5.0
2,2,26,2014-01-01,1:00 PM,7.777778,34,62,SW,10,0,29.57,0.0,Fair,2.777778,-5.0
3,3,27,2014-01-01,1:30 PM,7.777778,34,62,SW,12,0,29.57,0.0,Fair,2.777778,-5.555556
4,4,28,2014-01-01,2:00 PM,7.222222,34,66,SW,12,0,29.57,0.0,Fair,2.777778,-3.888889
5,5,29,2014-01-01,2:30 PM,7.222222,34,66,SW,9,0,29.57,0.0,Fair,2.777778,-2.777778
6,6,30,2014-01-01,3:00 PM,7.222222,34,66,SW,8,0,29.57,0.0,Fair,2.777778,-2.777778
7,7,31,2014-01-01,3:30 PM,6.111111,0,66,WSW,5,0,29.57,0.0,Fair,2.777778,0.0
8,8,32,2014-01-01,4:00 PM,5.0,0,70,VAR,2,0,29.57,0.0,Fair,2.777778,0.0
9,9,33,2014-01-01,4:30 PM,3.888889,0,75,VAR,2,0,29.57,0.0,Fair,3.888889,-1.111111


In [4]:
processed_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29608 entries, 0 to 29607
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       29608 non-null  int64  
 1   index            29608 non-null  int64  
 2   Date             29608 non-null  object 
 3   Time             29608 non-null  object 
 4   Temperature      29608 non-null  float64
 5   Dew Point        29608 non-null  int64  
 6   Humidity         29608 non-null  int64  
 7   Wind             29608 non-null  object 
 8   Wind Speed       29608 non-null  int64  
 9   Wind Gust        29608 non-null  int64  
 10  Pressure         29608 non-null  float64
 11  Precip.          29608 non-null  float64
 12  Condition        29608 non-null  object 
 13  TemperaturePrev  29608 non-null  float64
 14  target           29608 non-null  float64
dtypes: float64(5), int64(6), object(4)
memory usage: 3.4+ MB


## Using 1 hot encoding to change 'Wind' and 'Condition' columns from categorical to numerical data 

In [21]:
numerical_df =pd.get_dummies(processed_df, columns = ['Wind', 'Condition'])

In [23]:
numerical_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29608 entries, 0 to 29607
Data columns (total 85 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Unnamed: 0                           29608 non-null  int64  
 1   index                                29608 non-null  int64  
 2   Date                                 29608 non-null  object 
 3   Time                                 29608 non-null  object 
 4   Temperature                          29608 non-null  float64
 5   Dew Point                            29608 non-null  int64  
 6   Humidity                             29608 non-null  int64  
 7   Wind Speed                           29608 non-null  int64  
 8   Wind Gust                            29608 non-null  int64  
 9   Pressure                             29608 non-null  float64
 10  Precip.                              29608 non-null  float64
 11  TemperaturePrev             

In [54]:
numerical_df

Unnamed: 0.1,Unnamed: 0,index,Date,Time,Temperature,Dew Point,Humidity,Wind Speed,Wind Gust,Pressure,...,Condition_Showers in the Vicinity,Condition_Small Hail,Condition_Smoke,Condition_Snow,Condition_Snow / Windy,Condition_Snow Grains,Condition_Snow Shower,Condition_Snow Shower / Windy,Condition_Wintry Mix,Condition_Wintry Mix / Windy
0,0,24,2014-01-01,12:00 PM,7.222222,36,71,6,0,29.57,...,0,0,0,0,0,0,0,0,0,0
1,1,25,2014-01-01,12:30 PM,7.777778,36,66,6,0,29.57,...,0,0,0,0,0,0,0,0,0,0
2,2,26,2014-01-01,1:00 PM,7.777778,34,62,10,0,29.57,...,0,0,0,0,0,0,0,0,0,0
3,3,27,2014-01-01,1:30 PM,7.777778,34,62,12,0,29.57,...,0,0,0,0,0,0,0,0,0,0
4,4,28,2014-01-01,2:00 PM,7.222222,34,66,12,0,29.57,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29603,29603,4288,2020-03-30,8:30 AM,0.000000,21,64,8,0,29.87,...,0,0,0,0,0,0,0,0,0,0
29604,29604,4289,2020-03-30,9:00 AM,1.111111,21,60,10,0,29.87,...,0,0,0,0,0,0,0,0,0,0
29605,29605,4290,2020-03-30,9:30 AM,2.222222,19,52,9,0,29.87,...,0,0,0,0,0,0,0,0,0,0
29606,29606,4291,2020-03-30,10:00 AM,2.222222,19,52,10,0,29.87,...,0,0,0,0,0,0,0,0,0,0


In [26]:
# Saving dataset
numerical_df.to_csv("1HotEncoded.csv")

# Linear Regression

In [29]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [31]:
train_df, test_df = train_test_split(numerical_df, test_size = 0.1)

In [44]:
features = list(numerical_df.columns)
features.remove('Date')
features.remove('Time')
features.remove('target')
train_df[features].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26647 entries, 10272 to 20185
Data columns (total 82 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Unnamed: 0                           26647 non-null  int64  
 1   index                                26647 non-null  int64  
 2   Temperature                          26647 non-null  float64
 3   Dew Point                            26647 non-null  int64  
 4   Humidity                             26647 non-null  int64  
 5   Wind Speed                           26647 non-null  int64  
 6   Wind Gust                            26647 non-null  int64  
 7   Pressure                             26647 non-null  float64
 8   Precip.                              26647 non-null  float64
 9   TemperaturePrev                      26647 non-null  float64
 10  Wind_CALM                            26647 non-null  uint8  
 11  Wind_E                  

In [66]:
X_train = np.array(train_df[features])
Y_train = np.array(train_df['target'])
Y_train = Y_train.reshape(-1, 1)

In [67]:
X_test = np.array(test_df[features])
Y_test = np.array(test_df['target'])
Y_test = Y_test.reshape(-1, 1)

In [68]:
reg = LinearRegression().fit(X_train, Y_train)

In [69]:
reg.score(X_train, Y_train)

0.31505213821203204

In [70]:
reg.score(X_test, Y_test)

0.29462940676199456

In [73]:
reg.coef_

array([[-1.12491367e-05,  6.71010736e-04,  2.40143855e-01,
         4.71861557e-03,  6.54255068e-02,  2.12689192e-03,
         7.22642488e-03,  3.05098746e-01,  1.69331216e-12,
        -4.53237750e-01,  1.93249290e+00, -6.36242226e-01,
        -1.74888899e+00,  5.35929390e-01, -1.93742725e+00,
        -2.14170592e+00, -2.25959446e+00, -1.51826222e+00,
        -1.10541225e+00,  1.64404037e+00,  1.09944654e+00,
         1.18459555e+00,  1.64883428e+00,  1.45020918e+00,
         5.07987237e-01,  3.20081837e-01, -2.18111629e-01,
         1.24202765e+00, -2.23272606e-01, -1.19981862e-01,
        -7.43955427e-02,  2.16879545e+00, -9.24319267e-01,
        -1.06219544e+00,  1.25825083e+00,  5.73183408e+00,
         6.50299285e-01,  4.13099803e-01,  6.18553607e-01,
         1.66526779e+00, -8.19603301e-01, -1.05868309e+00,
         2.51693014e+00, -7.77817620e-01, -2.87146515e+00,
         1.19670687e-01,  3.30420428e+00, -2.55422789e+00,
        -2.93941683e-01,  7.60117259e-04,  1.21507284e+0

In [74]:
coefficients_df = pd.DataFrame()

In [78]:
coefficients_df['coef'] = req.coef_[0]

In [80]:
coefficients_df['features'] = features
