In [1]:
# Set imports
import copy
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from imblearn.over_sampling import RandomOverSampler

### Dataset:
Dua, D. and Graff, C. (2019). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA:
University of California, School of Information and Computer Science.

Source: Data Source: http://data.seoul.go.kr/
SOUTH KOREA PUBLIC HOLIDAYS. URL: publicholidays.go.kr

### Evaluating A Linear Regression Model

1 - Mean Absolute Error <br>
&emsp;&emsp;* Lets take all of the errors/residuals, sum the distance, take the average <br>
&emsp;&emsp;* This says how far off we are <br>
&emsp;&emsp;* $\displaystyle \Bigg[\frac{\sum \limits _{i=1} ^{n} |y_{i} - \hat{y_{i}}|}{n}\Bigg]$ <br><br>

2 - Mean Squared Error <br>
&emsp;&emsp;* Same concept but to punish distance from linear model more harshly <br>
&emsp;&emsp;* To do this we square the summation <br>
&emsp;&emsp;* This is good for removing outliers as they will face a more intense value weighting. <br>
&emsp;&emsp;* $\displaystyle \Bigg[\frac{\sum \limits _{i=1} ^{n} |y_{i} - \hat{y_{i}}|^{2}}{n}\Bigg]$ <br><br>

3 - Root Mean Squared Error <br>
&emsp;&emsp;* Same concept but to punish distance from linear model more harshly <br>
&emsp;&emsp;* To do this we square the summation <br>
&emsp;&emsp;* This is good for more direct comparison to Y. The root mean squared provides an easier y comparion <br>
&emsp;&emsp;* $\displaystyle \sqrt{\Bigg[\frac{\sum \limits _{i=1} ^{n} |y_{i} - \hat{y_{i}}|^{2}}{n}\Bigg]}$<br><br>

4 - Coefficient of Determination <br>
&emsp;&emsp;* RSS - Sum of Squared Residuals -> The Numerator in the Above Functions<br>
&emsp;&emsp;* TSS - Total Sum of Squares<br><br>
&emsp;&emsp;* TSS = $\displaystyle \sum \limits _{i=1} ^{n} |y_{i} - \bar{y_{i}}|^{2}$<br><br>
&emsp;&emsp;* Notice Instead of predicted value '^' we use average/mean value '-'<br>
&emsp;&emsp;* A value close to 0 with $R^{2}$ is a good indicator usually<br><br>
&emsp;&emsp;* $R^{2} = 1 - \Bigg[\frac{RSS}{TSS}\Bigg]$ <br><br>


### Data Organization and Cleanup

In [2]:
# Let's name the columns
dataset_cols = ['bike_count', 'hour', 'temp', 'humidity', 'wind', 'visibility', 'dew_pt_temp', 'radiation', 'rain', 'snow', 'functional']

# To remove select columns add "drop", dropping... the date, holiday, and seasons columns, applying to columns axis
df = pd.read_csv("DataSets/seoulBikeSharingDemand/SeoulBikeData.csv").drop(["Date", "Holiday", "Seasons"], axis=1)
df.head()

Unnamed: 0,Rented Bike Count,Hour,Temperature(C),Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature(C),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Functioning Day
0,254,0,-5.2,37,2.2,2000,-17.6,0.0,0.0,0.0,Yes
1,204,1,-5.5,38,0.8,2000,-17.6,0.0,0.0,0.0,Yes
2,173,2,-6.0,39,1.0,2000,-17.7,0.0,0.0,0.0,Yes
3,107,3,-6.2,40,0.9,2000,-17.6,0.0,0.0,0.0,Yes
4,78,4,-6.0,36,2.3,2000,-18.6,0.0,0.0,0.0,Yes


In [3]:
df.columns = dataset_cols
df['functional'] = (df['functional'] == 'YES').astype(int)
df = df[df['hour'] == 12]
df = df.drop(['hour'], axis=1)
df.head()

Unnamed: 0,bike_count,temp,humidity,wind,visibility,dew_pt_temp,radiation,rain,snow,functional
12,449,1.7,23,1.4,2000,-17.2,1.11,0.0,0.0,0
36,479,4.3,41,1.3,1666,-7.8,1.09,0.0,0.0,0
60,333,5.8,85,1.7,349,3.4,0.43,0.0,0.0,0
84,393,-0.3,38,4.8,1823,-12.9,1.11,0.0,0.0,0
108,321,-2.3,25,0.0,1962,-19.7,0.0,0.0,0.0,0
