# Regression Analysis

In [36]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import seaborn as sns # Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt # Visualization
%matplotlib inline
mpl.rcParams['figure.dpi'] = 150

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.compose import TransformedTargetRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error, r2_score

np.random.seed(0)

### Load Data

In [None]:
cd '/Users/shifraisaacs/Documents/Externship/cgi_flood_prediction_mitigation'

/Users/shifraisaacs/Documents/Externship/cgi_flood_prediction_mitigation


In [6]:
gauge = pd.read_csv('Data/Gauge_Windows.csv', index_col=0)
gauge.head()

Unnamed: 0,Day1,Day2,Day3,Day4,Day5,Day6,Day7,rainfall_ft_mean,next_day_guage
0,2.39,2.42,2.43,2.43,2.43,3.48,4.08,0.002857,3.02
1,3.02,2.86,2.69,2.72,2.69,2.67,2.64,0.002857,2.61
2,2.61,3.27,3.33,2.7,2.64,2.59,2.68,0.005714,3.07
3,3.07,3.36,6.56,4.43,3.49,3.18,3.01,0.0,2.89
4,2.89,2.81,2.76,2.71,2.69,2.89,2.77,0.001429,2.66


### Pre-processing

In [None]:
gauge.columns = gauge.columns.str.lower()
gauge.head()

Unnamed: 0,day1,day2,day3,day4,day5,day6,day7,rainfall_ft_mean,next_day_guage
0,2.39,2.42,2.43,2.43,2.43,3.48,4.08,0.002857,3.02
1,3.02,2.86,2.69,2.72,2.69,2.67,2.64,0.002857,2.61
2,2.61,3.27,3.33,2.7,2.64,2.59,2.68,0.005714,3.07
3,3.07,3.36,6.56,4.43,3.49,3.18,3.01,0.0,2.89
4,2.89,2.81,2.76,2.71,2.69,2.89,2.77,0.001429,2.66


In [None]:
gauge.describe()

Unnamed: 0,day1,day2,day3,day4,day5,day6,day7,rainfall_ft_mean,next_day_guage
count,782.0,782.0,782.0,782.0,782.0,782.0,782.0,782.0,782.0
mean,3.457775,3.446969,3.448504,3.451151,3.484182,3.487916,3.477302,0.003285,3.454719
std,1.0102,0.927379,0.903554,1.029832,1.097917,1.029804,0.976931,0.003848,1.017027
min,2.32,2.33,2.33,2.33,2.33,2.32,2.33,0.0,0.0
25%,2.83,2.83,2.84,2.84,2.87,2.88,2.87,0.0,2.83
50%,3.17,3.21,3.21,3.185,3.2,3.195,3.185,0.002857,3.17
75%,3.7275,3.74,3.69,3.7075,3.775,3.79,3.77,0.004286,3.7275
max,10.15,8.98,8.28,15.12,14.99,11.62,10.63,0.03,10.15


### Data Scaling

In [None]:
scaler = MinMaxScaler()

In [None]:
scaled = scaler.fit_transform(gauge)

In [None]:
scaled_gauge = pd.DataFrame(scaled, columns=gauge.columns)
scaled_gauge.head()

Unnamed: 0,day1,day2,day3,day4,day5,day6,day7,rainfall_ft_mean,next_day_guage
0,0.00894,0.013534,0.016807,0.007819,0.007899,0.124731,0.210843,0.095238,0.297537
1,0.0894,0.079699,0.060504,0.030493,0.028436,0.037634,0.037349,0.095238,0.257143
2,0.037037,0.141353,0.168067,0.028929,0.024487,0.029032,0.042169,0.190476,0.302463
3,0.095785,0.154887,0.710924,0.164191,0.091627,0.092473,0.081928,0.0,0.284729
4,0.072797,0.07218,0.072269,0.029711,0.028436,0.06129,0.053012,0.047619,0.262069


### Data Split

In [None]:
y = scaled_gauge.pop('next_day_guage')

In [None]:
X_train, X_test, y_train, y_test = train_test_split(scaled_gauge, y, test_size=0.2, random_state=0)
y_test[:5]

382    0.305419
487    0.281773
351    0.365517
14     0.350739
145    0.253202
Name: next_day_guage, dtype: float64

### Model Comparison

In [None]:
models = [LinearRegression(), Ridge(), Lasso(), ElasticNet(), RandomForestRegressor(), GradientBoostingRegressor()]

In [None]:
mae, mse, msle, r2 = [], [], [], []

In [37]:
for model in models:
	model = TransformedTargetRegressor(regressor= model,
                                        transformer = MinMaxScaler()
                                        ).fit(X_train, y_train)
	preds = model.predict(X_test)
	
	mae_temp, mse_temp, msle_temp, r2_temp = \
	mean_absolute_error(preds, y_test), mean_squared_error(preds, y_test), \
	mean_squared_log_error(preds, y_test), r2_score(preds, y_test)
	
	mae.append(mae_temp)
	mse.append(mse_temp)
	msle.append(msle_temp)
	r2.append(r2_temp)
	
print('All models scored')

All models scored


In [38]:
model_results = pd.DataFrame({'model': models, 'r2': r2, 'mse': mse, 'mae': mae, 'msle': mse})
model_results

Unnamed: 0,model,r2,mse,mae,msle
0,LinearRegression(),0.1343516,0.004419,0.032392,0.004419
1,Ridge(),-0.05827299,0.004721,0.0343,0.004721
2,Lasso(),-3.464793e+30,0.010677,0.071499,0.010677
3,ElasticNet(),-3.464793e+30,0.010677,0.071499,0.010677
4,RandomForestRegressor(),0.03392498,0.007228,0.043679,0.007228
5,GradientBoostingRegressor(),0.03817115,0.007148,0.042071,0.007148
