In [1]:
# Libraries for data loading, data manipulation and data visulisation
import numpy as np
import pandas as pd
import re

import matplotlib.pyplot as plt
import seaborn as sns

from  scipy import stats
from scipy.stats import pearsonr
from statsmodels.graphics.tsaplots import plot_pacf, plot_acf
from statsmodels.graphics.correlation import plot_corr

from scipy.stats import chi2_contingency

from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import VarianceThreshold



import calendar
from datetime import datetime

from sklearn import *
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import *
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.impute import SimpleImputer


pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)


sns.set_style('whitegrid')

# Libraries for data preparation and model building
#import *


from sklearn.preprocessing import StandardScaler,PowerTransformer
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn import metrics
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import cross_val_score

# Setting global constants to ensure notebook results are reproducible
PARAMETER_CONSTANT = 4

In [2]:
df = pd.read_csv('df_train.csv')
df_test = pd.read_csv("df_test.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8763 entries, 0 to 8762
Data columns (total 49 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Unnamed: 0            8763 non-null   int64  
 1   time                  8763 non-null   object 
 2   Madrid_wind_speed     8763 non-null   float64
 3   Valencia_wind_deg     8763 non-null   object 
 4   Bilbao_rain_1h        8763 non-null   float64
 5   Valencia_wind_speed   8763 non-null   float64
 6   Seville_humidity      8763 non-null   float64
 7   Madrid_humidity       8763 non-null   float64
 8   Bilbao_clouds_all     8763 non-null   float64
 9   Bilbao_wind_speed     8763 non-null   float64
 10  Seville_clouds_all    8763 non-null   float64
 11  Bilbao_wind_deg       8763 non-null   float64
 12  Barcelona_wind_speed  8763 non-null   float64
 13  Barcelona_wind_deg    8763 non-null   float64
 14  Madrid_clouds_all     8763 non-null   float64
 15  Seville_wind_speed   

In [4]:
df.isnull().sum()

Unnamed: 0                 0
time                       0
Madrid_wind_speed          0
Valencia_wind_deg          0
Bilbao_rain_1h             0
Valencia_wind_speed        0
Seville_humidity           0
Madrid_humidity            0
Bilbao_clouds_all          0
Bilbao_wind_speed          0
Seville_clouds_all         0
Bilbao_wind_deg            0
Barcelona_wind_speed       0
Barcelona_wind_deg         0
Madrid_clouds_all          0
Seville_wind_speed         0
Barcelona_rain_1h          0
Seville_pressure           0
Seville_rain_1h            0
Bilbao_snow_3h             0
Barcelona_pressure         0
Seville_rain_3h            0
Madrid_rain_1h             0
Barcelona_rain_3h          0
Valencia_snow_3h           0
Madrid_weather_id          0
Barcelona_weather_id       0
Bilbao_pressure            0
Seville_weather_id         0
Valencia_pressure       2068
Seville_temp_max           0
Madrid_pressure            0
Valencia_temp_max          0
Valencia_temp              0
Bilbao_weather

In [5]:
# Drop "Unnamed: 0" column from data
df =  df.drop(["Unnamed: 0"], axis = "columns")

In [6]:
# creating dummy variables to convert categorical into numeric values
mylist = list(df.select_dtypes(include=['object']).columns)
dummies = pd.get_dummies(df[mylist], prefix= mylist)
df.drop(mylist, axis=1, inplace = True)
X = pd.concat([df,dummies], axis =1 )

In [7]:
# Comparing different imputing values:

print("Mode ---> ", df["Valencia_pressure"].mode())

print("Median ---> ", df["Valencia_pressure"].median())

print("Mean ---> ", df["Valencia_pressure"].mean())

Mode --->  0    1018.0
Name: Valencia_pressure, dtype: float64
Median --->  1015.0
Mean --->  1012.0514065222798


In [8]:
# imputing missing values
for col in df:
    df["Valencia_pressure"] = df["Valencia_pressure"].fillna(df["Valencia_pressure"].mode()[0])
    df[col] = df[col].fillna(df[col].mode()[0])

In [9]:
print(df.isnull().sum())

Madrid_wind_speed       0
Bilbao_rain_1h          0
Valencia_wind_speed     0
Seville_humidity        0
Madrid_humidity         0
Bilbao_clouds_all       0
Bilbao_wind_speed       0
Seville_clouds_all      0
Bilbao_wind_deg         0
Barcelona_wind_speed    0
Barcelona_wind_deg      0
Madrid_clouds_all       0
Seville_wind_speed      0
Barcelona_rain_1h       0
Seville_rain_1h         0
Bilbao_snow_3h          0
Barcelona_pressure      0
Seville_rain_3h         0
Madrid_rain_1h          0
Barcelona_rain_3h       0
Valencia_snow_3h        0
Madrid_weather_id       0
Barcelona_weather_id    0
Bilbao_pressure         0
Seville_weather_id      0
Valencia_pressure       0
Seville_temp_max        0
Madrid_pressure         0
Valencia_temp_max       0
Valencia_temp           0
Bilbao_weather_id       0
Seville_temp            0
Valencia_humidity       0
Valencia_temp_min       0
Barcelona_temp_max      0
Madrid_temp_max         0
Barcelona_temp          0
Bilbao_temp_min         0
Bilbao_temp 

# Ridge regression model

In [10]:
# for cross validation
X = df.drop('load_shortfall_3h',1)
Y = df['load_shortfall_3h']
x_train, x_validate, y_train, y_validate = train_test_split(X,Y, test_size =0.2)

  X = df.drop('load_shortfall_3h',1)


In [11]:
## training the model
ridge = Ridge()
ridge.fit(x_train,y_train)

In [12]:
# predicting on validation
pred = ridge.predict(x_validate)

In [13]:
# calculating mse
MSE = np.mean((pred - y_validate)**2)
print('MSE:', MSE)

MSE: 23448128.602873057


In [14]:
## calculating r2 
R2 = ridge.score(x_validate,y_validate) 
print('R2:', R2)

R2: 0.12829177148102766


In [15]:
# Extract the model intercept value
b0 = float(ridge.intercept_)
print("Intercept:", float(b0))

Intercept: -57768.247072966886


In [16]:
# Extract the model coefficient value
coeff = pd.DataFrame(ridge.coef_, X.columns, columns=['Coefficient'])
print(coeff)

                      Coefficient
Madrid_wind_speed     -297.938832
Bilbao_rain_1h        -485.306446
Valencia_wind_speed    -76.793920
Seville_humidity       -46.086741
Madrid_humidity          4.350615
Bilbao_clouds_all       -5.114637
Bilbao_wind_speed      -56.193083
Seville_clouds_all       3.583897
Bilbao_wind_deg         -2.112684
Barcelona_wind_speed   -80.144651
Barcelona_wind_deg      -3.106705
Madrid_clouds_all        6.859224
Seville_wind_speed      -7.483991
Barcelona_rain_1h     -177.888342
Seville_rain_1h        769.026327
Bilbao_snow_3h          27.476496
Barcelona_pressure      -0.007228
Seville_rain_3h      -4250.735490
Madrid_rain_1h           7.522522
Barcelona_rain_3h    -3297.981313
Valencia_snow_3h     -2365.109485
Madrid_weather_id       -0.688626
Barcelona_weather_id     1.270573
Bilbao_pressure         -8.182663
Seville_weather_id       1.710115
Valencia_pressure       47.732931
Seville_temp_max      -199.952801
Madrid_pressure          0.186652
Valencia_temp_

# Assess the predictive accuracy of the model

In [17]:
#LINEAR REGRESSION TO COMPARE WITH
#Define linear model
lreg = LinearRegression()
# training a linear regression model on train
lreg.fit(x_train,y_train)
# predicting on validation
pred_validate = lreg.predict(x_validate)
# calculating mse
MSE = np.mean((pred_validate - y_validate)**2)
print('MSE:', MSE)
# evaluation using r-square
R2 = lreg.score(x_validate,y_validate)
print('R2:', R2)


MSE: 23432948.394602984
R2: 0.12885611129184027


In [18]:
# Extract the model coefficient value
coeff = pd.DataFrame(lreg.coef_, X.columns, columns=['Coefficient'])
print(coeff)

                       Coefficient
Madrid_wind_speed      -299.579700
Bilbao_rain_1h         -496.038281
Valencia_wind_speed     -74.661914
Seville_humidity        -45.780270
Madrid_humidity           4.466720
Bilbao_clouds_all        -5.443902
Bilbao_wind_speed       -63.039875
Seville_clouds_all        4.359374
Bilbao_wind_deg          -2.086737
Barcelona_wind_speed    -74.108017
Barcelona_wind_deg       -3.278971
Madrid_clouds_all         6.879268
Seville_wind_speed      -10.479900
Barcelona_rain_1h      -175.021374
Seville_rain_1h         731.535867
Bilbao_snow_3h           47.358370
Barcelona_pressure       -0.007178
Seville_rain_3h      -46146.384684
Madrid_rain_1h           -5.102550
Barcelona_rain_3h    -25473.599133
Valencia_snow_3h      -4706.858964
Madrid_weather_id        -0.976107
Barcelona_weather_id      1.392016
Bilbao_pressure          -8.042452
Seville_weather_id        1.880302
Valencia_pressure        47.958724
Seville_temp_max       -197.166138
Madrid_pressure     

In [19]:
# Check training accuracy
train_lreg = lreg.predict(x_train)
train_ridge = ridge.predict(x_train)

print('Training MSE')
print('Linear:', metrics.mean_squared_error(y_train, train_lreg))
print('Ridge :', metrics.mean_squared_error(y_train, train_ridge))

Training MSE
Linear: 23124994.316230554
Ridge : 23161614.755832475


In [20]:
test_lreg = lreg.predict(x_validate)
test_ridge = ridge.predict(x_validate)

print('Testing MSE')
print('Linear:', metrics.mean_squared_error(y_validate, test_lreg))
print('Ridge :', metrics.mean_squared_error(y_validate, test_ridge))

Testing MSE
Linear: 23432948.394602984
Ridge : 23448128.602873057


# submission to Kaggle

In [21]:
# Training the model on complete test dataset:
ridge.fit(X, Y)

# Making predictions for the test values of y:
ridge_submission_predictions = ridge.predict(X)

In [22]:
# Make Dataframe from predicted values
ridge_pred_df = pd.DataFrame(ridge_submission_predictions, columns=['load_shortfall_3h'])
# Make Dataframe from corresponding dates
output_df = pd.DataFrame({'time':df_test['time']})

# Join dataframes to create a single submission dataframe
submission_df = output_df.join(ridge_pred_df)

#Save submission as csv
submission_df.to_csv('TeamGM4_Ridge_Regression_Kaggle_Submission', index=False)