In [14]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

/kaggle/input/regression-data/sample_submission_load_shortfall (1).csv
/kaggle/input/regression-data/df_train.csv
/kaggle/input/regression-data/df_test.csv


2. Loading the data

In [16]:
# Read the labelled training set and the unlabelled test set.
df_train = pd.read_csv('/kaggle/input/regression-data/df_train.csv')
df_test = pd.read_csv('/kaggle/input/regression-data/df_test.csv')

# Preview the first rows. The frame is named df_train; the previous
# `train.head(5)` raised NameError on a fresh kernel.
df_train.head(5)

Unnamed: 0.1,Unnamed: 0,time,Madrid_wind_speed,Valencia_wind_deg,Bilbao_rain_1h,Valencia_wind_speed,Seville_humidity,Madrid_humidity,Bilbao_clouds_all,Bilbao_wind_speed,...,Madrid_temp_max,Barcelona_temp,Bilbao_temp_min,Bilbao_temp,Barcelona_temp_min,Bilbao_temp_max,Seville_temp_min,Madrid_temp,Madrid_temp_min,load_shortfall_3h
0,0,2015-01-01 03:00:00,0.666667,level_5,0.0,0.666667,74.333333,64.0,0.0,1.0,...,265.938,281.013,269.338615,269.338615,281.013,269.338615,274.254667,265.938,265.938,6715.666667
1,1,2015-01-01 06:00:00,0.333333,level_10,0.0,1.666667,78.333333,64.666667,0.0,1.0,...,266.386667,280.561667,270.376,270.376,280.561667,270.376,274.945,266.386667,266.386667,4171.666667
2,2,2015-01-01 09:00:00,1.0,level_9,0.0,1.0,71.333333,64.333333,0.0,1.0,...,272.708667,281.583667,275.027229,275.027229,281.583667,275.027229,278.792,272.708667,272.708667,4274.666667
3,3,2015-01-01 12:00:00,1.0,level_8,0.0,1.0,65.333333,56.333333,0.0,1.0,...,281.895219,283.434104,281.135063,281.135063,283.434104,281.135063,285.394,281.895219,281.895219,5075.666667
4,4,2015-01-01 15:00:00,1.0,level_7,0.0,1.0,59.0,57.0,2.0,0.333333,...,280.678437,284.213167,282.252063,282.252063,284.213167,282.252063,285.513719,280.678437,280.678437,6620.666667


3. Exploratory Data Analysis (EDA)

In [None]:
# Look at data statistics
print(df_train.describe())

# Correlations and plots only make sense for numeric columns. Text columns
# (e.g. 'time', 'Valencia_wind_deg') make DataFrame.corr() raise on
# pandas >= 2.0, and the 'Unnamed: *' row counters are not real features.
eda_columns = [
    col for col in df_train.select_dtypes(include='number').columns
    if not str(col).startswith('Unnamed')
]
numeric_train = df_train[eda_columns]
corr_matrix = numeric_train.corr()

# Plot relevant feature interactions
# A pairplot over every column draws thousands of panels, so restrict it to
# the five features most strongly correlated with the target.
target_corr = corr_matrix['load_shortfall_3h'].drop('load_shortfall_3h').abs()
top_features = target_corr.nlargest(5).index.tolist()
sns.pairplot(numeric_train[top_features + ['load_shortfall_3h']])
plt.show()

# Evaluate correlation
fig, ax = plt.subplots(figsize=(14, 12))
sns.heatmap(corr_matrix, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Correlation between numeric features')
plt.show()

# Have a look at feature distributions
numeric_train.hist(figsize=(20, 20), bins=30)
plt.tight_layout()
plt.show()


        Unnamed: 0  Madrid_wind_speed  Bilbao_rain_1h  Valencia_wind_speed  \
count  8763.000000        8763.000000     8763.000000          8763.000000   
mean   4381.000000           2.425729        0.135753             2.586272   
std    2529.804538           1.850371        0.374901             2.411190   
min       0.000000           0.000000        0.000000             0.000000   
25%    2190.500000           1.000000        0.000000             1.000000   
50%    4381.000000           2.000000        0.000000             1.666667   
75%    6571.500000           3.333333        0.100000             3.666667   
max    8762.000000          13.000000        3.000000            52.000000   

       Seville_humidity  Madrid_humidity  Bilbao_clouds_all  \
count       8763.000000      8763.000000        8763.000000   
mean          62.658793        57.414717          43.469132   
std           22.621226        24.335396          32.551044   
min            8.333333         6.333333     

4. Data Engineering

In [None]:
# Data engineering: apply the same cleaning to both frames, add calendar
# features, impute the test set and standardise the features.
#
# The previous version referenced 'renewable_energy' / 'fossil_fuel_energy',
# which are not columns of this dataset (KeyError), and dropped rows from
# df_test, which removes timestamps that the submission must cover.
TARGET_COL = 'load_shortfall_3h'
TIME_COL = 'time'


def engineer_features(frame):
    """Return a cleaned copy of ``frame`` with calendar features added.

    - Drops the 'Unnamed: *' row-counter columns.
    - Converts every non-numeric column except ``TIME_COL`` to the number
      embedded in its text (e.g. 'level_5' -> 5); values without digits
      become NaN.
    - Adds 'hour', 'day_of_week' and 'month' derived from ``TIME_COL``.

    ``TIME_COL`` itself is kept unchanged because the submission cell needs it.
    """
    unnamed_cols = [col for col in frame.columns if str(col).startswith('Unnamed')]
    out = frame.drop(columns=unnamed_cols).copy()

    # NOTE(review): assumes every text column other than 'time' is a
    # label-encoded number like 'level_5' - confirm against the full schema.
    text_cols = [
        col for col in out.columns
        if col != TIME_COL and not pd.api.types.is_numeric_dtype(out[col])
    ]
    for col in text_cols:
        digits = out[col].astype(str).str.extract(r'(\d+)', expand=False)
        out[col] = pd.to_numeric(digits, errors='coerce')

    stamps = pd.to_datetime(out[TIME_COL])
    out['hour'] = stamps.dt.hour
    out['day_of_week'] = stamps.dt.dayofweek
    out['month'] = stamps.dt.month
    return out


n_test_rows = len(df_test)

# Remove Missing Values/Features
# Incomplete training rows can be dropped; test rows cannot.
df_train = engineer_features(df_train).dropna()
df_test = engineer_features(df_test)
assert len(df_train) > 0, 'dropna() removed every training row'

feature_cols = [col for col in df_train.columns if col not in (TARGET_COL, TIME_COL)]

# Fill gaps in the test set with medians learned from the training set only.
df_test[feature_cols] = df_test[feature_cols].fillna(df_train[feature_cols].median())

# Engineer Existing Features
# Standardise using statistics fitted on the training set only.
scaler = StandardScaler()
df_train[feature_cols] = scaler.fit_transform(df_train[feature_cols])
df_test[feature_cols] = scaler.transform(df_test[feature_cols])

# Every test timestamp must still be present and fully populated.
assert len(df_test) == n_test_rows
assert not df_test[feature_cols].isna().any().any()



5. Modelling

In [None]:
# Split Data
# The column to predict is 'load_shortfall_3h'; the previous placeholder
# 'target_variable' does not exist in df_train and raised KeyError.
TARGET_COL = 'load_shortfall_3h'

# Create Target and Features Data
# The target variable is the variable we are trying to predict, and the features are the input variables.
# Use only numeric columns present in BOTH frames, so the estimators never see
# text columns such as 'time' and train/test share the same column order.
# 'Unnamed: *' row counters are excluded because they carry no signal.
numeric_test_cols = set(df_test.select_dtypes(include='number').columns)
feature_cols = [
    col for col in df_train.select_dtypes(include='number').columns
    if col != TARGET_COL
    and col in numeric_test_cols
    and not str(col).startswith('Unnamed')
]
X_train = df_train[feature_cols]
y_train = df_train[TARGET_COL]
X_test = df_test[feature_cols]

# Create Machine Learning Models
# Linear Regression Model
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

# Random Forest Regressor Model
# A fixed seed keeps the forest identical across Restart & Run All.
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Evaluate ML Models Selected
# For Training Data
# These scores are measured on the rows the models were fitted on, so they
# are optimistic (especially for the random forest).
y_train_pred_lr = lr.predict(X_train)
y_train_pred_rf = rf.predict(X_train)
print('Linear Regression RMSE (Train):', np.sqrt(mean_squared_error(y_train, y_train_pred_lr)))
print('Random Forest RMSE (Train):', np.sqrt(mean_squared_error(y_train, y_train_pred_rf)))
print('Linear Regression R2 (Train):', r2_score(y_train, y_train_pred_lr))
print('Random Forest R2 (Train):', r2_score(y_train, y_train_pred_rf))


6. Model Performance

In [None]:
# Compare Model Performance
# The Kaggle test set has no labels, so there is no `y_test` to score against
# (the previous version raised NameError). Instead, hold out the last 20% of
# the labelled rows and score fresh copies of both models on it.
# shuffle=False keeps the split in row order.
# NOTE(review): assumes X_train is still in chronological order - confirm.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, shuffle=False
)

candidates = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(random_state=42),
}

val_scores = {}
for model_name, model in candidates.items():
    model.fit(X_fit, y_fit)
    y_val_pred = model.predict(X_val)
    val_scores[model_name] = {
        'RMSE': np.sqrt(mean_squared_error(y_val, y_val_pred)),
        'R2': r2_score(y_val, y_val_pred),
    }
    print(f'{model_name} RMSE (Validation):', val_scores[model_name]['RMSE'])
    print(f'{model_name} R2 (Validation):', val_scores[model_name]['R2'])

# Choose Best Model and Motivate Why
# Based on the RMSE and R2 score on the held-out validation data, choose the
# model with the lowest RMSE and highest R2 score.
best_model_name = min(val_scores, key=lambda name: val_scores[name]['RMSE'])
print('Best model by validation RMSE:', best_model_name)

In [None]:
# Build the submission: one row per test timestamp with its predicted shortfall.
# Change y_pred_rf to y_pred_lr if Linear Regression is the best model.
output = df_test[['time']].assign(load_shortfall_3h=y_pred_rf)
output.to_csv('output.csv', index=False)