In [1]:
# Basic Linear Regression Model (runtime: a few seconds) - 80/10/10 train/validation/test split
# ------------------------------------------------------------
# 1. Import Libraries and Mount Google Drive
# ------------------------------------------------------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

from google.colab import drive

# Make the Drive folder that holds the CSV reachable from this Colab runtime.
drive.mount('/content/drive')

# ------------------------------------------------------------
# 2. Load and Preview the Dataset
# ------------------------------------------------------------
# Read the charging-session CSV and show its first rows as a sanity check.
file_path = "/content/drive/My Drive/Colab Notebooks/Data/2020_Data_V0.1.csv"
df = pd.read_csv(file_path)
print("Dataset preview:", df.head(), sep="\n")

# ------------------------------------------------------------
# 3. Data Preprocessing and Basic Feature Engineering
# ------------------------------------------------------------
# Convert 'Charging Time (hh:mm:ss)' to total seconds
def convert_to_seconds(time_str):
    """Convert an ``h:mm:ss`` duration string into a total number of seconds.

    Parameters
    ----------
    time_str : str
        Colon-separated hours, minutes and seconds, e.g. ``"2:08:25"``.

    Returns
    -------
    int
        ``hours * 3600 + minutes * 60 + seconds``.

    Raises
    ------
    ValueError
        If the string does not consist of exactly three colon-separated
        integer fields.
    """
    hours, minutes, seconds = (int(field) for field in time_str.split(':'))
    return (hours * 60 + minutes) * 60 + seconds

# Turn every 'h:mm:ss' duration string into an integer number of seconds.
df['Charging Time (seconds)'] = df['Charging Time (hh:mm:ss)'].map(convert_to_seconds)

# Parse the start timestamps once, then derive hour / day-of-month / month columns.
df['Start Date'] = pd.to_datetime(df['Start Date'])
start_dt = df['Start Date'].dt
df['Start Hour'] = start_dt.hour
df['Start Day'] = start_dt.day
df['Start Month'] = start_dt.month

# Inputs are the four numeric columns built above; the target is 'Energy (kWh)'.
features = ['Charging Time (seconds)', 'Start Hour', 'Start Day', 'Start Month']
X, y = df[features], df['Energy (kWh)']

# ------------------------------------------------------------
# 4. Normalize Features and Split the Data
# ------------------------------------------------------------
# Split BEFORE scaling: the scaler must learn its mean/variance from the training
# rows only. Fitting it on the whole dataset (as was done previously) leaks
# validation/test statistics into the training features.

# First, split off 10% for testing.
X_temp_raw, X_test_raw, y_temp, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
# Then split the remaining 90% into ~80% training and ~10% validation
# (0.1111 ~= 0.1 / 0.9, i.e. 10% of the full dataset).
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X_temp_raw, y_temp, test_size=0.1111, random_state=42
)

# Fit on the training partition only, then apply the same transform everywhere else.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_val = scaler.transform(X_val_raw)
X_test = scaler.transform(X_test_raw)

# Kept so that any later cell still referring to these names keeps working;
# both are now scaled with the training-set statistics.
X_temp = scaler.transform(X_temp_raw)
X_scaled = scaler.transform(X)

print("\nData Split:")
print(f"Training samples: {X_train.shape[0]}")
print(f"Validation samples: {X_val.shape[0]}")
print(f"Test samples: {X_test.shape[0]}")

# ------------------------------------------------------------
# 5. Build and Train a Basic Linear Regression Model
# ------------------------------------------------------------
# Fit a default LinearRegression on the training partition
# (fit() returns the estimator, so construction and fitting can be chained).
lr_model = LinearRegression().fit(X_train, y_train)

# ------------------------------------------------------------
# 6. Evaluate the Model
# ------------------------------------------------------------
# Predict on the training, validation and test partitions, in that order.
y_train_pred, y_val_pred, y_test_pred = (
    lr_model.predict(partition) for partition in (X_train, X_val, X_test)
)

# Compute performance metrics.
def compute_metrics(y_true, y_pred):
    """Return ``(r2, mae, rmse)`` for one set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.

    Returns
    -------
    tuple
        ``(r2_score, mean_absolute_error, sqrt(mean_squared_error))``.
    """
    mse = mean_squared_error(y_true, y_pred)
    return (
        r2_score(y_true, y_pred),
        mean_absolute_error(y_true, y_pred),
        np.sqrt(mse),  # RMSE is the square root of the MSE
    )

r2_train, mae_train, rmse_train = compute_metrics(y_train, y_train_pred)
r2_val, mae_val, rmse_val = compute_metrics(y_val, y_val_pred)
r2_test, mae_test, rmse_test = compute_metrics(y_test, y_test_pred)

# Report the same three metrics for each partition, one stanza per partition.
print("\nLinear Regression Performance:")
for _heading, _r2, _mae, _rmse in (
    ("Training Set:", r2_train, mae_train, rmse_train),
    ("Validation Set:", r2_val, mae_val, rmse_val),
    ("Test Set:", r2_test, mae_test, rmse_test),
):
    print(_heading)
    print(f"  R² Score: {_r2:.3f}")
    print(f"  MAE: {_mae:.3f} kWh")
    print(f"  RMSE: {_rmse:.3f} kWh")


Mounted at /content/drive
Dataset preview:
            Start Date        End Date Charging Time (hh:mm:ss)  Energy (kWh)  \
0  2020-01-01 09:11:00  1/1/2020 11:20                  2:08:25        12.885   
1  2020-01-01 09:32:00  1/1/2020 11:19                  1:47:06         5.936   
2  2020-01-01 09:44:00  1/1/2020 10:57                  1:12:40         3.652   
3  2020-01-01 09:45:00  1/1/2020 10:45                  1:00:15         6.031   
4  2020-01-01 09:45:00  1/1/2020 15:08                  5:22:07        32.260   

  Port Type  Port Number Plug Type  
0   Level 2            2     J1772  
1   Level 2            2     J1772  
2   Level 2            2     J1772  
3   Level 2            1     J1772  
4   Level 2            1     J1772  

Data Split:
Training samples: 16077
Validation samples: 2010
Test samples: 2010

Linear Regression Performance:
Training Set:
  R² Score: 0.852
  MAE: 2.529 kWh
  RMSE: 3.496 kWh
Validation Set:
  R² Score: 0.853
  MAE: 2.547 kWh
  RMSE: 3.593 kWh

In [2]:
# Basic Linear Regression Model (runtime: a few seconds) - 70/15/15 train/validation/test split
# ------------------------------------------------------------
# 1. Import Libraries and Mount Google Drive
# ------------------------------------------------------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

from google.colab import drive

# Make the Drive folder that holds the CSV reachable from this Colab runtime.
drive.mount('/content/drive')

# ------------------------------------------------------------
# 2. Load and Preview the Dataset
# ------------------------------------------------------------
# Read the charging-session CSV and show its first rows as a sanity check.
file_path = "/content/drive/My Drive/Colab Notebooks/Data/2020_Data_V0.1.csv"
df = pd.read_csv(file_path)
print("Dataset preview:", df.head(), sep="\n")

# ------------------------------------------------------------
# 3. Data Preprocessing and Basic Feature Engineering
# ------------------------------------------------------------
# Convert 'Charging Time (hh:mm:ss)' to total seconds
def convert_to_seconds(time_str):
    """Convert an ``h:mm:ss`` duration string into a total number of seconds.

    Parameters
    ----------
    time_str : str
        Colon-separated hours, minutes and seconds, e.g. ``"2:08:25"``.

    Returns
    -------
    int
        ``hours * 3600 + minutes * 60 + seconds``.

    Raises
    ------
    ValueError
        If the string does not consist of exactly three colon-separated
        integer fields.
    """
    hours, minutes, seconds = (int(field) for field in time_str.split(':'))
    return (hours * 60 + minutes) * 60 + seconds

# Turn every 'h:mm:ss' duration string into an integer number of seconds.
df['Charging Time (seconds)'] = df['Charging Time (hh:mm:ss)'].map(convert_to_seconds)

# Parse the start timestamps once, then derive hour / day-of-month / month columns.
df['Start Date'] = pd.to_datetime(df['Start Date'])
start_dt = df['Start Date'].dt
df['Start Hour'] = start_dt.hour
df['Start Day'] = start_dt.day
df['Start Month'] = start_dt.month

# Inputs are the four numeric columns built above; the target is 'Energy (kWh)'.
features = ['Charging Time (seconds)', 'Start Hour', 'Start Day', 'Start Month']
X, y = df[features], df['Energy (kWh)']

# ------------------------------------------------------------
# 4. Normalize Features and Split the Data (70% Train, 15% Test, 15% Validation)
# ------------------------------------------------------------
# Split BEFORE scaling: the scaler must learn its mean/variance from the training
# rows only. Fitting it on the whole dataset (as was done previously) leaks
# validation/test statistics into the training features.

# First, split off 30% for testing+validation.
X_train_raw, X_temp_raw, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
# Then split the temporary set equally into test and validation sets (each 15% of total data).
X_test_raw, X_val_raw, y_test, y_val = train_test_split(
    X_temp_raw, y_temp, test_size=0.5, random_state=42
)

# Fit on the training partition only, then apply the same transform everywhere else.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_val = scaler.transform(X_val_raw)
X_test = scaler.transform(X_test_raw)

# Kept so that any later cell still referring to these names keeps working;
# both are now scaled with the training-set statistics.
X_temp = scaler.transform(X_temp_raw)
X_scaled = scaler.transform(X)

print("\nData Split:")
print(f"Training samples: {X_train.shape[0]}")   # ~70%
print(f"Validation samples: {X_val.shape[0]}")     # ~15%
print(f"Test samples: {X_test.shape[0]}")            # ~15%

# ------------------------------------------------------------
# 5. Build and Train a Basic Linear Regression Model
# ------------------------------------------------------------
# Fit a default LinearRegression on the training partition
# (fit() returns the estimator, so construction and fitting can be chained).
lr_model = LinearRegression().fit(X_train, y_train)

# ------------------------------------------------------------
# 6. Evaluate the Model
# ------------------------------------------------------------
# Predict on the training, validation and test partitions, in that order.
y_train_pred, y_val_pred, y_test_pred = (
    lr_model.predict(partition) for partition in (X_train, X_val, X_test)
)

# Compute performance metrics.
def compute_metrics(y_true, y_pred):
    """Return ``(r2, mae, rmse)`` for one set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.

    Returns
    -------
    tuple
        ``(r2_score, mean_absolute_error, sqrt(mean_squared_error))``.
    """
    mse = mean_squared_error(y_true, y_pred)
    return (
        r2_score(y_true, y_pred),
        mean_absolute_error(y_true, y_pred),
        np.sqrt(mse),  # RMSE is the square root of the MSE
    )

r2_train, mae_train, rmse_train = compute_metrics(y_train, y_train_pred)
r2_val, mae_val, rmse_val = compute_metrics(y_val, y_val_pred)
r2_test, mae_test, rmse_test = compute_metrics(y_test, y_test_pred)

# Report the same three metrics for each partition, one stanza per partition.
print("\nLinear Regression Performance:")
for _heading, _r2, _mae, _rmse in (
    ("Training Set:", r2_train, mae_train, rmse_train),
    ("Validation Set:", r2_val, mae_val, rmse_val),
    ("Test Set:", r2_test, mae_test, rmse_test),
):
    print(_heading)
    print(f"  R² Score: {_r2:.3f}")
    print(f"  MAE: {_mae:.3f} kWh")
    print(f"  RMSE: {_rmse:.3f} kWh")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Dataset preview:
            Start Date        End Date Charging Time (hh:mm:ss)  Energy (kWh)  \
0  2020-01-01 09:11:00  1/1/2020 11:20                  2:08:25        12.885   
1  2020-01-01 09:32:00  1/1/2020 11:19                  1:47:06         5.936   
2  2020-01-01 09:44:00  1/1/2020 10:57                  1:12:40         3.652   
3  2020-01-01 09:45:00  1/1/2020 10:45                  1:00:15         6.031   
4  2020-01-01 09:45:00  1/1/2020 15:08                  5:22:07        32.260   

  Port Type  Port Number Plug Type  
0   Level 2            2     J1772  
1   Level 2            2     J1772  
2   Level 2            2     J1772  
3   Level 2            1     J1772  
4   Level 2            1     J1772  

Data Split:
Training samples: 14067
Validation samples: 3015
Test samples: 3015

Linear Regression Performance:
Training Set:
  R² Score: 0.852
