In [None]:

# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Load the dataset
# This cell expects `data` to be a pandas DataFrame that has already been
# loaded by an earlier cell; it is not created here.

# Step 1: Handle missing values
# Numerical gaps are filled with the column median, categorical gaps with the
# column mode.
# Separate numerical and categorical columns
numerical_features = data.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = data.select_dtypes(include=[object]).columns.tolist()

# Assign the filled column back instead of calling fillna(inplace=True) on the
# `data[col]` selection: the chained in-place form is deprecated in pandas 2.x
# and does not update `data` when copy-on-write is enabled.
for col in numerical_features:
    data[col] = data[col].fillna(data[col].median())

for col in categorical_features:
    data[col] = data[col].fillna(data[col].mode()[0])

# Step 2: Label encoding for categorical features
label_encoder = LabelEncoder()
# categorical_features = ['Building_Type', 'Building_Status', 'Green_Certified', 'Maintenance_Priority']

# The single encoder is refitted for every column, so the learned classes are
# stored per column to keep the integer codes interpretable afterwards.
label_classes = {}
for col in categorical_features:
    data[col] = label_encoder.fit_transform(data[col])
    label_classes[col] = label_encoder.classes_

# Step 3: Split data into features (X) and target (y)
X = data.drop(columns=['Electricity_Bill'])  # All columns except target
y = data['Electricity_Bill']  # Target column

# Step 4: Train-test split (80:20 ratio)
# The split happens BEFORE scaling so that the scaler never sees test rows.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Step 5: Standardize the features using StandardScaler
# Every column of X is scaled (including the label-encoded ones). The scaler is
# fitted on the training rows only and then applied to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Full matrix scaled with the training statistics, kept because this cell has
# always defined `X_scaled`.
X_scaled = scaler.transform(X)

# Step 6: Apply Linear Regression
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

# Step 7: Predict on training and test data
y_train_pred = lin_reg.predict(X_train)
y_test_pred = lin_reg.predict(X_test)

# Step 8: Calculate performance metrics

# Mean Squared Error (MSE)
mse_train = mean_squared_error(y_train, y_train_pred)
mse_test = mean_squared_error(y_test, y_test_pred)

# Root Mean Squared Error (RMSE)
rmse_train = np.sqrt(mse_train)
rmse_test = np.sqrt(mse_test)

# Mean Absolute Error (MAE)
mae_train = mean_absolute_error(y_train, y_train_pred)
mae_test = mean_absolute_error(y_test, y_test_pred)

# R2 Score
r2_train = r2_score(y_train, y_train_pred)
r2_test = r2_score(y_test, y_test_pred)

# Adjusted R2 Score
def adjusted_r2_score(r2, X):
    """Return R2 adjusted for the number of predictors in ``X``.

    Computes ``1 - (1 - r2) * (n - 1) / (n - p - 1)`` where ``n`` and ``p`` are
    the first two entries of ``X.shape``.

    Parameters
    ----------
    r2 : float
        Plain R2 score of the fitted model.
    X : object with a two-entry ``shape``
        Feature matrix the score was computed on.

    Returns
    -------
    float
        The adjusted R2, or NaN when ``n - p - 1 <= 0``: with no residual
        degrees of freedom the formula either divides by zero or flips sign,
        so no meaningful value exists.
    """
    n = X.shape[0]  # Number of samples
    p = X.shape[1]  # Number of features
    residual_dof = n - p - 1
    if residual_dof <= 0:
        return float("nan")
    return 1 - (1 - r2) * (n - 1) / residual_dof

adj_r2_train = adjusted_r2_score(r2_train, X_train)
adj_r2_test = adjusted_r2_score(r2_test, X_test)

# Step 9: Print results


def _print_metric_block(title, mse, rmse, mae, r2, adj_r2):
    """Print one titled group of regression metrics, one metric per line."""
    print(title)
    print(f"  MSE: {mse:.2f}")
    print(f"  RMSE: {rmse:.2f}")
    print(f"  MAE: {mae:.2f}")
    print(f"  R2 Score: {r2:.4f}")
    print(f"  Adjusted R2 Score: {adj_r2:.2f}")


_print_metric_block(
    "Training Metrics:", mse_train, rmse_train, mae_train, r2_train, adj_r2_train
)
# The leading newline separates the test group from the training group.
_print_metric_block(
    "\nTest Metrics:", mse_test, rmse_test, mae_test, r2_test, adj_r2_test
)


### PART D

In [40]:
# import pandas as pd
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler
# from sklearn.linear_model import LinearRegression
# from sklearn.feature_selection import RFE
# from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
# import numpy as np

# # Load your dataset
# data = pd.read_csv('path_to_your_electricity_bill_dataset.csv')

# # Define feature columns and target variable
# feature_columns = ['Building_Type', 'Construction_Year', 'Number_of_Floors', 'Energy_Consumption_Per_SqM', 
#                     'Water_Usage_Per_Building', 'Waste_Recycled_Percentage', 'Occupancy_Rate', 'Indoor_Air_Quality', 
#                     'Smart_Devices_Count', 'Green_Certified', 'Maintenance_Resolution_Time', 'Building_Status', 
#                     'Maintenance_Priority', 'Energy_Per_SqM', 'Number_of_Residents']
# target_column = 'Electricity_Bill'

# X = data[feature_columns]
# y = data[target_column]

# RFE is used below, but its only import in this notebook is commented out, so
# a fresh kernel would hit a NameError. Import it explicitly.
from sklearn.feature_selection import RFE

# How many features recursive feature elimination should keep.
N_FEATURES_TO_SELECT = 3

# Encode categorical variables if necessary
# `X` and `y` are not created in this cell; they must already exist from an
# earlier cell.
X = pd.get_dummies(X, drop_first=True)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
# The scaler is fitted on the training rows only and reused for the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize the model
model = LinearRegression()

# Perform RFE
rfe = RFE(model, n_features_to_select=N_FEATURES_TO_SELECT)
rfe = rfe.fit(X_train_scaled, y_train)

# Get the selected features
# rfe.support_ is a boolean mask over the columns of X.
selected_features = X.columns[rfe.support_]
print("Selected Features:", selected_features)

# Train the model with selected features
X_train_selected = X_train_scaled[:, rfe.support_]
X_test_selected = X_test_scaled[:, rfe.support_]
model.fit(X_train_selected, y_train)

# Predict on train and test sets
y_train_pred = model.predict(X_train_selected)
y_test_pred = model.predict(X_test_selected)

# Calculate performance metrics
def evaluate_metrics(y_true, y_pred):
    """Score predictions against the true targets.

    Returns a 4-tuple ordered as (MSE, RMSE, R2, MAE); the RMSE is the square
    root of the MSE.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    return (
        squared_error,
        np.sqrt(squared_error),
        r2_score(y_true, y_pred),
        mean_absolute_error(y_true, y_pred),
    )

# Score both splits, then unpack in the (MSE, RMSE, R2, MAE) order that
# evaluate_metrics returns.
train_scores = evaluate_metrics(y_train, y_train_pred)
test_scores = evaluate_metrics(y_test, y_test_pred)
mse_train, rmse_train, r2_train, mae_train = train_scores
mse_test, rmse_test, r2_test, mae_test = test_scores

# One print call per split; sep="\n" puts each metric on its own line.
print(
    f"Train MSE: {mse_train}",
    f"Train RMSE: {rmse_train}",
    f"Train R2: {r2_train}",
    f"Train MAE: {mae_train}",
    sep="\n",
)

print(
    f"Test MSE: {mse_test}",
    f"Test RMSE: {rmse_test}",
    f"Test R2: {r2_test}",
    f"Test MAE: {mae_test}",
    sep="\n",
)


Selected Features: Index(['Building_Type', 'Green_Certified', 'Number_of_Residents'], dtype='object')
Train MSE: 24569032.90689799
Train RMSE: 4956.715939702212
Train R2: 0.010134545491283897
Train MAE: 4006.4733775147365
Test MSE: 23941409.06299838
Test RMSE: 4892.995918964002
Test R2: 0.013901513867940918
Test MAE: 3813.948128176773


### PART E

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np

# Load your dataset
# TODO: point DATA_PATH at the real CSV. The value below is a placeholder and
# pd.read_csv raises FileNotFoundError for it.
DATA_PATH = 'path_to_your_electricity_bill_dataset.csv'
data = pd.read_csv(DATA_PATH)

# Define feature columns and target variable
feature_columns = ['Building_Type', 'Construction_Year', 'Number_of_Floors', 'Energy_Consumption_Per_SqM', 
                    'Water_Usage_Per_Building', 'Waste_Recycled_Percentage', 'Occupancy_Rate', 'Indoor_Air_Quality', 
                    'Smart_Devices_Count', 'Green_Certified', 'Maintenance_Resolution_Time', 'Building_Status', 
                    'Maintenance_Priority', 'Energy_Per_SqM', 'Number_of_Residents']
target_column = 'Electricity_Bill'

X = data[feature_columns]
y = data[target_column]

# Define categorical and numerical features
# Start from the columns known to be categorical, then add every feature whose
# dtype is not numeric so that no text column is routed to StandardScaler.
# NOTE(review): Maintenance_Priority is presumably text-valued and would be
# picked up here - confirm against the CSV.
declared_categorical = ['Building_Type', 'Green_Certified', 'Building_Status']
non_numeric_columns = set(X.select_dtypes(exclude=[np.number]).columns)
categorical_features = [
    col for col in feature_columns
    if col in declared_categorical or col in non_numeric_columns
]
numerical_features = [col for col in feature_columns if col not in categorical_features]

# Create a ColumnTransformer with OneHotEncoder for categorical features and StandardScaler for numerical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(drop='first'), categorical_features)
    ])

# Create a Ridge Regression model
model = Ridge()

# Create a pipeline with preprocessing and modeling steps
# Preprocessing lives inside the pipeline, so it is fitted on the training
# rows only when pipeline.fit is called.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
pipeline.fit(X_train, y_train)

# Predict on train and test sets
y_train_pred = pipeline.predict(X_train)
y_test_pred = pipeline.predict(X_test)

# Calculate performance metrics
def evaluate_metrics(y_true, y_pred):
    """Compute regression error metrics for one set of predictions.

    The result is the tuple (mse, rmse, r2, mae), with rmse derived from mse
    by taking its square root.
    """
    scores = {
        "mse": mean_squared_error(y_true, y_pred),
        "r2": r2_score(y_true, y_pred),
        "mae": mean_absolute_error(y_true, y_pred),
    }
    scores["rmse"] = np.sqrt(scores["mse"])
    return scores["mse"], scores["rmse"], scores["r2"], scores["mae"]

# Score the training split first, then the test split; each result unpacks in
# the (MSE, RMSE, R2, MAE) order returned by evaluate_metrics.
train_scores = evaluate_metrics(y_train, y_train_pred)
test_scores = evaluate_metrics(y_test, y_test_pred)
mse_train, rmse_train, r2_train, mae_train = train_scores
mse_test, rmse_test, r2_test, mae_test = test_scores

# Calculate Adjusted R² score
def adjusted_r2_score(r2, n, p):
    """Return R2 adjusted for ``p`` predictors fitted on ``n`` samples.

    Computes ``1 - (1 - r2) * (n - 1) / (n - p - 1)``.

    Parameters
    ----------
    r2 : float
        Plain R2 score of the fitted model.
    n : int
        Number of samples the score was computed on.
    p : int
        Number of predictors used by the model.

    Returns
    -------
    float
        The adjusted R2, or NaN when ``n - p - 1 <= 0``: with no residual
        degrees of freedom the formula either divides by zero or flips sign,
        so no meaningful value exists.
    """
    residual_dof = n - p - 1
    if residual_dof <= 0:
        return float("nan")
    return 1 - (1 - r2) * (n - 1) / residual_dof

n_train = X_train.shape[0]
n_test = X_test.shape[0]
# p has to count the columns the Ridge model was actually fitted on. One-hot
# encoding inside the pipeline changes the column count, so the width of the
# raw X_train is not the number of predictors; read it from the fitted
# coefficients instead.
p = np.shape(pipeline.named_steps['model'].coef_)[-1]
adjusted_r2_train = adjusted_r2_score(r2_train, n_train, p)
adjusted_r2_test = adjusted_r2_score(r2_test, n_test, p)

print(f"Train MSE: {mse_train}")
print(f"Train RMSE: {rmse_train}")
print(f"Train R2: {r2_train}")
print(f"Train Adjusted R2: {adjusted_r2_train}")
print(f"Train MAE: {mae_train}")

print(f"Test MSE: {mse_test}")
print(f"Test RMSE: {rmse_test}")
print(f"Test R2: {r2_test}")
print(f"Test Adjusted R2: {adjusted_r2_test}")
print(f"Test MAE: {mae_test}")


FileNotFoundError: [Errno 2] No such file or directory: 'path_to_your_electricity_bill_dataset.csv'