In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler


In [2]:
# Location of the energy dataset CSV. It can be overridden through the
# ENERGY_DATA_CSV environment variable so the notebook also runs on machines
# other than the original author's; when the variable is unset the original
# hard-coded location is used, so existing behaviour is unchanged.
DATA_PATH = os.environ.get(
    "ENERGY_DATA_CSV",
    r"C:\Users\DELL\Desktop\Hamoye\energydata_complete.csv",
)

# Load the raw dataset into a DataFrame
data = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first rows of the loaded frame as a quick sanity check of the load
data.head()

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,2016-01-11 17:00:00,60,30,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,...,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,2016-01-11 17:10:00,60,30,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,...,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,2016-01-11 17:20:00,50,30,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,...,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668
3,2016-01-11 17:30:00,50,40,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,...,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.410389,45.410389
4,2016-01-11 17:40:00,60,40,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,...,17.0,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097


In [4]:
# Dimensions of the loaded frame as (rows, columns)
data.shape

(19735, 29)

In [5]:
# Inspect the dtype of every column (at this point 'date' has not been parsed yet)
data.dtypes

date            object
Appliances       int64
lights           int64
T1             float64
RH_1           float64
T2             float64
RH_2           float64
T3             float64
RH_3           float64
T4             float64
RH_4           float64
T5             float64
RH_5           float64
T6             float64
RH_6           float64
T7             float64
RH_7           float64
T8             float64
RH_8           float64
T9             float64
RH_9           float64
T_out          float64
Press_mm_hg    float64
RH_out         float64
Windspeed      float64
Visibility     float64
Tdewpoint      float64
rv1            float64
rv2            float64
dtype: object

In [6]:
# Parse the textual timestamps so the 'date' column holds real datetimes
data['date'] = data['date'].pipe(pd.to_datetime)


In [7]:
# Tally the null entries per column; a column of zeros means nothing to impute
missing_data = data.isna().sum(axis=0)
print(missing_data)

date           0
Appliances     0
lights         0
T1             0
RH_1           0
T2             0
RH_2           0
T3             0
RH_3           0
T4             0
RH_4           0
T5             0
RH_5           0
T6             0
RH_6           0
T7             0
RH_7           0
T8             0
RH_8           0
T9             0
RH_9           0
T_out          0
Press_mm_hg    0
RH_out         0
Windspeed      0
Visibility     0
Tdewpoint      0
rv1            0
rv2            0
dtype: int64


In [8]:
# Standardise the column labels: trim surrounding whitespace and lower-case them
data.columns = [label.strip().lower() for label in data.columns]


In [9]:
# Confirm the column labels after the strip/lower-case normalisation
data.columns

Index(['date', 'appliances', 'lights', 't1', 'rh_1', 't2', 'rh_2', 't3',
       'rh_3', 't4', 'rh_4', 't5', 'rh_5', 't6', 'rh_6', 't7', 'rh_7', 't8',
       'rh_8', 't9', 'rh_9', 't_out', 'press_mm_hg', 'rh_out', 'windspeed',
       'visibility', 'tdewpoint', 'rv1', 'rv2'],
      dtype='object')

In [10]:
# Calculate descriptive statistics for every column of the dataset.
# `datetime_is_numeric=True` asks pandas 1.x to summarise the datetime column
# numerically (mean/min/quantiles/max). The keyword was removed in pandas 2.0,
# where datetimes are always summarised that way, so passing it raises a
# TypeError there; fall back to the plain call in that case.
try:
    descriptive_stats = data.describe(include='all', datetime_is_numeric=True)
except TypeError:
    descriptive_stats = data.describe(include='all')

# Display the descriptive statistics as the cell's rich output
descriptive_stats

Unnamed: 0,date,appliances,lights,t1,rh_1,t2,rh_2,t3,rh_3,t4,...,t9,rh_9,t_out,press_mm_hg,rh_out,windspeed,visibility,tdewpoint,rv1,rv2
count,19735,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,...,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0
mean,2016-03-20 05:30:00,97.694958,3.801875,21.686571,40.259739,20.341219,40.42042,22.267611,39.2425,20.855335,...,19.485828,41.552401,7.411665,755.522602,79.750418,4.039752,38.330834,3.760707,24.988033,24.988033
min,2016-01-11 17:00:00,10.0,0.0,16.79,27.023333,16.1,20.463333,17.2,28.766667,15.1,...,14.89,29.166667,-5.0,729.3,24.0,0.0,1.0,-6.6,0.005322,0.005322
25%,2016-02-14 23:15:00,50.0,0.0,20.76,37.333333,18.79,37.9,20.79,36.9,19.53,...,18.0,38.5,3.666667,750.933333,70.333333,2.0,29.0,0.9,12.497889,12.497889
50%,2016-03-20 05:30:00,60.0,0.0,21.6,39.656667,20.0,40.5,22.1,38.53,20.666667,...,19.39,40.9,6.916667,756.1,83.666667,3.666667,40.0,3.433333,24.897653,24.897653
75%,2016-04-23 11:45:00,100.0,0.0,22.6,43.066667,21.5,43.26,23.29,41.76,22.1,...,20.6,44.338095,10.408333,760.933333,91.666667,5.5,40.0,6.566667,37.583769,37.583769
max,2016-05-27 18:00:00,1080.0,70.0,26.26,63.36,29.856667,56.026667,29.236,50.163333,26.2,...,24.5,53.326667,26.1,772.3,100.0,14.0,66.0,15.5,49.99653,49.99653
std,,102.524891,7.935988,1.606066,3.979299,2.192974,4.069813,2.006111,3.254576,2.042884,...,2.014712,4.151497,5.317409,7.399441,14.901088,2.451221,11.794719,4.194648,14.496634,14.496634


In [11]:
# Single-feature regression: model t6 as a linear function of t2
y = data['t6']  # Target: Temperature outside
X = data[['t2']]  # Predictor: Temperature in the living room

# Hold out 30% of the rows for evaluation (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Fit ordinary least squares on the training portion
model = LinearRegression().fit(X_train, y_train)

# Score the held-out rows
predictions = model.predict(X_test)

# RMSE is the square root of the mean squared error, reported to 3 decimals
rmse = np.sqrt(mean_squared_error(y_test, predictions))
rmse_rounded = round(rmse, 3)

print("Root Mean Squared Error (RMSE):", rmse_rounded)

Root Mean Squared Error (RMSE): 3.63


In [12]:
# Drop the columns that are excluded from modelling
data_cleaned = data.drop(columns=['date', 'lights'])

# 'appliances' is the target; every remaining column is a feature
y = data_cleaned['appliances']
X = data_cleaned.drop(columns='appliances')

# Hold out 30% of the rows for evaluation (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Learn the min/max scaling from the training rows only, then apply it to
# both splits so no information from the test rows leaks into the scaler
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit ordinary least squares on the scaled training features
model = LinearRegression().fit(X_train_scaled, y_train)

# In-sample predictions, used for the training-set error
y_train_pred = model.predict(X_train_scaled)

# Training-set Mean Absolute Error (MAE), reported to 3 decimals
mae_train = mean_absolute_error(y_train, y_train_pred)
mae_train_rounded = round(mae_train, 3)

print("Mean Absolute Error (MAE) for the training set:", mae_train_rounded)

Mean Absolute Error (MAE) for the training set: 53.742


In [13]:
# Score the held-out rows with the fitted linear model
y_test_pred = model.predict(X_test_scaled)

# Test-set Mean Absolute Error (MAE), reported to 3 decimals
mae_test = mean_absolute_error(y_test, y_test_pred)
mae_test_rounded = round(mae_test, 3)

print("Mean Absolute Error (MAE) for the test set:", mae_test_rounded)


Mean Absolute Error (MAE) for the test set: 53.643


In [14]:
# Score the held-out rows with the fitted linear model
y_test_pred = model.predict(X_test_scaled)

# Test-set MSE, then its square root (RMSE) reported to 3 decimals
mse_test = mean_squared_error(y_test, y_test_pred)
rmse_test = np.sqrt(mse_test)
rmse_test_rounded = round(rmse_test, 3)

print("Root Mean Squared Error (RMSE) for the test set:", rmse_test_rounded)

Root Mean Squared Error (RMSE) for the test set: 93.64


In [15]:
# Ridge regression with scikit-learn's default hyper-parameters,
# fitted on the scaled training features
ridge_model = Ridge()
ridge_model.fit(X_train_scaled, y_train)

# Score the held-out rows
y_test_pred_ridge = ridge_model.predict(X_test_scaled)

# Test-set RMSE for the Ridge predictions, reported to 3 decimals
rmse_test_ridge = np.sqrt(mean_squared_error(y_test, y_test_pred_ridge))
rmse_test_ridge_rounded = round(rmse_test_ridge, 3)

print("RMSE for the test set using Ridge regression:", rmse_test_ridge_rounded)

RMSE for the test set using Ridge regression: 93.709


In [16]:
# Lasso is already imported in the notebook's first (imports) cell, so it is
# not re-imported here.

# Train Lasso regression model with default parameters
lasso_model = Lasso()

# Fit the model on the scaled training data
lasso_model.fit(X_train_scaled, y_train)

# Extract the learned feature weights (one coefficient per input feature)
feature_weights = lasso_model.coef_

# Count the features the L1 penalty did not shrink to exactly zero.
# np.count_nonzero does this in a single vectorised call instead of iterating
# the boolean mask element by element with the builtin sum().
non_zero_features = np.count_nonzero(feature_weights)

print("Number of features with non-zero feature weights:", non_zero_features)

Number of features with non-zero feature weights: 4


In [17]:
# Score the held-out rows with the fitted Lasso model
y_test_pred_lasso = lasso_model.predict(X_test_scaled)

# Test-set RMSE for the Lasso predictions, reported to 3 decimals
mse_test_lasso = mean_squared_error(y_test, y_test_pred_lasso)
rmse_test_lasso = np.sqrt(mse_test_lasso)
rmse_test_lasso_rounded = round(rmse_test_lasso, 3)

print("RMSE for the test set using Lasso regression:", rmse_test_lasso_rounded)

RMSE for the test set using Lasso regression: 99.424
