In [38]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from ydata_profiling import ProfileReport

In [2]:
# Loading the dataset
fiveG = pd.read_csv('5G_energy_consumption_dataset.csv')
fiveG.head()

Unnamed: 0,Time,BS,Energy,load,ESMODE,TXpower
0,20230101 010000,B_0,64.275037,0.487936,0.0,7.101719
1,20230101 020000,B_0,55.904335,0.344468,0.0,7.101719
2,20230101 030000,B_0,57.698057,0.193766,0.0,7.101719
3,20230101 040000,B_0,55.156951,0.222383,0.0,7.101719
4,20230101 050000,B_0,56.053812,0.175436,0.0,7.101719


In [4]:
fiveG.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92629 entries, 0 to 92628
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Time     92629 non-null  object 
 1   BS       92629 non-null  object 
 2   Energy   92629 non-null  float64
 3   load     92629 non-null  float64
 4   ESMODE   92629 non-null  float64
 5   TXpower  92629 non-null  float64
dtypes: float64(4), object(2)
memory usage: 4.2+ MB


In [5]:
# Generate profile report
profile = ProfileReport(fiveG, title="YData Profiling Report", explorative=True)
profile.to_widgets()  # Display the report in Jupyter Notebook/Lab

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

In [6]:
#checking for missing values
fiveG.isnull().sum()

Time       0
BS         0
Energy     0
load       0
ESMODE     0
TXpower    0
dtype: int64

In [7]:
#checking for duplicates
fiveG.duplicated().sum()

np.int64(0)

In [10]:
#handling outliers( using the IQR method)
numeric_columns = fiveG.select_dtypes(include=[np.number]).columns
for column in numeric_columns:
    Q1 = fiveG[column].quantile(0.25)
    Q3 = fiveG[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    fiveG = fiveG[(fiveG[column] >= lower_bound) & (fiveG[column] <= upper_bound)]

In [11]:
fiveG.head()

Unnamed: 0,Time,BS,Energy,load,ESMODE,TXpower
1,20230101 020000,B_0,55.904335,0.344468,0.0,7.101719
3,20230101 040000,B_0,55.156951,0.222383,0.0,7.101719
4,20230101 050000,B_0,56.053812,0.175436,0.0,7.101719
20,20230102 020000,B_0,55.754858,0.210952,0.0,7.101719
22,20230102 040000,B_0,55.455904,0.154449,0.0,7.101719


In [12]:
#encoding categorical features
le = LabelEncoder()
fiveG['BS'] = le.fit_transform(fiveG['BS'])

In [13]:
fiveG.head()

Unnamed: 0,Time,BS,Energy,load,ESMODE,TXpower
1,20230101 020000,0,55.904335,0.344468,0.0,7.101719
3,20230101 040000,0,55.156951,0.222383,0.0,7.101719
4,20230101 050000,0,56.053812,0.175436,0.0,7.101719
20,20230102 020000,0,55.754858,0.210952,0.0,7.101719
22,20230102 040000,0,55.455904,0.154449,0.0,7.101719


In [15]:
#selecting target and feature variables
target = 'Energy'
feature = ['BS', 'load', 'ESMODE', 'TXpower']

X = fiveG[feature]
Y = fiveG[target]

In [17]:
# splitting the dataset into training and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

## Linear Regression model

In [40]:
#scaling the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [41]:
#training the model
lr_model = LinearRegression()
lr_model.fit(X_train_scaled, Y_train)

In [42]:
#making predications
Y_pred = lr_model.predict(X_test_scaled)

In [43]:
#evaluating the model
mse = mean_squared_error(Y_test, Y_pred)
r2 = r2_score(Y_test, Y_pred)
print("\n Model Performance:")
print(f"Mean Squared error: {mse}")
print(f"R-squared Score: {r2}")


 Model Performance:
Mean Squared error: 65.8188895815244
R-squared Score: 0.4268791772966054


## Random Forest Regressor model

In [18]:
#Train a random forest regressor
rf_model = RandomForestRegressor(n_estimators = 100, random_state = 42)
rf_model.fit(X_train, Y_train)

In [19]:
#making predications on the test sets
Y_pred = rf_model.predict(X_test)

In [31]:
#evaluating the model
mse = mean_squared_error(Y_test, Y_pred)
r2 = r2_score( Y_test, Y_pred)
print("\n Model Performance:")
print(f"Mean Squared error: {mse}")
print(f"R-squared Score: {r2}")


 Model Performance:
Mean Squared error: 8.559341944052642
R-squared Score: 0.925469160480153


In [34]:
#feature importance
feature_importance = pd.DataFrame({'feature': feature, 'importance': rf_model.feature_importances_})
feature_importance = feature_importance.sort_values('importance', ascending=False)
print("\nFeature Importance:")
feature_importance


Feature Importance:


Unnamed: 0,feature,importance
3,TXpower,0.580232
1,load,0.253491
0,BS,0.166277
2,ESMODE,0.0


In [35]:
#visualizing feature importance
plt.figure(figsize=(10, 6))
sns.barplot(x='importance', y='feature', data=feature_importance)
plt.title('Feature Importance')
plt.tight_layout()
plt.savefig('feature_importance.png')
plt.close()

In [36]:
# scatter plot of predicted Vs actual values
plt.figure(figsize=(10, 6))
plt.scatter(Y_test, Y_pred, alpha=0.5)
plt.plot([Y_test.min(), Y_test.max()], [Y_test.min(), Y_test.max()], 'r--', lw=2)
plt.xlabel('Actual Energy Consumption')
plt.ylabel('Predicted Energy Consumption')
plt.title('Actual vs Predicted Energy Consumption')
plt.tight_layout()
plt.savefig('actual_vs_predicted.png')
plt.close()

## To Improve perfomance:
- Do some feature engineering by creating new features or transform existing ones.
- Perform hyperparameter tuning