In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score, RepeatedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.feature_selection import SelectFromModel

## Data Preprocessing and Cleaning

#### Loading the Data Set

In [2]:
# Read the raw WSN trace; the path is resolved relative to the notebook's working directory.
dataset_path = "WSNBFSFdataset V2 (1).csv"
df = pd.read_csv(filepath_or_buffer=dataset_path)

In [3]:
# Preview the first rows to sanity-check column names and value formats.
df.head()

Unnamed: 0,Event,Time,S_Node,Node_id,Rest_Energy,Trace_Level,Mac_Type_Pckt,Source_IP_Port,Des_IP_Port,Packet_Size,...,Broadcast_ID,Dest_Node_Num,Dest_Seq_Num,Src_Node_ID,Src_Seq_Num,behaviour,Type,rate_of_energy_consumption,cumulative_energy_consumption,energy_consumption
0,1,0.1,79,79,600.0,5,0,79.255,1.255,48,...,1,100,0,79,4,No attack,normal,0.0,0.0,0.0
1,2,0.100963,78,78,599.979723,5,800,79.255,1.255,48,...,1,100,0,79,4,No attack,normal,0.0,0.0,0.0
2,2,0.100963,76,76,599.979722,5,800,79.255,1.255,48,...,1,100,0,79,4,No attack,normal,0.0,0.0,0.0
3,2,0.100964,75,75,599.979722,5,800,79.255,1.255,48,...,1,100,0,79,4,No attack,normal,0.0,0.0,0.0
4,2,0.100964,118,118,599.979722,5,800,79.255,1.255,48,...,1,100,0,79,4,No attack,normal,0.0,0.0,0.0


In [4]:
# (rows, columns) of the raw frame as loaded.
df.shape

(312106, 22)

#### Checking for missing values

In [None]:
# Number of null entries per column.
df.isnull().sum()

#### Dropping irrelevant and constant columns

In [None]:
# Plain Python list of column names, consumed by the value-count inspection below.
columns = list(df.columns)

In [None]:
# Print the value distribution of every column, separated by a blank line,
# to spot constant or near-constant columns.
for column in columns:
    frequency_table = df[column].value_counts()
    print(frequency_table)
    print()

In [None]:
# Remove the columns judged irrelevant/constant from the value-count inspection.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell re-runnable: a second execution no longer raises KeyError because the
# columns are already gone.
df = df.drop(columns=['Trace_Level', 'Mac_Type_Pckt', 'Des_IP_Port'], errors='ignore')

In [None]:
# Display the frame after the column drop.
df

#### Data Transformation

##### Encoding non-numeric features

In [None]:
# One-hot encode the nominal features ('behaviour', 'Type', 'Event').
# drop_first=True omits the first level of each feature, so k levels become k-1 indicator columns.
encoded_data = pd.get_dummies(df, columns=['behaviour', 'Type','Event'], drop_first=True)

In [None]:
# Display the one-hot encoded frame.
encoded_data

##### Data Standardization

In [None]:
# Standardize 'Time' with StandardScaler and store it as 'Time_standardized',
# replacing the raw 'Time' column.
# NOTE(review): the scaler is fitted on the whole dataset, i.e. before any
# train/test split is made — confirm this is acceptable for the evaluation.
scaler = StandardScaler()

# StandardScaler expects a 2-D array, hence the single-column (n, 1) selection.
time_column = encoded_data[['Time']].to_numpy()
time_standardized = scaler.fit_transform(time_column)

data_standardized = pd.DataFrame(encoded_data.drop(columns=['Time']))
data_standardized['Time_standardized'] = time_standardized

In [None]:
# Display the frame with the standardized time column.
data_standardized

##### Data Normalization

In [None]:
# Min-max scale the numeric columns listed in `cols` and stitch them back next
# to the remaining (encoded / standardized) columns.
# NOTE(review): 'TTL' is in `cols` and is later used as the regression target,
# so the target is scaled too and every error metric is in scaled units.
# NOTE(review): the scaler is fitted on all rows, before the train/test split.
cols = ['S_Node', 'Node_id', 'Rest_Energy', 'Source_IP_Port', 'Packet_Size', 'TTL', 'Hop_Count',
        'Broadcast_ID', 'Dest_Node_Num', 'Src_Node_ID', 'Src_Seq_Num', 'rate_of_energy_consumption',
        'cumulative_energy_consumption', 'energy_consumption']

selected_columns = data_standardized[cols]

scaler = MinMaxScaler()

normalized_data = scaler.fit_transform(selected_columns)

# Reuse the source index: pd.concat(axis=1) aligns on index labels, so a fresh
# RangeIndex would misalign rows (and introduce NaNs) whenever the input frame
# is not indexed 0..n-1, e.g. after filtering rows upstream.
normalized_df = pd.DataFrame(normalized_data, columns=cols, index=data_standardized.index)

data_normalized = pd.concat([data_standardized.drop(columns=cols), normalized_df], axis=1)

In [None]:
# Display the fully preprocessed frame.
data_normalized

In [None]:
# From here on `df` refers to the encoded and scaled frame, no longer to the raw CSV contents.
df = data_normalized

## Feature Selection

#### SelectKBest for Linear Regression

In [None]:
# 'TTL' is the regression target; every other column is a candidate feature.
target_column = 'TTL'
y = df[target_column]
X = df.drop(columns=[target_column])

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
# Sweep k for SelectKBest(f_regression): for each k, keep the k best-scoring
# features (scored on the training split only), fit a LinearRegression on them
# and evaluate on the held-out split. Finally report which k wins per metric.
r2_values = []
mse_values = []
rmse_values = []
mae_values = []
selected_features = []

# The sweep was written for k = 1..21; cap it at the real column count so k
# never exceeds the number of available features.
max_k = min(21, X_train.shape[1])
for i in range(1, max_k + 1):
    select_kbest = SelectKBest(score_func=f_regression, k=i)
    select_kbest.fit(X_train, y_train)

    X_train_new = select_kbest.transform(X_train)
    X_test_new = select_kbest.transform(X_test)

    model = LinearRegression()
    model.fit(X_train_new, y_train)

    y_pred = model.predict(X_test_new)

    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    # RMSE is derived from MSE: the `squared=False` keyword of
    # mean_squared_error is deprecated and removed in newer scikit-learn.
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)

    r2_values.append(r2)
    mse_values.append(mse)
    rmse_values.append(rmse)
    mae_values.append(mae)
    selected_features.append(X.columns[select_kbest.get_support()])

# List position p corresponds to k = p + 1, hence the "+ 1" in the report.
print(f"Iteration {r2_values.index(max(r2_values)) + 1} has the highest R2 of {max(r2_values)}")
print(f"Iteration {mse_values.index(min(mse_values)) + 1} has the lowest MSE of {min(mse_values)}")
print(f"Iteration {rmse_values.index(min(rmse_values)) + 1} has the lowest RMSE of {min(rmse_values)}")
print(f"Iteration {mae_values.index(min(mae_values)) + 1} has the lowest MAE of {min(mae_values)}")

#### SelectKBest for Decision Tree

In [None]:
# Sweep k for SelectKBest(f_regression): for each k, keep the k best-scoring
# features (scored on the training split only), fit a DecisionTreeRegressor on
# them and evaluate on the held-out split. Finally report which k wins per metric.
r2_values = []
mse_values = []
rmse_values = []
mae_values = []
selected_features = []

# The sweep was written for k = 1..21; cap it at the real column count so k
# never exceeds the number of available features.
max_k = min(21, X_train.shape[1])
for i in range(1, max_k + 1):
    select_kbest = SelectKBest(score_func=f_regression, k=i)
    select_kbest.fit(X_train, y_train)

    X_train_new = select_kbest.transform(X_train)
    X_test_new = select_kbest.transform(X_test)

    # Seeded so the reported "best k" is reproducible across kernel restarts.
    model = DecisionTreeRegressor(random_state=0)
    model.fit(X_train_new, y_train)

    y_pred = model.predict(X_test_new)

    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    # RMSE is derived from MSE: the `squared=False` keyword of
    # mean_squared_error is deprecated and removed in newer scikit-learn.
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)

    r2_values.append(r2)
    mse_values.append(mse)
    rmse_values.append(rmse)
    mae_values.append(mae)
    selected_features.append(X.columns[select_kbest.get_support()])

# List position p corresponds to k = p + 1, hence the "+ 1" in the report.
print(f"Iteration {r2_values.index(max(r2_values)) + 1} has the highest R2 of {max(r2_values)}")
print(f"Iteration {mse_values.index(min(mse_values)) + 1} has the lowest MSE of {min(mse_values)}")
print(f"Iteration {rmse_values.index(min(rmse_values)) + 1} has the lowest RMSE of {min(rmse_values)}")
print(f"Iteration {mae_values.index(min(mae_values)) + 1} has the lowest MAE of {min(mae_values)}")

#### SelectKBest for Random Forest

In [None]:
# Sweep k for SelectKBest(f_regression): for each k, keep the k best-scoring
# features (scored on the training split only), fit a RandomForestRegressor on
# them and evaluate on the held-out split. Finally report which k wins per metric.
r2_values = []
mse_values = []
rmse_values = []
mae_values = []
selected_features = []

# The sweep was written for k = 1..21; cap it at the real column count so k
# never exceeds the number of available features.
max_k = min(21, X_train.shape[1])
for i in range(1, max_k + 1):
    select_kbest = SelectKBest(score_func=f_regression, k=i)
    select_kbest.fit(X_train, y_train)

    X_train_new = select_kbest.transform(X_train)
    X_test_new = select_kbest.transform(X_test)

    # Seeded so the reported "best k" is reproducible across kernel restarts.
    model = RandomForestRegressor(random_state=0)
    model.fit(X_train_new, y_train)

    y_pred = model.predict(X_test_new)

    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    # RMSE is derived from MSE: the `squared=False` keyword of
    # mean_squared_error is deprecated and removed in newer scikit-learn.
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)

    r2_values.append(r2)
    mse_values.append(mse)
    rmse_values.append(rmse)
    mae_values.append(mae)
    selected_features.append(X.columns[select_kbest.get_support()])

# List position p corresponds to k = p + 1, hence the "+ 1" in the report.
print(f"Iteration {r2_values.index(max(r2_values)) + 1} has the highest R2 of {max(r2_values)}")
print(f"Iteration {mse_values.index(min(mse_values)) + 1} has the lowest MSE of {min(mse_values)}")
print(f"Iteration {rmse_values.index(min(rmse_values)) + 1} has the lowest RMSE of {min(rmse_values)}")
print(f"Iteration {mae_values.index(min(mae_values)) + 1} has the lowest MAE of {min(mae_values)}")

#### SelectFromModel for Linear Regression

In [None]:
# Model-based feature selection with a LinearRegression estimator.
# The estimator is handed over unfitted: SelectFromModel.fit() fits its own
# clone, so fitting `model` beforehand only doubled the training cost.
# No threshold is passed, so SelectFromModel applies its default cut-off.
# NOTE(review): selection is fitted on the full X, y — including rows that the
# later evaluation splits use for testing.
model = LinearRegression()
selector = SelectFromModel(model)
selector.fit(X, y)
selectfrom_lr_selected_features = X.columns[selector.get_support()]
print("Selected features:", selectfrom_lr_selected_features)

#### SelectFromModel for Decision Tree

In [None]:
# Model-based feature selection with a DecisionTreeRegressor estimator.
# The estimator is handed over unfitted: SelectFromModel.fit() fits its own
# clone, so fitting `model` beforehand only doubled the training cost.
# The tree is seeded so the selected feature set is reproducible.
# NOTE(review): selection is fitted on the full X, y — including rows that the
# later evaluation splits use for testing.
model = DecisionTreeRegressor(random_state=0)
selector = SelectFromModel(model)
selector.fit(X, y)
selectfrom_dt_selected_features = X.columns[selector.get_support()]
print("Selected features:", selectfrom_dt_selected_features)

#### SelectFromModel for Random Forest

In [None]:
# Model-based feature selection with a RandomForestRegressor estimator.
# The estimator is handed over unfitted: SelectFromModel.fit() fits its own
# clone, so fitting `model` beforehand trained the (expensive) forest twice.
# The forest is seeded so the selected feature set is reproducible.
# NOTE(review): selection is fitted on the full X, y — including rows that the
# later evaluation splits use for testing.
model = RandomForestRegressor(random_state=0)
selector = SelectFromModel(model)
selector.fit(X, y)
selectfrom_rf_selected_features = X.columns[selector.get_support()]
print("Selected features:", selectfrom_rf_selected_features)

## Predictive Modeling

#### Linear Regression

##### SelectKBest

In [None]:
# Full-feature 80/20 split with random_state=0, i.e. the same arguments as the split used for the feature-selection sweeps.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
# Keep the 21 features with the highest f_regression score, scored on the
# training split, and record their column names.
selector = SelectKBest(score_func=f_regression, k=21).fit(X_train, y_train)
support_mask = selector.get_support()
selected_features = X.columns[support_mask]

In [None]:
# Show which columns the SelectKBest selector kept.
print("The selected features for Linear Regression using SelectKBest are: ", selected_features)

In [None]:
# Evaluate LinearRegression on the SelectKBest features over 31 different
# 80/20 splits (random_state 0..30) and collect one metrics row per split.
# Note: the loop rebinds X_train/X_test/y_train/y_test to the reduced-feature split.
model = LinearRegression()
lr_result = []
for i in range(31):
    X_train, X_test, y_train, y_test = train_test_split(X[selected_features], y, test_size=0.2, random_state=i)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    MAE = mean_absolute_error(y_test, y_pred)
    MSE = mean_squared_error(y_test, y_pred)
    # RMSE is derived from MSE: the `squared=False` keyword of
    # mean_squared_error is deprecated and removed in newer scikit-learn.
    RMSE = np.sqrt(MSE)
    R2 = r2_score(y_test, y_pred)
    lr_result.append({'MAE': MAE, 'MSE': MSE, 'RMSE': RMSE, 'R2': R2})
lr_df = pd.DataFrame(lr_result)

In [None]:
# Mean of each metric over the repeated splits. The last two labels previously
# read "R2" and "MSE" although they print RMSE and MAE.
print("The R2 of the Linear Regression using SelectKBest: ",lr_df['R2'].mean())
print("The MSE of the Linear Regression using SelectKBest: ",lr_df['MSE'].mean())
print("The RMSE of the Linear Regression using SelectKBest: ",lr_df['RMSE'].mean())
print("The MAE of the Linear Regression using SelectKBest: ",lr_df['MAE'].mean())

##### SelectFromModel

In [None]:
# Evaluate LinearRegression on the SelectFromModel features over 31 different
# 80/20 splits (random_state 0..30) and collect one metrics row per split.
# Note: the loop rebinds X_train/X_test/y_train/y_test to the reduced-feature split.
model = LinearRegression()
selectfrom_lr_result = []
for i in range(31):
    X_train, X_test, y_train, y_test = train_test_split(X[selectfrom_lr_selected_features], y, test_size=0.2, random_state=i)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    metrics = {'MAE': mean_absolute_error(y_test, y_pred),
               'MSE': mse,
               # Derived from MSE: the `squared=False` keyword of
               # mean_squared_error is deprecated and removed in newer scikit-learn.
               'RMSE': np.sqrt(mse),
               'R2': r2_score(y_test, y_pred)}
    selectfrom_lr_result.append(metrics)
selectfrom_lr_df = pd.DataFrame(selectfrom_lr_result)

In [None]:
# Mean of each metric over the repeated splits. The last two labels previously
# read "R2" and "MSE" although they print RMSE and MAE.
print("The R2 of the Linear Regression using SelectFromModel: ",selectfrom_lr_df['R2'].mean())
print("The MSE of the Linear Regression using SelectFromModel: ",selectfrom_lr_df['MSE'].mean())
print("The RMSE of the Linear Regression using SelectFromModel: ",selectfrom_lr_df['RMSE'].mean())
print("The MAE of the Linear Regression using SelectFromModel: ",selectfrom_lr_df['MAE'].mean())

#### Decision Tree

##### SelectKBest

In [None]:
# Restore the full-feature split: the Linear Regression evaluation loops rebound X_train/X_test to reduced-feature frames.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
# Keep the 5 features with the highest f_regression score, scored on the
# training split, and record their column names.
selector = SelectKBest(score_func=f_regression, k=5).fit(X_train, y_train)
support_mask = selector.get_support()
selected_features = X.columns[support_mask]

In [None]:
# Show which columns the SelectKBest selector kept.
print("The selected features for Decision Tree using SelectKBest are: ", selected_features)

In [None]:
# Evaluate DecisionTreeRegressor on the SelectKBest features over 31 different
# 80/20 splits (random_state 0..30) and collect one metrics row per split.
# The tree is seeded so that the only source of variation is the split itself.
# Note: the loop rebinds X_train/X_test/y_train/y_test to the reduced-feature split.
model = DecisionTreeRegressor(random_state=0)
skb_dt_result = []
for i in range(31):
    X_train, X_test, y_train, y_test = train_test_split(X[selected_features], y, test_size=0.2, random_state=i)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    MAE = mean_absolute_error(y_test, y_pred)
    MSE = mean_squared_error(y_test, y_pred)
    # RMSE is derived from MSE: the `squared=False` keyword of
    # mean_squared_error is deprecated and removed in newer scikit-learn.
    RMSE = np.sqrt(MSE)
    R2 = r2_score(y_test, y_pred)
    skb_dt_result.append({'MAE': MAE, 'MSE': MSE, 'RMSE': RMSE, 'R2': R2})
skb_dt_df = pd.DataFrame(skb_dt_result)

In [None]:
# Mean of each metric over the repeated splits. The last two labels previously
# read "R2" and "MSE" although they print RMSE and MAE.
print("The R2 of the Decision Tree Regression using SelectKBest: ", skb_dt_df['R2'].mean())
print("The MSE of the Decision Tree Regression using SelectKBest: ", skb_dt_df['MSE'].mean())
print("The RMSE of the Decision Tree Regression using SelectKBest: ", skb_dt_df['RMSE'].mean())
print("The MAE of the Decision Tree Regression using SelectKBest: ", skb_dt_df['MAE'].mean())

##### SelectFromModel

In [None]:
# Evaluate DecisionTreeRegressor on the SelectFromModel features over 31
# different 80/20 splits (random_state 0..30), one metrics row per split.
# The tree is seeded so that the only source of variation is the split itself.
# Note: the loop rebinds X_train/X_test/y_train/y_test to the reduced-feature split.
model = DecisionTreeRegressor(random_state=0)
selectfrom_dt_result = []
for i in range(31):
    X_train, X_test, y_train, y_test = train_test_split(X[selectfrom_dt_selected_features], y, test_size=0.2, random_state=i)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    metrics = {'MAE': mean_absolute_error(y_test, y_pred),
               'MSE': mse,
               # Derived from MSE: the `squared=False` keyword of
               # mean_squared_error is deprecated and removed in newer scikit-learn.
               'RMSE': np.sqrt(mse),
               'R2': r2_score(y_test, y_pred)}
    selectfrom_dt_result.append(metrics)
selectfrom_dt_df = pd.DataFrame(selectfrom_dt_result)

In [None]:
# Mean of each metric over the repeated splits. The last two labels previously
# read "R2" and "MSE" although they print RMSE and MAE.
print("The R2 of the Decision Tree Regression using SelectFromModel: ", selectfrom_dt_df['R2'].mean())
print("The MSE of the Decision Tree Regression using SelectFromModel: ", selectfrom_dt_df['MSE'].mean())
print("The RMSE of the Decision Tree Regression using SelectFromModel: ", selectfrom_dt_df['RMSE'].mean())
print("The MAE of the Decision Tree Regression using SelectFromModel: ", selectfrom_dt_df['MAE'].mean())

#### Random Forest

##### SelectKBest

In [None]:
# Restore the full-feature split: the Decision Tree evaluation loops rebound X_train/X_test to reduced-feature frames.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
# Keep the 5 features with the highest f_regression score, scored on the
# training split, and record their column names.
selector = SelectKBest(score_func=f_regression, k=5).fit(X_train, y_train)
support_mask = selector.get_support()
selected_features = X.columns[support_mask]

In [None]:
# Show which columns the SelectKBest selector kept.
print("The selected features for Random Forest using SelectKBest are: ", selected_features)

In [None]:
# Evaluate RandomForestRegressor on the SelectKBest features over 31 different
# 80/20 splits (random_state 0..30) and collect one metrics row per split.
# The forest is seeded so that the only source of variation is the split itself.
# Note: the loop rebinds X_train/X_test/y_train/y_test to the reduced-feature split.
model = RandomForestRegressor(random_state=0)
skb_rf_result = []
for i in range(31):
    X_train, X_test, y_train, y_test = train_test_split(X[selected_features], y, test_size=0.2, random_state=i)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    MAE = mean_absolute_error(y_test, y_pred)
    MSE = mean_squared_error(y_test, y_pred)
    # RMSE is derived from MSE: the `squared=False` keyword of
    # mean_squared_error is deprecated and removed in newer scikit-learn.
    RMSE = np.sqrt(MSE)
    R2 = r2_score(y_test, y_pred)
    skb_rf_result.append({'MAE': MAE, 'MSE': MSE, 'RMSE': RMSE, 'R2': R2})
skb_rf_df = pd.DataFrame(skb_rf_result)

In [None]:
# Mean of each metric over the repeated splits. The last two labels previously
# read "R2" and "MSE" although they print RMSE and MAE.
print("The R2 of the Random Forest Regression using SelectKBest: ",skb_rf_df['R2'].mean())
print("The MSE of the Random Forest Regression using SelectKBest: ",skb_rf_df['MSE'].mean())
print("The RMSE of the Random Forest Regression using SelectKBest: ",skb_rf_df['RMSE'].mean())
print("The MAE of the Random Forest Regression using SelectKBest: ",skb_rf_df['MAE'].mean())

##### SelectFromModel

In [None]:
# Evaluate RandomForestRegressor on the SelectFromModel features over 31
# different 80/20 splits (random_state 0..30), one metrics row per split.
# The forest is seeded so that the only source of variation is the split itself.
# Note: the loop rebinds X_train/X_test/y_train/y_test to the reduced-feature split.
model = RandomForestRegressor(random_state=0)
selectfrom_rf_result = []
for i in range(31):
    X_train, X_test, y_train, y_test = train_test_split(X[selectfrom_rf_selected_features], y, test_size=0.2, random_state=i)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    metrics = {'MAE': mean_absolute_error(y_test, y_pred),
               'MSE': mse,
               # Derived from MSE: the `squared=False` keyword of
               # mean_squared_error is deprecated and removed in newer scikit-learn.
               'RMSE': np.sqrt(mse),
               'R2': r2_score(y_test, y_pred)}
    selectfrom_rf_result.append(metrics)
selectfrom_rf_df = pd.DataFrame(selectfrom_rf_result)

In [None]:
# Mean of each metric over the repeated splits. The last two labels previously
# read "R2" and "MSE" although they print RMSE and MAE.
print("The R2 of the Random Forest Regression using SelectFromModel: ",selectfrom_rf_df['R2'].mean())
print("The MSE of the Random Forest Regression using SelectFromModel: ",selectfrom_rf_df['MSE'].mean())
print("The RMSE of the Random Forest Regression using SelectFromModel: ",selectfrom_rf_df['RMSE'].mean())
print("The MAE of the Random Forest Regression using SelectFromModel: ",selectfrom_rf_df['MAE'].mean())

## Results Visualization

In [None]:
# Rebuild the per-split metric tables from the raw result lists so the
# plotting cells below start from freshly constructed frames.
skb_rf_df, skb_dt_df, lr_df = (
    pd.DataFrame(records) for records in (skb_rf_result, skb_dt_result, lr_result)
)

selectfrom_lr_df, selectfrom_dt_df, selectfrom_rf_df = (
    pd.DataFrame(records)
    for records in (selectfrom_lr_result, selectfrom_dt_result, selectfrom_rf_result)
)

In [None]:
# Spread of R2 across the repeated splits, one box per model (SelectKBest features).
dataframes = [lr_df, skb_dt_df, skb_rf_df]
labels = ['Linear Regression', 'Decision Tree Regression', 'Random Forest Regression']

r2_values = [frame['R2'] for frame in dataframes]

fig, ax = plt.subplots(figsize=(10, 6))
ax.boxplot(r2_values, labels=labels, patch_artist=True, boxprops={'facecolor': 'lightgray'})

ax.set_xlabel('Models')
ax.set_ylabel('R2')
ax.set_title('Boxplot of R2 by selectKBest')

plt.show()

In [None]:
# Spread of MSE across the repeated splits, one box per model (SelectKBest features).
dataframes = [lr_df, skb_dt_df, skb_rf_df]
labels = ['Linear Regression', 'Decision Tree Regression', 'Random Forest Regression']

# The name is kept from the R2 cell, but the list holds the MSE series here.
r2_values = [frame['MSE'] for frame in dataframes]

fig, ax = plt.subplots(figsize=(10, 6))
ax.boxplot(r2_values, labels=labels, patch_artist=True, boxprops={'facecolor': 'lightgray'})

ax.set_xlabel('Models')
ax.set_ylabel('MSE')
ax.set_title('Boxplot of MSE by selectKBest')

plt.show()

In [None]:
# Spread of R2 across the repeated splits, one box per model (SelectFromModel features).
dataframes = [selectfrom_lr_df, selectfrom_dt_df, selectfrom_rf_df]
labels = ['Linear Regression', 'Decision Tree Regression', 'Random Forest Regression']

r2_values = [frame['R2'] for frame in dataframes]

fig, ax = plt.subplots(figsize=(10, 6))
ax.boxplot(r2_values, labels=labels, patch_artist=True, boxprops={'facecolor': 'lightgray'})

ax.set_xlabel('Models')
ax.set_ylabel('R2')
ax.set_title('Boxplot of R2 by selectFromModel')

plt.show()

In [None]:
# Spread of MSE across the repeated splits, one box per model (SelectFromModel features).
dataframes = [selectfrom_lr_df, selectfrom_dt_df, selectfrom_rf_df]
labels = ['Linear Regression', 'Decision Tree Regression', 'Random Forest Regression']

# The name is kept from the R2 cell, but the list holds the MSE series here.
r2_values = [frame['MSE'] for frame in dataframes]

fig, ax = plt.subplots(figsize=(10, 6))
ax.boxplot(r2_values, labels=labels, patch_artist=True, boxprops={'facecolor': 'lightgray'})

ax.set_xlabel('Models')
ax.set_ylabel('MSE')
ax.set_title('Boxplot of MSE by selectFromModel')

plt.show()

In [None]:
# Grouped bar chart of the mean R2 per model: SelectKBest (left bar) against
# SelectFromModel (right bar).
models = ['Linear Regression', 'Decision Tree Regression', 'Random Forest Regression']
labels = ['R2']
colors = ['red', 'green']  # Red for SelectKBest, Green for SelectFromModel

# Result frames in the same order as `models`.
skb_frames = (lr_df, skb_dt_df, skb_rf_df)
sfm_frames = (selectfrom_lr_df, selectfrom_dt_df, selectfrom_rf_df)

x = np.arange(len(models))
width = 0.35
half_width = width / 2

fig, ax = plt.subplots(figsize=(10, 6))

for i, label in enumerate(labels):
    skb_value = [frame[label].mean() for frame in skb_frames]
    sfm_value = [frame[label].mean() for frame in sfm_frames]

    ax.bar(x - half_width, skb_value, width, label='SelectKBest', color=colors[0])
    ax.bar(x + half_width, sfm_value, width, label='SelectFromModel', color=colors[1])

ax.set_ylabel('R2')
ax.set_title('R2 by Model')
ax.set_xticks(x)
ax.set_xticklabels(models)
ax.legend()

plt.tight_layout()
plt.show()


In [None]:
# Grouped bar chart of the mean MSE per model: SelectKBest (left bar) against
# SelectFromModel (right bar).
models = ['Linear Regression', 'Decision Tree Regression', 'Random Forest Regression']
labels = ['MSE']
colors = ['red', 'green']  # Red for SelectKBest, Green for SelectFromModel

# Result frames in the same order as `models`.
skb_frames = (lr_df, skb_dt_df, skb_rf_df)
sfm_frames = (selectfrom_lr_df, selectfrom_dt_df, selectfrom_rf_df)

x = np.arange(len(models))
width = 0.35
half_width = width / 2

fig, ax = plt.subplots(figsize=(10, 6))

for i, label in enumerate(labels):
    skb_value = [frame[label].mean() for frame in skb_frames]
    sfm_value = [frame[label].mean() for frame in sfm_frames]

    ax.bar(x - half_width, skb_value, width, label='SelectKBest', color=colors[0])
    ax.bar(x + half_width, sfm_value, width, label='SelectFromModel', color=colors[1])

ax.set_ylabel('MSE')
ax.set_title('MSE by Model')
ax.set_xticks(x)
ax.set_xticklabels(models)
ax.legend()

plt.tight_layout()
plt.show()
