In [78]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import warnings
# Silence every warning for the rest of the session. Note that this also hides
# deprecation notices emitted by the libraries used below.
warnings.filterwarnings('ignore')

In [79]:
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from skelm import ELMRegressor
from sklearn.linear_model import Ridge

In [80]:
# Load the CSV file from the working directory and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="Group_13_data_cleaned.csv")
df.head(n=5)

Unnamed: 0,AT,AP,AH,AFDP,GTEP,TIT,TAT,TEY,CDP,CO,NOX
0,4.5878,1018.7,83.675,3.5758,23.979,1086.2,549.83,134.67,11.898,0.32663,81.952
1,4.2932,1018.3,84.235,3.5709,23.951,1086.1,550.05,134.67,11.892,0.44784,82.377
2,3.9045,1018.4,84.858,3.5828,23.99,1086.5,550.19,135.1,12.042,0.45144,83.776
3,3.7436,1018.3,85.434,3.5808,23.911,1086.5,550.17,135.03,11.99,0.23107,82.505
4,3.7516,1017.8,85.182,3.5781,23.917,1085.9,550.0,134.67,11.91,0.26747,82.028


In [81]:
# Build the modelling frame for the NOX target.
# The other target variable (CO) is dropped so that NOX is the only target
# column left; `drop` returns a new frame, so `df` itself is not modified.
nox_df = df.drop(columns="CO")

In [82]:
# split the data for training and test using sklearn train_test_split function 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer

# Separate predictors from the target: every column except the last one is a
# feature, and the NOX column is the value to predict.
X = nox_df.iloc[:, : nox_df.shape[1] - 1]
y = nox_df.loc[:, "NOX"]

In [83]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler 
from sklearn.metrics import r2_score

In [84]:
from sklearn.model_selection import train_test_split
from sklearn import metrics

# Hold out 20% of the rows for testing. The rows are shuffled before splitting
# and the fixed seed keeps the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)
# Show the shapes of the four resulting pieces.
(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

((29386, 9), (7347, 9), (29386,), (7347,))

In [85]:
# Standardise the features. The scaler learns its statistics from the training
# split only and then applies the same transformation to the test split.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

# Single models

# Linear Regression best model

In [86]:
from sklearn.linear_model import LinearRegression
# Fit an ordinary least-squares model on the scaled training features.
best_linear = LinearRegression(fit_intercept=True, copy_X=True).fit(X_train, y_train)

# Evaluate the fitted model on the held-out test split.
y_test_pred = best_linear.predict(X_test)
MAE = mean_absolute_error(y_true=y_test, y_pred=y_test_pred)
MSE = mean_squared_error(y_true=y_test, y_pred=y_test_pred)
RMSE = MSE**0.5  # root of the mean squared error

print(f"The mean absolute error is {MAE}")
print(f"The mean squared error is {MSE}")
print(f"The root mean squared error is {RMSE}")

The mean absolute error is 5.522325519055797
The mean squared error is 56.830857835226816
The root mean squared error is 7.538624399399854


# KNN best model

In [87]:
from sklearn.neighbors import KNeighborsRegressor
# k-nearest-neighbours regressor: 5 neighbours, Minkowski distance with p=1,
# and neighbours weighted by the inverse of their distance.
best_knn = KNeighborsRegressor(n_neighbors=5, p=1, weights='distance').fit(X_train, y_train)

# Evaluate the fitted model on the held-out test split.
y_test_pred = best_knn.predict(X_test)
MAE = mean_absolute_error(y_true=y_test, y_pred=y_test_pred)
MSE = mean_squared_error(y_true=y_test, y_pred=y_test_pred)
RMSE = MSE**0.5  # root of the mean squared error

print(f"The mean absolute error is {MAE}")
print(f"The mean squared error is {MSE}")
print(f"The root mean squared error is {RMSE}")

The mean absolute error is 2.546409040097436
The mean squared error is 16.640377812578652
The root mean squared error is 4.079261920075573


# Decision Tree best model

In [88]:
from sklearn.tree import DecisionTreeRegressor
# Decision tree regressor with the chosen hyper-parameters.
# max_features=None makes every split consider all features. It replaces the
# string 'auto', which meant the same thing for regression trees but was
# deprecated in scikit-learn 1.1 and removed in 1.3 (the global warning filter
# at the top of the notebook hides that deprecation notice).
# random_state pins the estimator's internal randomness so that repeated runs
# build the same tree and report the same metrics.
best_dtr = DecisionTreeRegressor(splitter='best', min_samples_split=4, min_samples_leaf=5,
                                 max_features=None, max_depth=12, random_state=42)
best_dtr.fit(X_train, y_train)

# Evaluate the fitted tree on the held-out test split.
y_test_pred = best_dtr.predict(X_test)
MSE = mean_squared_error(y_test, y_test_pred)
MAE = mean_absolute_error(y_test, y_test_pred)
RMSE = MSE**0.5  # root of the mean squared error

print(f"The mean absolute error is {MAE}")
print(f"The mean squared error is {MSE}")
print(f"The root mean squared error is {RMSE}")

The mean absolute error is 3.607803206908116
The mean squared error is 29.519521733309197
The root mean squared error is 5.433187069603733


# Random Forest best model

In [89]:
from sklearn.ensemble import RandomForestRegressor
# Random forest regressor with the chosen hyper-parameters.
# random_state fixes the bootstrap sampling and the per-split feature
# sub-sampling; without it the metrics below change on every run.
best_rf = RandomForestRegressor(n_estimators=300, max_features='log2', max_depth=22,
                                bootstrap=True, random_state=42)
best_rf.fit(X_train, y_train)

# Evaluate the fitted forest on the held-out test split.
y_test_pred = best_rf.predict(X_test)
MSE = mean_squared_error(y_test, y_test_pred)
MAE = mean_absolute_error(y_test, y_test_pred)
RMSE = MSE**0.5  # root of the mean squared error

print(f"The mean absolute error is {MAE}")
print(f"The mean squared error is {MSE}")
print(f"The root mean squared error is {RMSE}")

The mean absolute error is 2.6791398520915353
The mean squared error is 16.544321350102013
The root mean squared error is 4.067471124679561


# SVM with linear kernel best model

In [90]:
from sklearn.svm import SVR
# Support vector regression with a linear kernel; all other settings are the
# library defaults.
best_svm_linear = SVR(kernel="linear").fit(X_train, y_train)

# Evaluate the fitted model on the held-out test split.
y_test_pred = best_svm_linear.predict(X_test)
MAE = mean_absolute_error(y_true=y_test, y_pred=y_test_pred)
MSE = mean_squared_error(y_true=y_test, y_pred=y_test_pred)
RMSE = MSE**0.5  # root of the mean squared error

print(f"The mean absolute error is {MAE}")
print(f"The mean squared error is {MSE}")
print(f"The root mean squared error is {RMSE}")

The mean absolute error is 5.354599876306204
The mean squared error is 61.17643184018405
The root mean squared error is 7.821536411740602


# SVM with non-linear kernel best model

In [91]:
# Support vector regression with an RBF kernel; all other settings are the
# library defaults.
best_svm_non_linear = SVR(kernel="rbf").fit(X_train, y_train)

# Evaluate the fitted model on the held-out test split.
y_test_pred = best_svm_non_linear.predict(X_test)
MAE = mean_absolute_error(y_true=y_test, y_pred=y_test_pred)
MSE = mean_squared_error(y_true=y_test, y_pred=y_test_pred)
RMSE = MSE**0.5  # root of the mean squared error

print(f"The mean absolute error is {MAE}")
print(f"The mean squared error is {MSE}")
print(f"The root mean squared error is {RMSE}")

The mean absolute error is 3.850598825007777
The mean squared error is 32.434758761858646
The root mean squared error is 5.695152215863826


# Ensembling the models and implementing them with a Voting Regressor

In [94]:
# Candidate base learners for the voting ensemble. Each model family appears
# twice: once with library defaults (model_1..model_6) and once with the
# hyper-parameters used for the single models above (model_7..model_12).
# Tree-based models are seeded so that repeated runs give the same ensemble.
model_1 = LinearRegression()

model_2 = KNeighborsRegressor()

model_3 = DecisionTreeRegressor(random_state=42)

model_4 = RandomForestRegressor(random_state=42)

model_5 = SVR(kernel="linear")

model_6 = SVR(kernel="rbf")

model_7 = LinearRegression(copy_X=True, fit_intercept=True)

model_8 = KNeighborsRegressor(weights='distance', p=1, n_neighbors=5)

# max_features=None (all features per split) replaces 'auto', which had the
# same meaning for regression trees but was removed in scikit-learn 1.3.
model_9 = DecisionTreeRegressor(splitter='best', min_samples_split=4, min_samples_leaf=5,
                                max_features=None, max_depth=12, random_state=42)

model_10 = RandomForestRegressor(n_estimators=300, max_features='log2', max_depth=22,
                                 bootstrap=True, random_state=42)

model_11 = SVR(kernel="linear")

model_12 = SVR(kernel="rbf")

# model_14 and model_15 are defined here but are not part of the estimator
# list passed to VotingRegressor in the next cell.
model_14 = ELMRegressor()

# Unfitted ridge estimator, like its siblings. The previous expression called
# .predict on `ridge_regressor`, a name not defined anywhere earlier in the
# notebook, so it raised NameError on a fresh kernel and would have stored
# predictions rather than a model.
model_15 = Ridge()



In [95]:
# Voting ensemble: the prediction is the average of the twelve base learners
# listed below (default and tuned variants of each model family).
from sklearn.ensemble import VotingRegressor

en_reg = VotingRegressor(
    estimators=[
        ('lr', model_1),
        ('knn', model_2),
        ('dr', model_3),
        ('rf', model_4),
        ('svr', model_5),
        ('svnr', model_6),
        ('lr_1', model_7),
        ('knn_1', model_8),
        ('dr_1', model_9),
        ('rf_1', model_10),
        ('svr_1', model_11),
        ('svr_2', model_12),
    ]
).fit(X_train, y_train)



In [96]:
# R^2 of the ensemble on the held-out test split.
en_reg.score(X=X_test, y=y_test)

0.823329720283054

In [97]:
# Evaluate the ensemble on the held-out test split.
# The metric functions take (y_true, y_pred); the arguments are passed in that
# order here, matching the single-model cells. MSE and MAE are symmetric in
# their two arguments, so the reported values are unchanged.
y_test_pred = en_reg.predict(X_test)
MSE = mean_squared_error(y_test, y_test_pred)
MAE = mean_absolute_error(y_test, y_test_pred)
RMSE = MSE**0.5  # root of the mean squared error

print(f"The mean absolute error is {MAE}")
print(f"The mean squared error is {MSE}")
print(f"The root mean squared error is {RMSE}")

The mean absolute error is 3.3106380278455125
The mean squared error is 23.411606662833304
The root mean squared error is 4.838554191370942


In [98]:
from tabulate import tabulate

def metrics(name,x_train,y_test,pred):
    """Build one row of the model-comparison table.

    Note: defining this function rebinds the name ``metrics``, which earlier
    cells bound to the ``sklearn.metrics`` module.

    Parameters
    ----------
    name : label placed in the first position of the row.
    x_train : accepted but not used by the computation.
    y_test : true target values.
    pred : predicted target values to compare against ``y_test``.

    Returns
    -------
    list
        ``[name, MSE, RMSE, MAE, R2]`` in that order.
    """
    squared_error = mean_squared_error(y_test, pred)
    absolute_error = mean_absolute_error(y_test, pred)
    return [
        name,
        squared_error,
        np.sqrt(squared_error),  # RMSE
        absolute_error,
        r2_score(y_test, pred),
    ]


# Column titles followed by one row per fitted model, all scored on the same
# test split.
header = ["Model", "MSE","RMSE","MAE","R2_Score"]
data = [
    metrics("Linear Regression", X_train, y_test, best_linear.predict(X_test)),
    metrics("KNN", X_train, y_test, best_knn.predict(X_test)),
    metrics("Decision Tree", X_train, y_test, best_dtr.predict(X_test)),
    metrics("Random Forest", X_train, y_test, best_rf.predict(X_test)),
    metrics("SVM Linear", X_train, y_test, best_svm_linear.predict(X_test)),
    metrics("SVM Non Linear", X_train, y_test, best_svm_non_linear.predict(X_test)),
    metrics("Ensemble model", X_train, y_test, en_reg.predict(X_test)),
]

print(tabulate(data, headers=header, tablefmt="psql"))

+-------------------+---------+---------+---------+------------+
| Model             |     MSE |    RMSE |     MAE |   R2_Score |
|-------------------+---------+---------+---------+------------|
| Linear Regression | 56.8309 | 7.53862 | 5.52233 |   0.571139 |
| KNN               | 16.6404 | 4.07926 | 2.54641 |   0.874427 |
| Decision Tree     | 29.5195 | 5.43319 | 3.6078  |   0.777238 |
| Random Forest     | 16.5443 | 4.06747 | 2.67914 |   0.875152 |
| SVM Linear        | 61.1764 | 7.82154 | 5.3546  |   0.538346 |
| SVM Non Linear    | 32.4348 | 5.69515 | 3.8506  |   0.755239 |
| Ensemble model    | 23.4116 | 4.83855 | 3.31064 |   0.82333  |
+-------------------+---------+---------+---------+------------+


##  For CO as Target Variable

## Linear Regression

In [99]:
# Build the modelling frame for the CO target.
# The other target variable (NOX) is dropped; `drop` returns a new frame, so
# `df` itself is not modified.
co_df = df.drop(columns="NOX")

In [100]:
# Display the frame built for the CO target.
co_df

Unnamed: 0,AT,AP,AH,AFDP,GTEP,TIT,TAT,TEY,CDP,CO
0,4.5878,1018.7,83.675,3.5758,23.979,1086.2,549.83,134.67,11.898,0.32663
1,4.2932,1018.3,84.235,3.5709,23.951,1086.1,550.05,134.67,11.892,0.44784
2,3.9045,1018.4,84.858,3.5828,23.990,1086.5,550.19,135.10,12.042,0.45144
3,3.7436,1018.3,85.434,3.5808,23.911,1086.5,550.17,135.03,11.990,0.23107
4,3.7516,1017.8,85.182,3.5781,23.917,1085.9,550.00,134.67,11.910,0.26747
...,...,...,...,...,...,...,...,...,...,...
36728,3.6268,1028.5,93.200,3.1661,19.087,1037.0,541.59,109.08,10.411,10.99300
36729,4.1674,1028.6,94.036,3.1923,19.016,1037.6,542.28,108.79,10.344,11.14400
36730,5.4820,1028.5,95.219,3.3128,18.857,1038.0,543.48,107.81,10.462,11.41400
36731,5.8837,1028.7,94.200,3.9831,23.563,1076.9,550.11,131.41,11.771,3.31340


In [101]:
# split the data for training and test using sklearn train_test_split function 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer

# Separate predictors from the target: every column except the last one is a
# feature, and the CO column is the value to predict.
X = co_df.iloc[:, : co_df.shape[1] - 1]
y = co_df.loc[:, "CO"]

In [102]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler 
from sklearn.metrics import r2_score

In [103]:
from sklearn.model_selection import train_test_split
from sklearn import metrics

# Hold out 20% of the rows for testing. The rows are shuffled before splitting
# and the fixed seed keeps the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)
# Show the shapes of the four resulting pieces.
(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

((29386, 9), (7347, 9), (29386,), (7347,))

In [104]:
# Standardise the features. The scaler learns its statistics from the training
# split only and then applies the same transformation to the test split.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [105]:
from sklearn.linear_model import LinearRegression
# Fit an ordinary least-squares model on the scaled training features.
best_linear = LinearRegression(fit_intercept=True, copy_X=True).fit(X_train, y_train)

# Evaluate the fitted model on the held-out test split.
y_test_pred = best_linear.predict(X_test)
MAE = mean_absolute_error(y_true=y_test, y_pred=y_test_pred)
MSE = mean_squared_error(y_true=y_test, y_pred=y_test_pred)
RMSE = MSE**0.5  # root of the mean squared error

print(f"The mean absolute error is {MAE}")
print(f"The mean squared error is {MSE}")
print(f"The root mean squared error is {RMSE}")

The mean absolute error is 0.8350803011955488
The mean squared error is 2.3801519120147665
The root mean squared error is 1.542774096235339


## KNN MODEL

In [106]:
from sklearn.neighbors import KNeighborsRegressor
# k-nearest-neighbours regressor: 5 neighbours, Minkowski distance with p=1,
# and neighbours weighted by the inverse of their distance.
best_knn = KNeighborsRegressor(n_neighbors=5, p=1, weights='distance').fit(X_train, y_train)

# Evaluate the fitted model on the held-out test split.
y_test_pred = best_knn.predict(X_test)
MAE = mean_absolute_error(y_true=y_test, y_pred=y_test_pred)
MSE = mean_squared_error(y_true=y_test, y_pred=y_test_pred)
RMSE = MSE**0.5  # root of the mean squared error

print(f"The mean absolute error is {MAE}")
print(f"The mean squared error is {MSE}")
print(f"The root mean squared error is {RMSE}")

The mean absolute error is 0.4842703921758483
The mean squared error is 1.379684125442311
The root mean squared error is 1.174599559612684


## DECISION TREE

In [107]:
from sklearn.tree import DecisionTreeRegressor
# Decision tree regressor with the chosen hyper-parameters.
# max_features=None makes every split consider all features. It replaces the
# string 'auto', which meant the same thing for regression trees but was
# deprecated in scikit-learn 1.1 and removed in 1.3 (the global warning filter
# at the top of the notebook hides that deprecation notice).
# random_state pins the estimator's internal randomness so that repeated runs
# build the same tree and report the same metrics.
best_dtr = DecisionTreeRegressor(splitter='best', min_samples_split=4, min_samples_leaf=5,
                                 max_features=None, max_depth=12, random_state=42)
best_dtr.fit(X_train, y_train)

# Evaluate the fitted tree on the held-out test split.
y_test_pred = best_dtr.predict(X_test)
MSE = mean_squared_error(y_test, y_test_pred)
MAE = mean_absolute_error(y_test, y_test_pred)
RMSE = MSE**0.5  # root of the mean squared error

print(f"The mean absolute error is {MAE}")
print(f"The mean squared error is {MSE}")
print(f"The root mean squared error is {RMSE}")

The mean absolute error is 0.60070382083274
The mean squared error is 1.7575843116915864
The root mean squared error is 1.325739156731665


## RANDOM FOREST

In [108]:
from sklearn.ensemble import RandomForestRegressor
# Random forest regressor with the chosen hyper-parameters.
# random_state fixes the bootstrap sampling and the per-split feature
# sub-sampling; without it the metrics below change on every run.
best_rf = RandomForestRegressor(n_estimators=300, max_features='log2', max_depth=22,
                                bootstrap=True, random_state=42)
best_rf.fit(X_train, y_train)

# Evaluate the fitted forest on the held-out test split.
y_test_pred = best_rf.predict(X_test)
MSE = mean_squared_error(y_test, y_test_pred)
MAE = mean_absolute_error(y_test, y_test_pred)
RMSE = MSE**0.5  # root of the mean squared error

print(f"The mean absolute error is {MAE}")
print(f"The mean squared error is {MSE}")
print(f"The root mean squared error is {RMSE}")

The mean absolute error is 0.4920644178865737
The mean squared error is 1.2189169507408322
The root mean squared error is 1.1040457194975362


## SVM-LINEAR

In [109]:
from sklearn.svm import SVR
# Support vector regression with a linear kernel; all other settings are the
# library defaults.
best_svm_linear = SVR(kernel="linear").fit(X_train, y_train)

# Evaluate the fitted model on the held-out test split.
y_test_pred = best_svm_linear.predict(X_test)
MAE = mean_absolute_error(y_true=y_test, y_pred=y_test_pred)
MSE = mean_squared_error(y_true=y_test, y_pred=y_test_pred)
RMSE = MSE**0.5  # root of the mean squared error

print(f"The mean squared error is {MSE}")
print(f"The mean absolute error is {MAE}")
print(f"The root mean squared error is {RMSE}")

The mean squared error is 2.5737842956357158
The mean absolute error is 0.7939728648919845
The root mean squared error is 1.6043018093973826


## SVM- NON LINEAR

In [110]:
# Support vector regression with an RBF kernel; all other settings are the
# library defaults.
best_svm_non_linear = SVR(kernel="rbf").fit(X_train, y_train)

# Evaluate the fitted model on the held-out test split.
y_test_pred = best_svm_non_linear.predict(X_test)
MAE = mean_absolute_error(y_true=y_test, y_pred=y_test_pred)
MSE = mean_squared_error(y_true=y_test, y_pred=y_test_pred)
RMSE = MSE**0.5  # root of the mean squared error

print(f"The mean squared error is {MSE}")
print(f"The mean absolute error is {MAE}")
print(f"The root mean squared error is {RMSE}")

The mean squared error is 1.8085743911049819
The mean absolute error is 0.579575622468902
The root mean squared error is 1.3448324769669202


# Ensembling the models and implementing them with a Voting Regressor

In [111]:
# Base learners for the voting ensemble. Each model family appears twice:
# once with library defaults (model_1..model_6) and once with explicitly
# chosen hyper-parameters (model_7..model_12).
# Tree-based models are seeded so that repeated runs give the same ensemble.
model_1 = LinearRegression()

model_2 = KNeighborsRegressor()

model_3 = DecisionTreeRegressor(random_state=42)

model_4 = RandomForestRegressor(random_state=42)

model_5 = SVR(kernel="linear")

model_6 = SVR(kernel="rbf")

model_7 = LinearRegression(copy_X=True, fit_intercept=True)

model_8 = KNeighborsRegressor(weights='distance', p=1, n_neighbors=5)

# max_features=None (all features per split) replaces 'auto', which had the
# same meaning for regression trees but was removed in scikit-learn 1.3.
model_9 = DecisionTreeRegressor(splitter='best', min_samples_split=4, min_samples_leaf=5,
                                max_features=None, max_depth=12, random_state=42)

model_10 = RandomForestRegressor(n_estimators=300, max_features='log2', max_depth=22,
                                 bootstrap=True, random_state=42)

model_11 = SVR(kernel="linear")

# NOTE(review): `degree` only affects the polynomial kernel, so it has no
# effect with kernel="rbf"; kept to leave the estimator's parameters unchanged.
model_12 = SVR(kernel="rbf", C=10, degree=1)

In [112]:
# Voting ensemble: the prediction is the average of the twelve base learners
# listed below.
from sklearn.ensemble import VotingRegressor

en_reg = VotingRegressor(
    estimators=[
        ('lr', model_1),
        ('knn', model_2),
        ('dr', model_3),
        ('rf', model_4),
        ('svr', model_5),
        ('svnr', model_6),
        ('lr_1', model_7),
        ('knn_1', model_8),
        ('dr_1', model_9),
        ('rf_1', model_10),
        ('svr_1', model_11),
        ('svr_2', model_12),
    ]
).fit(X_train, y_train)



In [113]:
# R^2 of the ensemble on the held-out test split.
en_reg.score(X=X_test, y=y_test)

0.7326613340314887

In [114]:
# Evaluate the ensemble on the held-out test split.
# The metric functions take (y_true, y_pred); the arguments are passed in that
# order here, matching the single-model cells. MSE and MAE are symmetric in
# their two arguments, so the reported values are unchanged.
y_test_pred = en_reg.predict(X_test)
MSE = mean_squared_error(y_test, y_test_pred)
MAE = mean_absolute_error(y_test, y_test_pred)
RMSE = MSE**0.5  # root of the mean squared error

print(f"The mean absolute error is {MAE}")
print(f"The mean squared error is {MSE}")
print(f"The root mean squared error is {RMSE}")

The mean absolute error is 0.5507993057797654
The mean squared error is 1.4536905936940459
The root mean squared error is 1.2056909196365566


In [115]:
from tabulate import tabulate

def metrics(name,x_train,y_test,pred):
    """Build one row of the model-comparison table.

    Note: defining this function rebinds the name ``metrics``, which an
    earlier cell bound to the ``sklearn.metrics`` module.

    Parameters
    ----------
    name : label placed in the first position of the row.
    x_train : accepted but not used by the computation.
    y_test : true target values.
    pred : predicted target values to compare against ``y_test``.

    Returns
    -------
    list
        ``[name, MSE, RMSE, MAE, R2]`` in that order.
    """
    squared_error = mean_squared_error(y_test, pred)
    absolute_error = mean_absolute_error(y_test, pred)
    return [
        name,
        squared_error,
        np.sqrt(squared_error),  # RMSE
        absolute_error,
        r2_score(y_test, pred),
    ]


# Column titles followed by one row per fitted model, all scored on the same
# test split.
header = ["Model", "MSE","RMSE","MAE","R2_Score"]
data = [
    metrics("Linear Regression", X_train, y_test, best_linear.predict(X_test)),
    metrics("KNN", X_train, y_test, best_knn.predict(X_test)),
    metrics("Decision Tree", X_train, y_test, best_dtr.predict(X_test)),
    metrics("Random Forest", X_train, y_test, best_rf.predict(X_test)),
    metrics("SVM Linear", X_train, y_test, best_svm_linear.predict(X_test)),
    metrics("SVM Non Linear", X_train, y_test, best_svm_non_linear.predict(X_test)),
    metrics("Ensemble model", X_train, y_test, en_reg.predict(X_test)),
]

print(tabulate(data, headers=header, tablefmt="psql"))

+-------------------+---------+---------+----------+------------+
| Model             |     MSE |    RMSE |      MAE |   R2_Score |
|-------------------+---------+---------+----------+------------|
| Linear Regression | 2.38015 | 1.54277 | 0.83508  |   0.562282 |
| KNN               | 1.37968 | 1.1746  | 0.48427  |   0.746271 |
| Decision Tree     | 1.75758 | 1.32574 | 0.600704 |   0.676774 |
| Random Forest     | 1.21892 | 1.10405 | 0.492064 |   0.775837 |
| SVM Linear        | 2.57378 | 1.6043  | 0.793973 |   0.526672 |
| SVM Non Linear    | 1.80857 | 1.34483 | 0.579576 |   0.667397 |
| Ensemble model    | 1.45369 | 1.20569 | 0.550799 |   0.732661 |
+-------------------+---------+---------+----------+------------+
