In [54]:
import numpy as np
import pandas as pd

In [55]:
df = pd.read_csv("Group_13_data_cleaned.csv")
df

Unnamed: 0,AT,AP,AH,AFDP,GTEP,TIT,TAT,TEY,CDP,CO,NOX
0,4.5878,1018.7,83.675,3.5758,23.979,1086.2,549.83,134.67,11.898,0.32663,81.952
1,4.2932,1018.3,84.235,3.5709,23.951,1086.1,550.05,134.67,11.892,0.44784,82.377
2,3.9045,1018.4,84.858,3.5828,23.990,1086.5,550.19,135.10,12.042,0.45144,83.776
3,3.7436,1018.3,85.434,3.5808,23.911,1086.5,550.17,135.03,11.990,0.23107,82.505
4,3.7516,1017.8,85.182,3.5781,23.917,1085.9,550.00,134.67,11.910,0.26747,82.028
...,...,...,...,...,...,...,...,...,...,...,...
36728,3.6268,1028.5,93.200,3.1661,19.087,1037.0,541.59,109.08,10.411,10.99300,89.172
36729,4.1674,1028.6,94.036,3.1923,19.016,1037.6,542.28,108.79,10.344,11.14400,88.849
36730,5.4820,1028.5,95.219,3.3128,18.857,1038.0,543.48,107.81,10.462,11.41400,96.147
36731,5.8837,1028.7,94.200,3.9831,23.563,1076.9,550.11,131.41,11.771,3.31340,64.738


In [56]:
# the data for the model
# selecting only NOX as target variable
nox_df = df.copy()
nox_df = nox_df.drop("CO", axis = 1) # drop the target variable CO

In [57]:
nox_df

Unnamed: 0,AT,AP,AH,AFDP,GTEP,TIT,TAT,TEY,CDP,NOX
0,4.5878,1018.7,83.675,3.5758,23.979,1086.2,549.83,134.67,11.898,81.952
1,4.2932,1018.3,84.235,3.5709,23.951,1086.1,550.05,134.67,11.892,82.377
2,3.9045,1018.4,84.858,3.5828,23.990,1086.5,550.19,135.10,12.042,83.776
3,3.7436,1018.3,85.434,3.5808,23.911,1086.5,550.17,135.03,11.990,82.505
4,3.7516,1017.8,85.182,3.5781,23.917,1085.9,550.00,134.67,11.910,82.028
...,...,...,...,...,...,...,...,...,...,...
36728,3.6268,1028.5,93.200,3.1661,19.087,1037.0,541.59,109.08,10.411,89.172
36729,4.1674,1028.6,94.036,3.1923,19.016,1037.6,542.28,108.79,10.344,88.849
36730,5.4820,1028.5,95.219,3.3128,18.857,1038.0,543.48,107.81,10.462,96.147
36731,5.8837,1028.7,94.200,3.9831,23.563,1076.9,550.11,131.41,11.771,64.738


In [58]:
# split the data for training and test using sklearn train_test_split function 
from sklearn.model_selection import train_test_split

# split the data 
X = nox_df.iloc[:, :-1]
y = nox_df["NOX"]

In [59]:
# Normalizing the X and y. 
# Feature scaling
from sklearn.preprocessing import Normalizer
X = nox_ds1 = Normalizer().fit_transform(X)

X_train_full, X_test, y_train_full, y_test = train_test_split(X, y,  test_size = 0.2,random_state= 42, shuffle = True)

X_train, X_valid, y_train, y_valid = train_test_split(X_train_full, y_train_full, test_size=0.25, random_state=42)

In [60]:
X_train.shape,X_valid.shape,X_test.shape

((22039, 9), (7347, 9), (7347, 9))

In [61]:
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
dtree = tree.DecisionTreeRegressor()
dtree.fit(X_train, y_train)

DecisionTreeRegressor()

In [62]:
dtree.score(X_train,y_train)

1.0

#### A decision tree score of 1 means that the model has made perfect predictions on the training data, achieving a 100% accuracy score.However, this does not necessarily mean that the model is good at generalizing to new data.

In [63]:
dtree.score(X_valid,y_valid)

0.687147092175692

In [64]:
from sklearn.metrics import mean_absolute_error,r2_score
from sklearn.metrics import mean_squared_error
from sklearn import metrics 
# make predictions on the validation set
y_pred_valid = dtree.predict(X_valid)

# calculate the MAE for the validation set
mae_val = mean_absolute_error(y_valid, y_pred_valid)
# calculate the MSE for the validation set
mse_val = mean_squared_error(y_valid, y_pred_valid)

print("Validation Mean Absolute Error: ", mae_val)
print("Validation Mean Squared Error: ", mse_val)
print(f"RMSE on the validation set: {np.sqrt(mse_val)}")

Validation Mean Absolute Error:  3.9766617667074997
Validation Mean Squared Error:  41.93760373948551
RMSE on the validation set: 6.475924933126195


In [65]:
y_pred = dtree.predict(X_valid) 
r2_score(y_valid, y_pred)

0.687147092175692

## Using GridSearchCV to select hyper-parameters

In [66]:
from sklearn.model_selection import GridSearchCV, cross_val_score

In [67]:
X_train, X_test, y_train, y_test = train_test_split(X, y,  test_size = 0.2,random_state= 42, shuffle = True)
max_depth = range(1,100,10)
grids_dtree = GridSearchCV(estimator = DecisionTreeRegressor(random_state=42),
                  param_grid= {
                                "max_features": ['auto', 'sqrt'],
                                "min_samples_split" : [2, 5, 10],
                                "min_samples_leaf" : [1, 2, 4],
                                "max_depth" : max_depth},
                  cv=5,
                  scoring = 'r2')

In [68]:
grids_dtree.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=DecisionTreeRegressor(random_state=42),
             param_grid={'max_depth': range(1, 100, 10),
                         'max_features': ['auto', 'sqrt'],
                         'min_samples_leaf': [1, 2, 4],
                         'min_samples_split': [2, 5, 10]},
             scoring='r2')

In [69]:
grids_dtree.best_score_

0.7428484632677824

In [70]:
grids_dtree.best_params_

{'max_depth': 11,
 'max_features': 'auto',
 'min_samples_leaf': 2,
 'min_samples_split': 10}

In [71]:
# make predictions on the test set
y_pred_test = grids_dtree.predict(X_test)

# calculate the MAE for the test set
mae_val = mean_absolute_error(y_test, y_pred_test)
# calculate the MSE for the test set
mse_val = mean_squared_error(y_test, y_pred_test)

print("Mean Absolute Error: ", mae_val)
print("Mean Squared Error: ", mse_val)
print(f"RMSE : {np.sqrt(mse_val)}")


Mean Absolute Error:  3.7705239697823183
Mean Squared Error:  31.79030676459735
RMSE : 5.63828934736391


In [72]:
y_pred = grids_dtree.predict(X_test) 
r2_score(y_test, y_pred)

0.7601017961187106

# FOR CO

In [80]:
# prepare the data for the model
# select the only CO as target variable
co_df = df.copy()
co_df = co_df.drop("NOX", axis = 1) # drop the target variable CO

In [81]:
co_df

Unnamed: 0,AT,AP,AH,AFDP,GTEP,TIT,TAT,TEY,CDP,CO
0,4.5878,1018.7,83.675,3.5758,23.979,1086.2,549.83,134.67,11.898,0.32663
1,4.2932,1018.3,84.235,3.5709,23.951,1086.1,550.05,134.67,11.892,0.44784
2,3.9045,1018.4,84.858,3.5828,23.990,1086.5,550.19,135.10,12.042,0.45144
3,3.7436,1018.3,85.434,3.5808,23.911,1086.5,550.17,135.03,11.990,0.23107
4,3.7516,1017.8,85.182,3.5781,23.917,1085.9,550.00,134.67,11.910,0.26747
...,...,...,...,...,...,...,...,...,...,...
36728,3.6268,1028.5,93.200,3.1661,19.087,1037.0,541.59,109.08,10.411,10.99300
36729,4.1674,1028.6,94.036,3.1923,19.016,1037.6,542.28,108.79,10.344,11.14400
36730,5.4820,1028.5,95.219,3.3128,18.857,1038.0,543.48,107.81,10.462,11.41400
36731,5.8837,1028.7,94.200,3.9831,23.563,1076.9,550.11,131.41,11.771,3.31340


In [82]:
# split the data for training and test using sklearn train_test_split function 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer

# split the data 
X = co_df.iloc[:, :-1]
y = co_df["CO"]


In [83]:
X = co_df1 = Normalizer().fit_transform(X)
# normalize the data 

X_train_full, X_test, y_train_full, y_test = train_test_split(X, y,  test_size = 0.3,random_state= 42, shuffle = True)
X_train, X_valid, y_train, y_valid = train_test_split(X_train_full, y_train_full, test_size=0.25, random_state=42)

In [84]:
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
dtree = tree.DecisionTreeRegressor()
dtree.fit(X_train, y_train)

DecisionTreeRegressor()

In [85]:
dtree.score(X_train,y_train)

1.0

In [86]:
dtree.score(X_valid,y_valid)

0.49686129860503747

In [87]:
# make predictions on the validation set
y_pred_valid = dtree.predict(X_valid)

# calculate the MAE for the validation set
mae_val = mean_absolute_error(y_valid, y_pred_valid)
# calculate the MSE for the validation set
mse_val = mean_squared_error(y_valid, y_pred_valid)

print("Validation Mean Absolute Error: ", mae_val)
print("Validation Mean Squared Error: ", mse_val)
print(f"RMSE on the validation set: {np.sqrt(mse_val)}")

Validation Mean Absolute Error:  0.724412214667911
Validation Mean Squared Error:  2.9035623280210663
RMSE on the validation set: 1.7039842511071124


In [88]:
y_pred = dtree.predict(X_valid)
r2_score(y_valid,y_pred)

0.49686129860503747

In [89]:
from sklearn.model_selection import GridSearchCV, cross_val_score

In [90]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2,random_state= 42, shuffle = True)
max_depth = range(1,100,10)
grids_dtree = GridSearchCV(estimator = DecisionTreeRegressor(random_state=42),
                  param_grid= {
                                "max_features": ['auto', 'sqrt'],
                                "min_samples_split" : [2, 5, 10],
                                "min_samples_leaf" : [1, 2, 4],
                                "max_depth" : max_depth},
                  cv=5,
                  scoring = 'r2')

In [91]:
grids_dtree.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=DecisionTreeRegressor(random_state=42),
             param_grid={'max_depth': range(1, 100, 10),
                         'max_features': ['auto', 'sqrt'],
                         'min_samples_leaf': [1, 2, 4],
                         'min_samples_split': [2, 5, 10]},
             scoring='r2')

In [92]:
grids_dtree.best_score_

0.6218561649983074

In [93]:
grids_dtree.best_params_

{'max_depth': 11,
 'max_features': 'auto',
 'min_samples_leaf': 4,
 'min_samples_split': 10}

In [94]:
# make predictions on the test set
y_pred_test = grids_dtree.predict(X_test)

# calculate the MAE for the test set
mae_val = mean_absolute_error(y_test, y_pred_test)
# calculate the MSE for the test set
mse_val = mean_squared_error(y_test, y_pred_test)

print("Mean Absolute Error: ", mae_val)
print("Mean Squared Error: ", mse_val)
print(f"RMSE : {np.sqrt(mse_val)}")


Mean Absolute Error:  0.6391783059352071
Mean Squared Error:  1.7560347184781873
RMSE : 1.3251546017269786


In [95]:
y_pred = grids_dtree.predict(X_test) 
r2_score(y_test, y_pred)

0.677059216680084