In [None]:
import pandas as pd
import os
from math import sqrt
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

path = "/content/drive/MyDrive/dataset/PDOs PY copy.csv"
data = pd.read_csv(path)

y = data.PY
X = data.drop(['PY'], axis=1)

X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,
                                                                random_state=0)

# Drop columns with missing values (simplest approach)
cols_with_missing = [col for col in X_train_full.columns if X_train_full[col].isnull().any()]
X_train_full.drop(cols_with_missing, axis=1, inplace=True)
X_valid_full.drop(cols_with_missing, axis=1, inplace=True)

# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
low_cardinality_cols = [cname for cname in X_train_full.columns if X_train_full[cname].nunique() < 10 and
                        X_train_full[cname].dtype == "object"]

# Select numerical columns
numerical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype in ['int64', 'float64']]

# Keep selected columns only
my_cols = low_cardinality_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()

#X_train.head()

def score_dataset(X_train, X_valid, y_train, y_valid):
  model = RandomForestRegressor(n_estimators=100, random_state=0)
  model.fit(X_train, y_train)
  preds = model.predict(X_valid)
  return mean_absolute_error(y_valid, preds)

#Score from Approach 1 (Drop Categorical Variables)

drop_X_train = X_train.select_dtypes(exclude=['object'])
drop_X_valid = X_valid.select_dtypes(exclude=['object'])

print("MAE from Approach 1 (Drop categorical variables):")
print(score_dataset(drop_X_train, drop_X_valid, y_train, y_valid))

model = RandomForestRegressor(n_estimators=100, random_state=0)
model.fit(drop_X_train, y_train)
preds = model.predict(drop_X_valid)

print("Mean Absolute Error: " + str(mean_absolute_error(y_valid, preds)))
r2_score(y_valid, preds)

NameError: name 'np' is not defined

In [None]:
#RMSE
MSE = mean_squared_error(y_valid, preds)

print("MSE: " + str(MSE))

RMSE = sqrt(MSE)

print("RMSE: " + str(RMSE))

MSE: 17.78810211642002
RMSE: 4.217594351810048


In [None]:
drop_X_valid.head()


Unnamed: 0,Initial glycerol concentration (M),Stir rate (rpm),Anode surface area (cm2),T (deg C),Time
294,0.25,0,11.0,25,10
65,0.3,650,121.38,25,14
15,0.3,650,121.38,25,14
272,0.25,0,11.0,25,10
140,0.3,0,121.38,25,14


In [None]:
y_valid.head()

294     3.87
65      0.00
15     17.30
272     3.00
140     3.50
Name: PY, dtype: float64

In [None]:
from sklearn.preprocessing import OrdinalEncoder

s = (X_train.dtypes == 'object')
object_cols = list(s[s].index)

# Make copy to avoid changing original data
label_X_train = X_train.copy()
label_X_valid = X_valid.copy()

# Apply ordinal encoder to each column with categorical data
ordinal_encoder = OrdinalEncoder()
label_X_train[object_cols] = ordinal_encoder.fit_transform(X_train[object_cols])
label_X_valid[object_cols] = ordinal_encoder.transform(X_valid[object_cols])

print("MAE from Approach 2 (Ordinal Encoding):")
print(score_dataset(label_X_train, label_X_valid, y_train, y_valid))

MAE from Approach 2 (Ordinal Encoding):
1.9698920695970716


In [None]:
from sklearn.preprocessing import OneHotEncoder

# Apply one-hot encoder to each column with categorical data
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[object_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[object_cols]))

# One-hot encoding removed index; put it back
OH_cols_train.index = X_train.index
OH_cols_valid.index = X_valid.index

# Remove categorical columns (will replace with one-hot encoding)
num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_valid.drop(object_cols, axis=1)

# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)

# Ensure all columns have string type
OH_X_train.columns = OH_X_train.columns.astype(str)
OH_X_valid.columns = OH_X_valid.columns.astype(str)

print("MAE from Approach 3 (One-Hot Encoding):")
print(score_dataset(OH_X_train, OH_X_valid, y_train, y_valid))

NameError: ignored

In [None]:
#improved version always second

import pandas as pd
import numpy as np
import os
from math import sqrt
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

path = "/content/drive/MyDrive/dataset/raw data.csv"
data = pd.read_csv(path)

data = pd.get_dummies(data) #convert categorical variable into dummy variable

y = np.array(data['CR (%)']) #output feature
X = data.drop(['CR (%)', 'ECR PY (%)', 'PDOs PY (%)', 'POHs PY (%)', 'V'], axis=1) #input feature - axis=1 refers to the column

X_list = list(X.columns) # Saving feature names for later use

from sklearn.preprocessing import StandardScaler #standardscaler for normalization (Jinesh et al., 2023)
scaler = StandardScaler()

y = y.reshape(-1, 1) #normalizing y
scaler.fit(y)
y = scaler.transform(y)

scaler.fit(X) #normalizing X
X = scaler.transform(X)

X = np.array(X) # Convert to numpy array

# Using Skicit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size = 0.3, random_state = 42)
print('Training X Shape:', train_X.shape)
print('Training y Shape:', train_y.shape)
print('Testing X Shape:', test_X.shape)
print('Testing y Shape:', test_y.shape)

#imputation for missing value using kNN algo
from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=10) #create an instance of KNNImputer class, k is desired number of neighbours

#perform imputation on on X because y no missing values
imp_train_X = imputer.fit_transform(train_X)
imp_test_X = imputer.fit_transform(test_X)

#random forest regression
model = RandomForestRegressor(n_estimators=100, random_state=0)
model.fit(imp_train_X, train_y)
preds = model.predict(imp_test_X)

score = model.score(imp_train_X,train_y)
print("Training score: ", score)

pred = model.predict(imp_test_X)

print("Mean Absolute Error: " + str(mean_absolute_error(test_y, pred)))
print("MSE: " + str(mean_squared_error(preds, test_y)))

RMSE = sqrt(mean_squared_error(preds, test_y))

print("RMSE: " + str(RMSE))
print("R2 score: " + str(r2_score(test_y, preds)))



Training X Shape: (312, 30)
Training y Shape: (312, 1)
Testing X Shape: (134, 30)
Testing y Shape: (134, 1)


  model.fit(imp_train_X, train_y)


Training score:  0.9927064263513695
Mean Absolute Error: 0.140841472346795
MSE: 0.05459088841183673
RMSE: 0.23364693109869158
R2 score: 0.947866849777909


In [None]:
#gridsearch here we go!

import pandas as pd
import numpy as np
import os
from math import sqrt
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

path = "/content/drive/MyDrive/dataset/raw data.csv"
data = pd.read_csv(path)

data = pd.get_dummies(data) #convert categorical variable into dummy variable

y = np.array(data['PDOs PY (%)']) #output feature
X = data.drop(['CR (%)', 'PDOs PY (%)', 'I (A)'], axis=1) #input feature - axis=1 refers to the column

X_list = list(X.columns) # Saving feature names for later use

from sklearn.preprocessing import StandardScaler #standardscaler for normalization (Jinesh et al., 2023)
scaler = StandardScaler()

y = y.reshape(-1, 1) #normalizing y
scaler.fit(y)
y = scaler.transform(y)

scaler.fit(X) #normalizing X
X = scaler.transform(X)

X = np.array(X) # Convert to numpy array

# Using Skicit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size = 0.3, random_state = 42)
print('Training X Shape:', train_X.shape)
print('Training y Shape:', train_y.shape)
print('Testing X Shape:', test_X.shape)
print('Testing y Shape:', test_y.shape)

#imputation for missing value using kNN algo
from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=10) #create an instance of KNNImputer class, k is desired number of neighbours

#perform imputation on on X because y no missing values
imp_train_X = imputer.fit_transform(train_X)
imp_test_X = imputer.fit_transform(test_X)

#hyperparamter optimization using gridsearch
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
random_cv = GridSearchCV(
            estimator=RandomForestRegressor(),
            param_grid={'n_estimators': [100, 200, 500],
                        'max_depth': [None, 3, 6],
                        'min_samples_split': [2, 4, 6],
                        'min_samples_leaf': [1, 2, 3],
                        #'max_features': ['auto', 'sqrt'],
                        'random_state': [0, 42]},
            cv=3, scoring='neg_mean_squared_error', verbose=0, n_jobs=-1)
#This includes 3-fold validation (cv=3) and uses mean scare error as the metric for evaluation

random_cv.fit(imp_train_X, train_y)

# use the best parameters
best_params = random_cv.best_params_
best_score = random_cv.best_score_
print('Best hyperparameters:', best_params)

# using the best parameters run model
model = RandomForestRegressor(**best_params)
model.fit(imp_train_X, train_y)

score = model.score(imp_train_X,train_y)
print("Training score: ", score)

pred = model.predict(imp_test_X)

print("Mean Absolute Error: " + str(mean_absolute_error(test_y, pred)))
print("MSE: " + str(mean_squared_error(pred, test_y)))

RMSE = sqrt(mean_squared_error(pred, test_y))

print("RMSE: " + str(RMSE))
print("R2 score: " + str(r2_score(test_y, pred)))

Training X Shape: (312, 30)
Training y Shape: (312, 1)
Testing X Shape: (134, 30)
Testing y Shape: (134, 1)


  self.best_estimator_.fit(X, y, **fit_params)
  model.fit(imp_train_X, train_y)


Best hyperparameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100, 'random_state': 0}
Training score:  0.9671767585181567
Mean Absolute Error: 0.18966454511737257
MSE: 0.24517088356032346
RMSE: 0.495147335204708
R2 score: 0.7823250204884203


In [None]:
#improved version always second (PDOs PY)

import pandas as pd
import numpy as np
import os
from math import sqrt
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

path = "/content/drive/MyDrive/dataset/raw data.csv"
data = pd.read_csv(path)

data = pd.get_dummies(data) #convert categorical variable into dummy variable

y = np.array(data['PDOs PY (%)']) #output feature
X = data.drop(['CR (%)', 'ECR PY (%)', 'PDOs PY (%)', 'POHs PY (%)', 'V'], axis=1) #input feature - axis=1 refers to the column

X_list = list(X.columns) # Saving feature names for later use

from sklearn.preprocessing import StandardScaler #standardscaler for normalization (Jinesh et al., 2023)
scaler = StandardScaler()

y = y.reshape(-1, 1) #normalizing y
scaler.fit(y)
y = scaler.transform(y)

scaler.fit(X) #normalizing X
X = scaler.transform(X)

X = np.array(X) # Convert to numpy array

# Using Skicit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size = 0.3, random_state = 42)
print('Training X Shape:', train_X.shape)
print('Training y Shape:', train_y.shape)
print('Testing X Shape:', test_X.shape)
print('Testing y Shape:', test_y.shape)

#imputation for missing value using kNN algo
from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=10) #create an instance of KNNImputer class, k is desired number of neighbours

#perform imputation on on X because y no missing values
imp_train_X = imputer.fit_transform(train_X)
imp_test_X = imputer.fit_transform(test_X)

#random forest regression
model = RandomForestRegressor(max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100, random_state=0)
model.fit(imp_train_X, train_y)
preds = model.predict(imp_test_X)

score = model.score(imp_train_X,train_y)
print("Training score: ", score)

pred = model.predict(imp_test_X)

print("Mean Absolute Error: " + str(mean_absolute_error(test_y, pred)))
print("MSE: " + str(mean_squared_error(preds, test_y)))

RMSE = sqrt(mean_squared_error(preds, test_y))

print("RMSE: " + str(RMSE))
print("R2 score: " + str(r2_score(test_y, preds)))

