In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Load the '1st Trial' sheet; the labels below are passed as `names`, so they
# replace whatever column headers the workbook itself carries.
data = pd.read_excel(
    'data/data.xlsx',
    sheet_name='1st Trial',
    names=[
        'time (min)',
        'm_xylene',
        'NO',
        'NO2',
        'NOx',
        'NOx/m_xy',
        'beta',
        'NO,NO2 crossing time (min)',
        'Wall loss factor',
        'Peak O3 Concentration',
        '50% of Final M0 Time',
        'PeakDp',
        'deltaHC',
        'm_xy consume ratio',
        'deltaHC / beta',
        'deltaHC * beta',
        'M0',
        'yield',
    ],
)

# Single scaler instance created once here and available to later cells.
scaler = StandardScaler()

In [None]:
# Single-feature design matrix: the element-wise product of the 'm_xylene' and
# 'deltaHC / beta' columns, shaped as one column of n rows.
X = data['m_xylene'].mul(data['deltaHC / beta']).to_numpy()[:, np.newaxis]

# Regression target: the 'M0' column, also as an (n, 1) column vector.
y = data['M0'].to_numpy()[:, np.newaxis]

In [None]:
# Repeated hold-out evaluation of a one-feature linear regression.
# Uses the X and y arrays already defined in the notebook namespace.
#
# A seeded generator makes the reported error reproducible after a kernel
# restart. Split seeds are drawn from the full 32-bit range: the previous
# 1000 draws from only 999 possible values (1..999) guaranteed repeated splits.
rng = np.random.default_rng(42)  # fixed seed; change it to resample the splits

percent_errors = []
for _ in range(1000):
    # Upper bound is exclusive, so the seed is at most 2**32 - 1.
    state = int(rng.integers(0, 2**32))
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = state)

    lr = LinearRegression()
    lr.fit(X_train, y_train)

    y_pred = lr.predict(X_test)
    # Mean absolute percent error on the held-out 20%. The magnitude of y_test
    # is used as the denominator so a negative target cannot flip the sign of
    # the error. A target of exactly zero would still give inf/nan.
    percent_error = (100 * np.absolute(y_pred - y_test) / np.absolute(y_test)).mean()
    percent_errors.append(percent_error)
percent_errors = np.array(percent_errors)
print('Best M0 Correlation Model (Linear)')
print(f'Average Percent Error: {np.round(percent_errors.mean(), 2)}%')

In [None]:
# Repeated hold-out evaluation of a one-feature linear regression of 'yield'.
# Feature: 'PeakDp' divided by '50% of Final M0 Time'; both X and y are
# reshaped to (n, 1) column vectors.
X = (data['PeakDp'] / data['50% of Final M0 Time']).to_numpy().reshape(-1, 1)
y = data['yield'].to_numpy().reshape(-1, 1)

# A seeded generator makes the reported error reproducible after a kernel
# restart. Split seeds are drawn from the full 32-bit range: the previous
# 1000 draws from only 999 possible values (1..999) guaranteed repeated splits.
rng = np.random.default_rng(42)  # fixed seed; change it to resample the splits

percent_errors = []
for _ in range(1000):
    # Upper bound is exclusive, so the seed is at most 2**32 - 1.
    state = int(rng.integers(0, 2**32))
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = state)

    lr = LinearRegression()
    lr.fit(X_train, y_train)

    y_pred = lr.predict(X_test)
    # Mean absolute percent error on the held-out 20%. The magnitude of y_test
    # is used as the denominator so a negative target cannot flip the sign of
    # the error. A target of exactly zero would still give inf/nan.
    percent_error = (100 * np.absolute(y_pred - y_test) / np.absolute(y_test)).mean()
    percent_errors.append(percent_error)
percent_errors = np.array(percent_errors)
print('Best Yield Correlation Model (Linear)')
print(f'Average Percent Error: {np.round(percent_errors.mean(), 2)}%')

In [None]:
# Repeated hold-out evaluation of a random-forest regressor.
# Feature: the 'NO' column as an (n, 1) matrix. Target: the 'M0' column, left
# 1-D (no reshape) in this cell.
# An alternative feature was left commented out by the author:
#   (data['PeakDp'] / data['50% of Final M0 Time']).to_numpy().reshape(-1, 1)
X = data['NO'].to_numpy().reshape(-1, 1)
y = data['M0'].to_numpy()

# A seeded generator makes the reported error reproducible after a kernel
# restart. Seeds are drawn from the full 32-bit range: the previous 1000 draws
# from only 999 possible values (1..999) guaranteed repeated splits.
rng = np.random.default_rng(42)  # fixed seed; change it to resample

percent_errors = []
for _ in range(1000):
    # Independent seeds for the train/test split and for the forest itself.
    # Upper bound is exclusive, so each seed is at most 2**32 - 1.
    state1 = int(rng.integers(0, 2**32))
    state2 = int(rng.integers(0, 2**32))
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = state1)
    # The scaler is refit on each training split and only applied to the test
    # split, so no test-set statistics reach the model.
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    model = RandomForestRegressor(n_estimators = 100, random_state = state2)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    # Mean absolute percent error on the held-out 20%. The magnitude of y_test
    # is used as the denominator so a negative target cannot flip the sign of
    # the error. A target of exactly zero would still give inf/nan.
    percent_error = (100 * np.absolute(y_pred - y_test) / np.absolute(y_test)).mean()
    percent_errors.append(percent_error)
percent_errors = np.array(percent_errors)
# The target above is 'M0', so the header now says M0 (it previously said
# "Yield"). NOTE(review): if yield was the intended target, change `y` instead
# and restore the label.
print('Best M0 Correlation Model (Nonlinear)')
print(f'Average Percent Error: {np.round(percent_errors.mean(), 2)}%')