## The code is adapted from https://www.kaggle.com/code/chiter42/polymer-mfr-prediction
## Credits for the code :- Polina
## Kaggle profile :- https://www.kaggle.com/chiter42

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from catboost import CatBoostRegressor,Pool
from sklearn.neighbors import KNeighborsRegressor
import xgboost
from xgboost import XGBRegressor

ModuleNotFoundError: No module named 'xgboost'

In [None]:
# Load the raw measurements. The file name indicates an Excel workbook, which
# read_csv cannot parse, so read_excel is used; the relative path also needs
# the "/" separator after "..".
data = pd.read_excel('../Data/Data-MFR_Prediction.xlsx')

# Preview the first ten rows.
data.head(10)

In [None]:
# Largest MFR value. Series.max is vectorized and skips NaN by default,
# whereas the builtin max() gives an order-dependent result when NaN is present.
data['MFR'].max()

In [None]:
# Replace the raw column identifiers with descriptive names.
data = data.rename(
    columns={
        'Unnamed: 0': 'Date',
        '513FC31103.pv': 'Propylene feed rate (kg/h)',
        '513HC31114-5.mv': 'Hydrogen : C3',
        '513PC31201.pv': 'Reactor pressure (bar)',
        '513LC31202.pv': 'Reactor bed level (m)',
        '513FC31409.pv': 'Ethylene flow rate (kg/h)',
        '513FC31114-5.pv': 'Catalyst feed rate (kg/h)',
        '513TC31220.pv': 'Reactor temperature',
        'MFR': 'MFR',
    }
)

In [None]:
# Distinct values of the per-row duplicate flag: a result containing only
# False means no row repeats an earlier row.
np.unique(data.duplicated())

In [None]:
# Display the frame (notebook cell output).
data

In [None]:
# Count missing vs. present entries: first for the target column (MFR),
# then over every cell of the frame.
mfr_is_missing = data['MFR'].isna()
print(np.unique(mfr_is_missing, return_counts=True))

any_is_missing = data.isna()
print(np.unique(any_is_missing, return_counts=True))


In [None]:
# Percentage of missing values per column, rounded to one decimal place.
data.isna().mean().mul(100).round(1)

In [None]:
# Drop the Date column.
data = data.drop(columns=["Date"])

In [None]:
# Replace every missing value with the mean of its column.
# NOTE(review): this also imputes the target column (MFR) and uses statistics
# of the full data set before the train/test split — confirm this is intended.
column_means = data.mean()
data = data.fillna(value=column_means)

In [None]:
# Show the dtype of each column.
data.dtypes

In [None]:
# Histogram of the raw MFR values using 100 bins.
plt.figure(figsize=(12, 9))
plt.hist(data['MFR'], bins=100, alpha=0.5)
plt.title("Histogram of the MFR distribution")
plt.xlabel("MFR")
plt.ylabel("Distribution")
plt.show()

In [None]:
# 1st and 99th percentiles of MFR, printed one per line.
lower, upper = data['MFR'].quantile([0.01, 0.99])

print(lower, upper, sep="\n")

In [None]:
#Taking data whose MFR lies strictly between the fixed bounds 3 and 20

In [None]:
# Keep only rows with 3 < MFR < 20. The explicit copy makes `data` an
# independent frame, so adding columns to it later does not operate on a
# slice of the unfiltered frame (which raises SettingWithCopyWarning).
data = data.query('MFR> 3 and MFR< 20').copy()

In [None]:
#Plotting the correlation plot

In [None]:
# Pairwise correlation of all columns, drawn as an annotated heatmap.
corr = data.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(data=corr, annot=True)
plt.show()

In [None]:
# Add the natural logarithm of MFR as a new column; it becomes the model
# target once the raw MFR column is dropped.
data['ln(MFR)'] =  np.log(data['MFR'])

In [None]:
# Redraw the correlation heatmap, now including the ln(MFR) column.
corr = data.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(data=corr, annot=True)
plt.show()

In [None]:
# Remove the raw MFR column, leaving ln(MFR) in the frame.
data = data.drop(columns='MFR')

In [None]:
# Scatter-plot matrix of every pair of columns in the frame.
sns.pairplot(data)

In [None]:
# Separate the target (ln(MFR)) from the features, then hold out 25% of the
# rows as a test set with a fixed seed.
target = data['ln(MFR)']
features = data.drop(columns='ln(MFR)')
features_train, features_test, target_train, target_test = train_test_split(
    features,
    target,
    test_size=0.25,
    random_state=12345,
)

In [None]:
# Number of rows in the held-out test set.
len(features_test)

# Scaling

In [None]:
# Fit the scaler on the training features only, then apply the same
# transformation to both splits.
# NOTE(review): the Decision Tree and KNN cells wrap their estimator in a
# Pipeline with another StandardScaler, so those models standardize these
# already-scaled arrays a second time.
scaler = StandardScaler().fit(features_train)
features_train = scaler.transform(features_train)
features_test = scaler.transform(features_test)

# Decision Tree Regression

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Sweep max_depth from 1 to 14 and print the test RMSE obtained at each depth.
for depth in range(1, 15):
    steps = [
        ('Scaler', StandardScaler()),
        ('DTR', DecisionTreeRegressor(max_depth=depth, random_state=12345)),
    ]
    pipeline = Pipeline(steps)
    pipeline.fit(features_train, target_train)
    predicted_target_dtr = pipeline.predict(features_test)
    rmse = mean_squared_error(target_test, predicted_target_dtr) ** 0.5
    print("max_depth =", depth, ":", rmse)

# K-Nearest Neighbors Regression

In [None]:
# Fit a K-nearest-neighbours regressor with default settings (behind a
# StandardScaler step) and print its RMSE on the test set.
knr_steps = [('Scaler', StandardScaler()), ('KNR', KNeighborsRegressor())]
pipeline = Pipeline(knr_steps)
pipeline.fit(features_train, target_train)
predicted_target_knr = pipeline.predict(features_test)

rmse_knr = mean_squared_error(target_test, predicted_target_knr) ** 0.5
print('RMSE -', rmse_knr)

# CatBoost

In [None]:
# Grid-search tree depth and learning rate for a 250-iteration CatBoost
# regressor, using 3-fold cross-validation on the training split.
catboost = CatBoostRegressor(
    loss_function='RMSE',
    iterations=250,
    logging_level='Silent',
)
parameters_cat = {
    'depth': [1, 8],
    'learning_rate': np.arange(0.1, 1, 0.2),
}
train_pool = Pool(features_train, target_train)
catboost_grid = catboost.grid_search(
    parameters_cat,
    train_pool,
    cv=3,
    verbose=100,
    plot=False,
)

In [None]:
# Train the final CatBoost regressor with fixed hyper-parameters and seed.
# NOTE(review): the constructor sets logging_level='Silent' while fit() is
# given verbose=2 — confirm which of the two settings takes effect.
model = CatBoostRegressor(
    depth=8,
    learning_rate=0.1,
    random_state=19072020,
    iterations=250,
    logging_level='Silent',
)
model.fit(features_train, target_train, verbose=2)

In [None]:
# Predict on both splits and print the RMSE (in ln(MFR) units) for each.
predicted_target_train = model.predict(features_train)
predicted_target = model.predict(features_test)

rmse_train = mean_squared_error(target_train, predicted_target_train) ** 0.5
rmse_test = mean_squared_error(target_test, predicted_target) ** 0.5
print('RMSE - Train Case :-', rmse_train)
print('RMSE - Test Case :-', rmse_test)

#Lowest RMSE (root mean squared error) on the test set among the models tried, so choosing this one to go ahead

In [None]:
# R2 (coefficient of determination) of the CatBoost model on both splits.
# score() has to be given the true targets: scoring the training features
# against the model's own predictions always yields 1.0. Each print uses the
# variable assigned just above it (the name `accuracy` was never defined).
accuracy_train = model.score(features_train, target_train)
print("R2 - Train", ":", accuracy_train)
accuracy_test = model.score(features_test, target_test)
print("R2 - Test", ":", accuracy_test)

In [None]:
# Undo the natural-log transform so predictions and targets are back on the
# original MFR scale. np.exp is the direct inverse of the np.log applied to
# the target, and avoids a generic power computation.
predicted_target_train_actual = np.exp(predicted_target_train)
target_train_actual = np.exp(target_train)
predicted_target_actual = np.exp(predicted_target)
target_test_actual = np.exp(target_test)

In [None]:
# Serial numbers 0..n-1 for the training samples (x axis of the residual plots).
x_train = list(range(len(target_train)))

In [None]:
# Serial numbers 0..n-1 for the test samples (x axis of the residual plots).
x_test = list(range(len(target_test)))

In [None]:
# Residuals (predicted minus actual) on the training set, in ln(MFR) units.
residuals_train_log = predicted_target_train - target_train

plt.figure(figsize=(12, 9))
plt.scatter(x_train, residuals_train_log, label='Predicted-Actual')
plt.title("Plot of difference between predicted and actual MFR in log scale for train case")
plt.xlabel("Serial Number")
plt.ylabel("Difference between predicted and actual in log scale")
plt.show()

In [None]:
# Residuals (predicted minus actual) on the training set, on the original MFR
# scale. The y-axis label no longer says "in log scale", since the plotted
# values are the back-transformed ones.
plt.figure(figsize=[12,9])
plt.scatter(x_train,predicted_target_train_actual-target_train_actual,label = 'Predicted-Actual')
plt.xlabel("Serial Number")
plt.ylabel("Difference between predicted and actual")
plt.title("Plot of difference between predicted and actual MFR for train case")
plt.show()

In [None]:
# Residuals (predicted minus actual) on the test set, in ln(MFR) units.
residuals_test_log = predicted_target - target_test

plt.figure(figsize=(12, 9))
plt.scatter(x_test, residuals_test_log, label='Predicted-Actual')
plt.title("Plot of difference between predicted and actual MFR in log scale for test case")
plt.xlabel("Serial Number")
plt.ylabel("Difference between predicted and actual in log scale")
plt.show()

In [None]:
# Residuals (predicted minus actual) on the test set, on the original MFR scale.
residuals_test = predicted_target_actual - target_test_actual

plt.figure(figsize=(12, 9))
plt.scatter(x_test, residuals_test, label='Predicted-Actual')
plt.title("Plot of difference between predicted and actual MFR for test case")
plt.xlabel("Serial Number")
plt.ylabel("Difference between predicted and actual")
plt.show()

In [None]:
!pip install xgboost

Defaulting to user installation because normal site-packages is not writeable
Collecting xgboost
  Obtaining dependency information for xgboost from https://files.pythonhosted.org/packages/8c/3a/c9c5d4d5c49b132ef15ac7b5ccf56ef1c82efe36cd19414771762e97c00e/xgboost-1.7.6-py3-none-manylinux2014_x86_64.whl.metadata
  Downloading xgboost-1.7.6-py3-none-manylinux2014_x86_64.whl.metadata (1.9 kB)
Downloading xgboost-1.7.6-py3-none-manylinux2014_x86_64.whl (200.3 MB)
[2K   [38;2;249;38;114m━━━━━━━━━━━━━━━━━[0m[38;2;249;38;114m╸[0m[38;5;237m━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.8/200.3 MB[0m [31m7.0 MB/s[0m eta [36m0:00:16[0mm