# Import packages and load data

In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from matplotlib import rc
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
# Shared min-max scaler for the model inputs; it is fitted on the training
# split in the pre-processing cell and reused for the test and full sets.
xscaler = MinMaxScaler()

In [2]:
# Tag used to name the output folder and figure files further down
model_output = 'order_disorder'
# Location of the CSV holding the complete sample table
file_path = '../input_data/full_set.csv'

# Read the raw table and preview its first rows as the cell output
df = pd.read_csv(filepath_or_buffer=file_path)
df.head()

Unnamed: 0,Sample Name,Polymer Component,Polymer Conc.(mg/ml),Solvent type,Solvent Ratio,Additive type,Amount of additive (mg/ml),Peak Position,FWHM,domain,grain size,afm_domain,O/D,tri
0,I_A_AC,PS-b-PEO,20,Toluene/THF,80/20,Chloronaphthalene,0,0.020072,0.003697,31.30263,,33.737303,0,0
1,I_A025,PS-b-PEO,20,Toluene/THF,80/20,Chloronaphthalene,5,0.018586,0.003136,33.805848,0.00685,35.15169,0,1
2,I_A050,PS-b-PEO,20,Toluene/THF,80/20,Chloronaphthalene,10,0.018385,0.00297,34.176062,0.0576,35.397797,0,1
3,I_A075,PS-b-PEO,20,Toluene/THF,80/20,Chloronaphthalene,15,0.018274,0.003451,34.383752,0.073026,34.603726,0,1
4,I_A_100,PS-b-PEO,20,Toluene/THF,80/20,Chloronaphthalene,20,0.019837,0.005122,31.674149,0.016718,33.78621,0,1


# Pre-processing and train/test split

In [3]:
%%time
# assign input variables and target variable, eliminates static values from the data set
inputs = df.loc[:, ('Solvent Ratio', 'Amount of additive (mg/ml)')]
target = df['tri']

# Define the custom mapping dictionary
mapping_dict = {'50/50': 0.5, '60/40': 0.6, '70/30': 0.7, '80/20': 0.8, '90/10': 0.9, '100': 1.0}

# Initializes label encoder
le_solventRatio = LabelEncoder()
le_solventRatio.fit(list(mapping_dict.keys()))
# encode non-numeric type inputs
# inputs['Solvent Ratio_n'] = le_solventRatio.transform(inputs['Solvent Ratio'])
# Map the transformed values to the desired values
inputs['Solvent Ratio_n'] = inputs['Solvent Ratio'].map(mapping_dict)
# remove non-encoded data from dataset 
inputs_n = inputs.drop(['Solvent Ratio'],axis='columns')

# Split the data into training and test sets using StratifiedShuffleSplit
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=10)

for train_index, test_index in sss.split(inputs_n, target):
    X_train, X_test = inputs_n.iloc[train_index], inputs_n.iloc[test_index]
    y_train, y_test = target.iloc[train_index], target.iloc[test_index]

# Scale the x data for better fitting
xscaler.fit(X_train)
X_train = xscaler.transform(X_train)
X_test = xscaler.transform(X_test)
X_all = xscaler.transform(inputs_n)

CPU times: user 5.65 ms, sys: 33 µs, total: 5.68 ms
Wall time: 5.68 ms


In [4]:
# Binarise the class labels in a one-vs-all fashion
binarizer = LabelBinarizer()
target_label = binarizer.fit(target).transform(target)

In [5]:
# Lift pandas' display limits so frames are shown with every column and row
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# Load the trained model and evaluate

In [5]:
from joblib import dump, load
# Restore the previously trained classifier from disk instead of re-fitting it.

# NOTE: joblib files are pickle-based; only load model files from a trusted source.
best = load('rfc_multiclass.joblib')

In [6]:
# Predict class labels for the test and training splits with the loaded model
# (no training happens here; `best` is used as-is).
rfreval = best.predict(X_test)
rfrtrain = best.predict(X_train)

# 1-based positions of the test predictions, e.g. for use as an x axis
x_len = len(rfreval)
actual_x = np.arange(1, x_len + 1)

# Folder that receives the figures and result tables written by later cells.
# Create it up front so to_csv / savefig do not fail on a fresh checkout.
output_path = f'./final_figs/{model_output}/'
os.makedirs(output_path, exist_ok=True)

In [7]:
from sklearn.metrics import f1_score

# Class predictions of the loaded model on both partitions
y_pred_test = best.predict(X_test)
y_pred_train = best.predict(X_train)

# Per-class F1: average=None yields one score per class rather than a mean
f1_train = f1_score(y_true=y_train, y_pred=y_pred_train, average=None)
f1_test = f1_score(y_true=y_test, y_pred=y_pred_test, average=None)

print("Training F1 Score:", f1_train)
print("Testing F1 Score:", f1_test)

Training F1 Score: [1. 1. 1.]
Testing F1 Score: [0.         0.90909091 1.        ]


In [10]:
from sklearn.metrics import roc_auc_score

# Class-membership probabilities from the loaded model for both partitions
y_pred_proba_train = best.predict_proba(X_train)
y_pred_proba_test = best.predict_proba(X_test)

# Multiclass ROC AUC computed one-vs-rest
auc_options = {'multi_class': 'ovr'}
auc_score_train = roc_auc_score(y_train, y_pred_proba_train, **auc_options)
auc_score_test = roc_auc_score(y_test, y_pred_proba_test, **auc_options)

print("Training AUC Score:", auc_score_train)
print("Testing AUC Score:", auc_score_test)

Training AUC Score: 1.0
Testing AUC Score: 1.0


In [11]:
from joblib import dump, load

# Export the grid-search cross-validation table to CSV.
# `grid_search` is only present when a grid search was run in this kernel
# session; when the model was loaded from disk instead, the export is skipped
# with an explicit message rather than failing with a NameError.
file_name = 'grid_search_results.csv'
if 'grid_search' in globals():
    results_df = pd.DataFrame(grid_search.cv_results_)
    results_df.to_csv(output_path + file_name, index=False)
else:
    print("grid_search is not defined in this session; skipping export of", file_name)

# Persist the model so later sessions can load it without re-training
dump(best, 'rfc_multiclass.joblib')

['rfc_multiclass.joblib']

# Visualize the results

In [12]:
# Predict the class of every sample (scaled full input set, train + test)
# so actual and predicted classes can be compared over the whole data set.
rfc_fullset = best.predict(X_all)

In [1]:
# Assuming your actual and predicted data are pandas DataFrames with columns 'input1', 'input2', and 'class'
from matplotlib.ticker import MultipleLocator
fig_title = f'{model_output}_classification_scatter.png'


%matplotlib inline
rc('text', usetex=False)
rc('mathtext', fontset='cm')
rc('xtick', labelsize=20)   
rc('xtick.major', size=7)  
rc('xtick.minor', size=4)
rc('xtick.major', width=2)
rc('xtick.minor', width=2)
rc('ytick.major', width=2)
rc('ytick.minor', width=2)
rc('ytick', labelsize=20)
rc('ytick.major', size=7)
rc('ytick.minor', size=4)
rc('axes', labelsize=25) 
rc('axes', linewidth=2) 
rc('font',family='sans serif')
rc('font', style='normal')
rc('font', weight='500')
rc('font', size='15')
rc('axes', labelweight='500')
rc('axes.spines', **{'right':True, 'top':True}) 
plt.rcParams['font.family'] = 'Arial'

#This sets the size of the entire image
fig,ax = plt.subplots(figsize=(8,5))
# Define marker colors and edgecolors based on class
actual_edge_colors = ['gold' if c == 0 else 'black' for c in target]
predicted_edge_colors = ['gold' if c == 0 else 'black' for c in rfc_fullset]
#fig = plt.figure()
#ax = plt.subplot(111)
# Plotting
ax.scatter(inputs['Amount of additive (mg/ml)'], 
            inputs['Solvent Ratio_n'], marker='s', color='black', alpha = 0.25,
            label='Actual Data', facecolor=actual_edge_colors, s=150)
ax.scatter(inputs['Amount of additive (mg/ml)'], 
            inputs['Solvent Ratio_n'], marker='v', color='black', alpha = 0.75,
            label='Predicted Data', facecolor=predicted_edge_colors, s=75)

plt.xlabel('Swelling Ratio', fontsize='25')
plt.ylabel('Solvent Ratio', fontsize='25')
# Set y-axis major locator to MultipleLocator with base 0.1
plt.gca().yaxis.set_major_locator(MultipleLocator(0.1))

# Shrink current axis by 20%
box = ax.get_position()

# Put a legend below current axis
ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.225),
          fancybox=True, shadow=True, ncol=5, fontsize='20', markerscale=2)
# ax.legend()
plt.tight_layout()
plt.savefig(output_path + fig_title, dpi=500, format='png')
plt.show()

NameError: name 'model_output' is not defined

In [14]:
from sklearn.metrics import confusion_matrix

# Actual vs. predicted class counts over the complete data set
# (rows: actual class, columns: predicted class)
cm = confusion_matrix(y_true=target, y_pred=rfc_fullset)

print("Confusion Matrix:", cm, sep="\n")

Confusion Matrix:
[[  5   1   0]
 [  0  24   0]
 [  0   0 120]]
