# Import packages and load data

In [2]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import ParameterGrid, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn import model_selection
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from matplotlib import rc
# Min-max scaler for the input features; it is fitted on the training split in the pre-processing cell
xscaler = MinMaxScaler()

In [3]:
# Name of the target column being modelled (also reused later for output file names)
model_output = 'order_disorder'
# Spreadsheet holding the master data set, relative to the notebook's working directory
file_path = '../../input_data/pspeo_master.xlsx'

# Read the spreadsheet into a DataFrame for pre-processing and preview the first rows
df = pd.read_excel(io=file_path)
df.head()

Unnamed: 0,sample,solv_ratio,add_type,swell_ratio,gisaxs_domain,gisaxs_fwhm,afm_domain,afm_grain,order_disorder
0,I_A_AC,80/20,Chloronaphthalene,1.0,31.30263,0.003697,33.737303,0.0048,0
1,I_A025,80/20,Chloronaphthalene,1.25,33.805848,0.003136,35.15169,0.0047,0
2,I_A050,80/20,Chloronaphthalene,1.5,34.176062,0.00297,35.397797,0.0534,0
3,I_A075,80/20,Chloronaphthalene,1.75,34.383751,0.003451,34.603726,0.0425,0
4,I_A_100,80/20,Chloronaphthalene,2.0,31.674148,0.005122,33.78621,0.0244,0


# Data pre-processing (encode, split, scale)

In [4]:
# Assign input variables and target variable, eliminating static values.
# .copy() makes `inputs` an independent frame, so the column assignments below never touch `df`.
inputs = df.loc[:, ['solv_ratio', 'swell_ratio', 'add_type']].copy()
target = df['order_disorder']

# Define mapping for solv_ratio (note: the key 100 is an integer, the other keys are 'a/b' strings)
type_mapping = {100: 1.0, '90/10': 0.9, '80/20': 0.8, '70/30': 0.7, '60/40': 0.6, '50/50': 0.5}
# Values absent from the mapping pass through unchanged, as they did with Series.replace.
# pd.to_numeric then guarantees a numeric column and raises right here - instead of deep
# inside the scaler - if an unmapped, non-numeric label is still present.
inputs['solv_ratio'] = pd.to_numeric(
    inputs['solv_ratio'].map(lambda ratio: type_mapping.get(ratio, ratio))
)

# Encode categorical input 'add_type' as integer labels; the encoder is kept so the
# original category names can be recovered with le_addType.inverse_transform
le_addType = LabelEncoder()
inputs['add_type'] = le_addType.fit_transform(inputs['add_type'])

# Split the data into training and test sets (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(inputs, target, test_size=0.2, random_state=10)

# Scale the x data for better fitting.
# The scaler is fitted on the training split only, so no test-set statistics leak into it;
# both splits are then transformed with those training-derived parameters.
xscaler.fit(X_train)
X_train = xscaler.transform(X_train)
X_test = xscaler.transform(X_test)

# Grid-Search (hyperparameter tuning)

In [5]:
%%time

# Define parameter grid
param_grid = {
    'n_estimators': [10, 25, 50, 75, 100],
    'criterion': ["gini", "entropy"],  # valid for classifiers
    'max_depth': [None, 5, 10, 20, 30],
    'min_samples_split': [2, 5, 10, 20],
    'max_leaf_nodes': [None, 10, 20, 30, 50],
    'max_features': ['sqrt', 'log2', 1.0],  # 1.0 = use all features
    'bootstrap': [True, False]
}

# Set seed
RANDOM_SEED = 42
CV_FOLDS = 5

# Variables to store the best results
best_model = None
best_params = None
best_avg_score = float('-inf')

# Loop over all combinations
for params in ParameterGrid(param_grid):
    model = RandomForestClassifier(**params, random_state=RANDOM_SEED, n_jobs=-1)
    
    # Cross-validated accuracy on training set
    cv_scores = cross_val_score(model, X_train, y_train, cv=CV_FOLDS, scoring='accuracy', n_jobs=-1)
    cv_accuracy = np.mean(cv_scores)
    
    # Fit model to full training set
    model.fit(X_train, y_train)
    
    # Accuracy on test set
    test_accuracy = model.score(X_test, y_test)
    
    # Average accuracy: cross-val on train + test
    avg_score = (cv_accuracy + test_accuracy) / 2

    if avg_score > best_avg_score:
        best_avg_score = avg_score
        best_model = model
        best_params = params
        best_cv_accuracy = cv_accuracy
        best_test_accuracy = test_accuracy

# Output the results
print("Best Average Accuracy (CV Train + Test):", best_avg_score)
print("CV Train Accuracy:", best_cv_accuracy)
print("Test Accuracy:", best_test_accuracy)
print("Best Parameters:", best_params)

Best Average Accuracy (CV Train + Test): 0.990625
CV Train Accuracy: 0.98125
Test Accuracy: 1.0
Best Parameters: {'bootstrap': True, 'criterion': 'gini', 'max_depth': None, 'max_features': 1.0, 'max_leaf_nodes': None, 'min_samples_split': 10, 'n_estimators': 10}
CPU times: user 4min 31s, sys: 45 s, total: 5min 16s
Wall time: 8min 48s


In [6]:
# Rebuild the classifier from the winning hyperparameters and train it on the full training split.
# RANDOM_SEED (defined in the grid-search cell) replaces the duplicated literal 42 so the
# seed used here cannot drift away from the one used during the search.
best = RandomForestClassifier(**best_params, random_state=RANDOM_SEED, n_jobs=-1)
best.fit(X_train, y_train)

In [7]:
# For a classifier, .score() returns mean accuracy (not R^2), so it is named and reported as accuracy
train_accuracy = best.score(X_train, y_train)
test_accuracy = best.score(X_test, y_test)

# Legacy names kept as aliases so any later cell that still refers to them keeps working
train_r2 = train_accuracy
test_r2 = test_accuracy

print(f'Training accuracy: {train_accuracy}')
print(f'Testing accuracy: {test_accuracy}')

# Create an array for x values: 1-based positions, one per test sample
# (presumably the x axis for later plots - confirm against the plotting cells)
x_len = len(y_test)
actual_x = np.arange(1, x_len + 1)

# Define the output path for figures
output_path = f'../final_figs/{model_output}/'

Training r2: 0.9813664596273292
Testing r2: 1.0


In [8]:
from joblib import dump, load

# Persist the fitted classifier in the current working directory, named after the modelled target
model_file = f'{model_output}.joblib'
dump(best, model_file)

['order_disorder.joblib']

In [11]:
# Class predictions of the refit model for the training and test splits, in that order
rfr_train, rfr_test = (best.predict(features) for features in (X_train, X_test))

In [13]:
from sklearn.metrics import f1_score

# Predict both splits with the refit model
y_pred_train, y_pred_test = best.predict(X_train), best.predict(X_test)

# F1 of each split's predictions against its true labels (f1_score defaults are used)
f1_train, f1_test = (
    f1_score(y_train, y_pred_train),
    f1_score(y_test, y_pred_test),
)

# Report the training score first, then the test score
print("Training F1 Score:", f1_train)
print("Testing F1 Score:", f1_test)

Training F1 Score: 0.9846153846153847
Testing F1 Score: 1.0
