In [3]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

In [5]:
# Read the dataset and discard the auto-generated index column in one step
df = pd.read_csv('../model_data.csv').drop(columns=["Unnamed: 0"])

# Data-quality report: missing values per column ...
na_counts = df.isna().sum()
print("# of na rows per column", na_counts, sep="\n")

# ... and the number of fully duplicated rows (reported only, rows are not removed)
duplicate_count = df.duplicated().sum()
print("# of duplicates", duplicate_count, sep="\n")

def determine_phase(df):
    """Return the phase label for a single record.

    Despite its name, ``df`` is one row (anything indexable by column name),
    as passed by ``DataFrame.apply(..., axis=1)``.

    Returns:
        'liquid'       -- water content is non-zero and ``contains_solid`` == 0
        'liquid/solid' -- water content is non-zero and ``contains_solid`` == 1
        'solid'        -- every other case (zero water content, or a
                          ``contains_solid`` value that is neither 0 nor 1)
    """
    has_water = df['Water Content(mol)'] != 0
    if not has_water:
        return 'solid'

    solid_flag = df['contains_solid']
    if solid_flag == 0:
        return 'liquid'
    if solid_flag == 1:
        return 'liquid/solid'
    # Water present but the solid flag is not 0/1: same fall-through as the else branch
    return 'solid'

# Classify every row (row-wise apply) and store the label in a new 'phase' column
phase_labels = df.apply(determine_phase, axis=1)
df['phase'] = phase_labels

# Class balance of the derived label
phase_counts = df['phase'].value_counts()
print(phase_counts)

# Last expression of the cell: rendered as the summary-statistics table
df.describe()

# of na rows per column
Temp(K)               0
RH                    0
N_H+(mol)             0
N_NH4+(mol)           0
N_Na+(mol)            0
N_SO42-(mol)          0
N_NO3-(mol)           0
N_Cl-(mol)            0
N_OH-(mol)            0
N_NH3-(mol)           0
Water Content(mol)    0
P_HNO3(atm)           0
P_HCL(atm)            0
P_NH3(atm)            0
P_H2SO4(atm)          0
P_HBr(atm)            0
contains_solid        0
dtype: int64
# of duplicates
0
liquid/solid    11160
liquid           7390
solid            5950
Name: phase, dtype: int64


Unnamed: 0,Temp(K),RH,N_H+(mol),N_NH4+(mol),N_Na+(mol),N_SO42-(mol),N_NO3-(mol),N_Cl-(mol),N_OH-(mol),N_NH3-(mol),Water Content(mol),P_HNO3(atm),P_HCL(atm),P_NH3(atm),P_H2SO4(atm),P_HBr(atm),contains_solid
count,24500.0,24500.0,24500.0,24500.0,24500.0,24500.0,24500.0,24500.0,24500.0,24500.0,24500.0,24500.0,24500.0,24500.0,24500.0,24500.0,24500.0
mean,298.15,0.57,0.0,4.559251e-08,1.953965e-08,7.335343e-09,1.628304e-08,3.419438e-08,0.0,0.0,1.865244e-07,8.957591e-12,3.929536e-11,9.94482e-07,5.107882e-29,0.0,0.698367
std,0.0,0.180558,0.0,1.44864e-08,7.577349e-09,4.396267e-09,8.369788e-09,1.07981e-08,0.0,0.0,2.382971e-07,1.242647e-11,4.166712e-11,6.985366e-07,3.6262440000000003e-29,0.0,0.458976
min,298.15,0.25,0.0,2.324523e-08,7.74841e-09,0.0,3.874205e-09,1.743392e-08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,298.15,0.45,0.0,3.348078e-08,1.355972e-08,4.05e-09,9.685512e-09,2.511059e-08,0.0,0.0,4.22287e-09,9.83153e-13,1.15606e-11,2.65584e-07,0.0,0.0,0.0
50%,298.15,0.575,0.0,4.319325e-08,1.81125e-08,6.657775e-09,1.488113e-08,3.264098e-08,0.0,0.0,6.812875e-08,5.477985e-12,3.882495e-11,1.07618e-06,6.84111e-29,0.0,1.0
75%,298.15,0.7,0.0,5.67e-08,2.43e-08,1e-08,2.125764e-08,4.2525e-08,0.0,0.0,3.08527e-07,1.35651e-11,5.41369e-11,1.53391e-06,8.35019e-29,0.0,1.0
max,298.15,0.85,0.0,8e-08,4e-08,2.25e-08,4e-08,6e-08,0.0,0.0,1.10977e-06,1.6706e-10,4.01064e-10,2.74479e-06,9.84569e-29,0.0,1.0


In [31]:
# Encode the string phase labels as integer codes for the classifier.
# `factor` is the (codes, uniques) pair returned by pd.factorize and
# `definitions[code]` gives back the phase name for a code.
#
# This cell overwrites df['phase'] with the codes, so running it a second time
# would factorize the codes themselves and replace `definitions` with integers.
# Only encode while the column still holds the original (non-numeric) labels.
if not pd.api.types.is_numeric_dtype(df['phase']):
    factor = pd.factorize(df['phase'])
    df['phase'] = factor[0]
    definitions = factor[1]

In [32]:
# Columns excluded from the model inputs; 'phase' is the target and
# 'Water Content(mol)' / 'contains_solid' are the columns it was derived from.
non_feature_columns = [
    'Temp(K)', 'N_H+(mol)', 'N_OH-(mol)', 'N_NH3-(mol)', 'Water Content(mol)',
    'P_HNO3(atm)', 'P_HCL(atm)', 'P_NH3(atm)', 'P_H2SO4(atm)', 'P_HBr(atm)',
    'contains_solid', 'phase',
]

# Target vector and feature matrix (the feature matrix is whatever columns remain)
y = df['phase']
X = df.drop(columns=non_feature_columns)

In [33]:
# Standardise every feature column to zero mean / unit variance (population
# std, ddof=0) and remember the (mean, std) pair per column in `scalers`.
#
# The statistics are taken from the untouched columns in `df`, not from `X`:
# `X` is overwritten below, so computing them from `X` would make a second run
# of this cell re-scale already-scaled values and store ~(0, 1) in `scalers`.
#
# NOTE(review): the statistics use all rows, i.e. they are computed before the
# train/test split, so the test rows influence the scaling.
raw_features = df[X.columns]

scalers = {}
for col in X.columns:
    mean = raw_features[col].mean()
    std = raw_features[col].std(ddof=0)
    # A constant column has std == 0; divide by 1 so the result is 0 instead of NaN/inf
    scale = std if std > 0 else 1.0
    X[col] = (raw_features[col] - mean) / scale
    scalers[col] = (mean, scale)

# Last expression of the cell: summary of the scaled features
X.describe()

Unnamed: 0,RH,N_NH4+(mol),N_Na+(mol),N_SO42-(mol),N_NO3-(mol),N_Cl-(mol)
count,24800.0,24800.0,24800.0,24800.0,24800.0,24800.0
mean,-9.852334e-16,-1.3465930000000001e-17,1.615912e-16,2.246232e-16,1.146037e-17,6.861895000000001e-17
std,1.00002,1.00002,1.00002,1.00002,1.00002,1.00002
min,-1.772316,-0.160742,-2.465838,-1.631763,-0.1337814,-0.3261753
25%,-0.6646185,-0.1266932,-0.7336939,-0.739567,-0.1085876,-0.1295712
50%,0.02769244,-0.0925657,-0.1664835,-0.1342873,-0.08554762,-0.07189606
75%,0.7200034,-0.049454,0.6382907,0.6224487,-0.05576406,0.00677544
max,1.550777,16.39454,2.643839,3.440213,21.52596,23.16241


In [34]:
#split for train / test
# 80/20 shuffled split. A fixed random_state makes the split (and every metric
# derived from it) reproducible across kernel restarts; stratify=y keeps the
# class proportions of `y` the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42, stratify=y
)

In [35]:
# Fitting Random Forest Classification to the Training set
# 100 trees split on information gain (entropy). random_state is fixed so that
# the fitted forest and its feature importances are reproducible between runs.
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    criterion='entropy',
    random_state=42,
)
rf_classifier.fit(X_train, y_train)

In [9]:
# Pair each feature name with its importance from the fitted forest
print(list(zip(X.columns, rf_classifier.feature_importances_)))

# Persist the fitted model. joblib.dump does not create missing directories,
# so make sure the target folder exists before writing.
# NOTE(review): `scalers` and `definitions` are not saved here although the
# model was trained on standardised inputs and integer-coded labels; confirm
# they are persisted elsewhere for use at inference time.
model_path = 'saved_models/rf_phase_classifier.pkl'
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(rf_classifier, model_path)

[('RH', 0.9062775949405316), ('N_NH4+(mol)', 0.01281999765760212), ('N_Na+(mol)', 0.013604433829746385), ('N_SO42-(mol)', 0.022955010948180823), ('N_NO3-(mol)', 0.02442283183869705), ('N_Cl-(mol)', 0.01992013078524196)]


['saved_models/rf_phase_classifier.pkl']