In [1]:
import numpy as np
import pandas as pd
import os
import joblib
import sklearn
from sklearn.svm import SVR
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [2]:
#Loads the raw dataset.
#Only truly empty cells are parsed as NaN. The literal text 'None' is used in this file as a marker
#(the one-hot encoder below produces '..._None' categories and the filtering cells compare the output
#columns against 'None'), so it has to stay a string. pandas >= 2.0 would otherwise turn 'None' into NaN,
#because it was added to read_csv's default NA markers.
df = pd.read_csv('../../flo_dataset_test_HI.csv', keep_default_na=False, na_values=[''])
df

Unnamed: 0,in_source,in_amount_mmol,p_source,p_amount_mmol,ligand_source,ligand_amount_mmol,first_sol,first_sol_amount_ml,second_sol,second_sol_amount_ml,other_1,other_1_amount_mmol,other_2,other_2_amount_mmol,total_volume_ml,temp_c,time_min,diameter_nm,abs_nm,emission_nm
0,indium acetate,1.20,tris(trimethylsilyl)phosphine,1.2,,0.0,octadecene,3.0,dioctylamine,0.151,,0.0,,0.0,3.151,178,840.0,6.7,622,680
1,indium acetate,0.40,tris(trimethylsilyl)phosphine,0.4,,0.0,octadecene,3.0,oleylamine,0.494,,0.0,,0.0,3.494,178,4.0,1.5,445,568
2,indium acetate,1.20,tris(trimethylsilyl)phosphine,1.2,,0.0,octadecene,3.0,oleylamine,0.494,,0.0,,0.0,3.494,178,32.0,2.5,543,589
3,indium acetate,0.20,tris(trimethylsilyl)phosphine,0.2,myristic acid,0.7,octadecene,6.5,octylamine,0.199,,0.0,,0.0,6.699,130,30.0,,412,
4,indium acetate,0.20,tris(trimethylsilyl)phosphine,0.2,myristic acid,0.7,octadecene,6.5,octylamine,0.199,,0.0,,0.0,6.699,130,1.0,,419,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
152,indium myristate,0.10,tris(trimethylsilyl)phosphine,0.1,,0.0,octadecene,8.5,,0.000,zinc stearate,0.1,,0.0,8.500,300,20.0,,500,550
153,indium trifluoroacetate,0.11,tris(trimethylsilyl)phosphine,0.1,myristic acid,0.3,octadecene,7.0,,0.000,,0.0,,0.0,7.000,270,240.0,8.3,532,590
154,indium trifluoroacetate,0.11,tris(trimethylsilyl)phosphine,0.1,myristic acid,0.3,octadecene,7.0,,0.000,,0.0,,0.0,7.000,270,120.0,,549,
155,indium trifluoroacetate,0.11,tris(trimethylsilyl)phosphine,0.1,myristic acid,0.3,octadecene,7.0,,0.000,,0.0,,0.0,7.000,270,60.0,,562,


In [3]:
#Checks if there are any columns with empty cells
#NOTE: isna() only counts real NaN values. Entries holding the literal string 'None'
#(which the filtering cells further down test for) are NOT counted here, so a column
#reporting 0 can still contain 'None' placeholders.
df.isna().sum()

in_source               0
in_amount_mmol          0
p_source                0
p_amount_mmol           0
ligand_source           0
ligand_amount_mmol      0
first_sol               0
first_sol_amount_ml     0
second_sol              0
second_sol_amount_ml    0
other_1                 0
other_1_amount_mmol     0
other_2                 0
other_2_amount_mmol     0
total_volume_ml         0
temp_c                  0
time_min                0
diameter_nm             0
abs_nm                  0
emission_nm             0
dtype: int64

# Scaling and Transforming

In [4]:
#Split the raw table in two: the three measured properties are the outputs (left unscaled),
#every remaining column is an input feature
output_columns = ['diameter_nm', 'abs_nm', 'emission_nm']
df_output = df[output_columns]
df_input = df.drop(columns=output_columns)

In [5]:
#Removes leading/trailing whitespace from the column names of both the input and the output frame
for frame in (df_input, df_output):
    frame.columns = frame.columns.str.strip()

In [6]:
#Prints the column names, non-null counts and dtypes of the input features
df_input.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157 entries, 0 to 156
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   in_source             157 non-null    object 
 1   in_amount_mmol        157 non-null    float64
 2   p_source              157 non-null    object 
 3   p_amount_mmol         157 non-null    float64
 4   ligand_source         157 non-null    object 
 5   ligand_amount_mmol    157 non-null    float64
 6   first_sol             157 non-null    object 
 7   first_sol_amount_ml   157 non-null    float64
 8   second_sol            157 non-null    object 
 9   second_sol_amount_ml  157 non-null    float64
 10  other_1               157 non-null    object 
 11  other_1_amount_mmol   157 non-null    float64
 12  other_2               157 non-null    object 
 13  other_2_amount_mmol   157 non-null    float64
 14  total_volume_ml       157 non-null    float64
 15  temp_c                1

In [7]:
#Cast the temperature column to float (per the original note it is read in as an integer column),
#then print the frame summary again to confirm the dtype
df_input = df_input.assign(temp_c=df_input['temp_c'].astype(float))
df_input.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157 entries, 0 to 156
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   in_source             157 non-null    object 
 1   in_amount_mmol        157 non-null    float64
 2   p_source              157 non-null    object 
 3   p_amount_mmol         157 non-null    float64
 4   ligand_source         157 non-null    object 
 5   ligand_amount_mmol    157 non-null    float64
 6   first_sol             157 non-null    object 
 7   first_sol_amount_ml   157 non-null    float64
 8   second_sol            157 non-null    object 
 9   second_sol_amount_ml  157 non-null    float64
 10  other_1               157 non-null    object 
 11  other_1_amount_mmol   157 non-null    float64
 12  other_2               157 non-null    object 
 13  other_2_amount_mmol   157 non-null    float64
 14  total_volume_ml       157 non-null    float64
 15  temp_c                1

In [8]:
#Initializes 2 lists to contain all of the numerical and categorical input columns
#The dtype is looked up on df_input itself (not on the raw df): df_input is the frame whose column
#names were stripped and whose dtypes were adjusted above, and it is the frame the transformer is fitted on.
input_num_cols = [col for col in df_input.columns if df_input[col].dtype != 'O']
input_cat_cols = [col for col in df_input.columns if df_input[col].dtype == 'O']

In [9]:
#Shows which input columns were classified as numerical and which as categorical
print(input_num_cols, input_cat_cols)

['in_amount_mmol', 'p_amount_mmol', 'ligand_amount_mmol', 'first_sol_amount_ml', 'second_sol_amount_ml', 'other_1_amount_mmol', 'other_2_amount_mmol', 'total_volume_ml', 'temp_c', 'time_min'] ['in_source', 'p_source', 'ligand_source', 'first_sol', 'second_sol', 'other_1', 'other_2']


In [10]:
#Initializes the ColumnTransformer object, and specifies what it will do with a dataframe
#step1: StandardScaler scales the numerical columns
#step2: OneHotEncoder creates a binary column for each categorical entry (dense array output;
#       handle_unknown='ignore' means categories not seen during fit do not raise an error)
#The dense-output switch of OneHotEncoder was renamed from `sparse` to `sparse_output` in
#scikit-learn 1.2 and the old keyword was removed in 1.4, so the new name is tried first and
#the old one is used as a fallback on older releases.
try:
    one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    one_hot_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

ct = ColumnTransformer([
    ('step1', StandardScaler(), input_num_cols),
    ('step2', one_hot_encoder, input_cat_cols)
], remainder = 'passthrough')

In [11]:
#what are the transformers in ct
#(shows the (name, transformer, columns) triples exactly as they were passed to the constructor)
ct.transformers

[('step1',
  StandardScaler(),
  ['in_amount_mmol',
   'p_amount_mmol',
   'ligand_amount_mmol',
   'first_sol_amount_ml',
   'second_sol_amount_ml',
   'other_1_amount_mmol',
   'other_2_amount_mmol',
   'total_volume_ml',
   'temp_c',
   'time_min']),
 ('step2',
  OneHotEncoder(handle_unknown='ignore', sparse=False),
  ['in_source',
   'p_source',
   'ligand_source',
   'first_sol',
   'second_sol',
   'other_1',
   'other_2'])]

In [12]:
#Fits the ColumnTransformer on the input features and applies it in one go;
#the resulting array is wrapped in a DataFrame (columns are just numbered at this point)
scaled_encoded_values = ct.fit_transform(df_input)
df_input_scaled_encoded = pd.DataFrame(scaled_encoded_values)
df_input_scaled_encoded

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,47,48,49,50,51,52,53,54,55,56
0,0.713177,0.328895,-0.453863,-0.453709,-0.297521,-0.749374,-0.18627,-0.447720,-0.694687,6.731028,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,-0.336633,-0.477469,-0.453863,-0.453709,-0.076698,-0.749374,-0.18627,-0.426587,-0.694687,-0.357533,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.713177,0.328895,-0.453863,-0.453709,-0.076698,-0.749374,-0.18627,-0.426587,-0.694687,-0.120118,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,-0.599085,-0.679060,-0.197558,-0.219170,-0.266618,-0.749374,-0.18627,-0.229122,-1.714392,-0.137076,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,-0.599085,-0.679060,-0.197558,-0.219170,-0.266618,-0.749374,-0.18627,-0.229122,-1.714392,-0.382971,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
152,-0.730311,-0.779855,-0.453863,-0.085147,-0.394734,-0.678852,-0.18627,-0.118160,1.897062,-0.221867,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
153,-0.717189,-0.779855,-0.344018,-0.185664,-0.394734,-0.749374,-0.18627,-0.210577,1.259747,1.643544,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
154,-0.717189,-0.779855,-0.344018,-0.185664,-0.394734,-0.749374,-0.18627,-0.210577,1.259747,0.626047,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
155,-0.717189,-0.779855,-0.344018,-0.185664,-0.394734,-0.749374,-0.18627,-0.210577,1.259747,0.117298,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [13]:
#Number of categorical columns
#The names are built from the fitted encoder's categories_ as 'x<column position>_<category>'
#(e.g. 'x0_indium acetate'), i.e. the same names OneHotEncoder.get_feature_names() produced,
#without depending on that method (it was removed in scikit-learn 1.2).
fitted_encoder = ct.named_transformers_['step2']
array_of_cat_titles = np.array(
    ['x' + str(position) + '_' + str(category)
     for position, categories in enumerate(fitted_encoder.categories_)
     for category in categories],
    dtype=object,
)
len(array_of_cat_titles)

47

In [14]:
#Number of numerical columns
#(together with the number of categorical titles above this gives the expected total column count)
len(input_num_cols)

10

In [15]:
#renaming columns
#The transformer output is ordered as: scaled numerical columns first, then the one-hot columns,
#so the new names are the numerical names followed by the categorical titles.
#All names are assigned in one step instead of one rename() per column; this also avoids relying on
#a loop variable left over from a previous loop (which failed when there were no numerical columns).
new_column_names = list(input_num_cols) + list(array_of_cat_titles)
#any columns beyond the named ones keep their current labels
unnamed_tail = list(df_input_scaled_encoded.columns[len(new_column_names):])
df_input_scaled_encoded.columns = new_column_names + unnamed_tail

In [16]:
#Displays the transformed inputs again, now with the readable column names
df_input_scaled_encoded

Unnamed: 0,in_amount_mmol,p_amount_mmol,ligand_amount_mmol,first_sol_amount_ml,second_sol_amount_ml,other_1_amount_mmol,other_2_amount_mmol,total_volume_ml,temp_c,time_min,...,x5_zinc chloride,x5_zinc iodide,x5_zinc oleate,x5_zinc stearate,x5_zinc undecylenate,x6_None,x6_copper bromide,x6_trioctylphosphine,x6_water,x6_zinc iodide
0,0.713177,0.328895,-0.453863,-0.453709,-0.297521,-0.749374,-0.18627,-0.447720,-0.694687,6.731028,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,-0.336633,-0.477469,-0.453863,-0.453709,-0.076698,-0.749374,-0.18627,-0.426587,-0.694687,-0.357533,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.713177,0.328895,-0.453863,-0.453709,-0.076698,-0.749374,-0.18627,-0.426587,-0.694687,-0.120118,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,-0.599085,-0.679060,-0.197558,-0.219170,-0.266618,-0.749374,-0.18627,-0.229122,-1.714392,-0.137076,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,-0.599085,-0.679060,-0.197558,-0.219170,-0.266618,-0.749374,-0.18627,-0.229122,-1.714392,-0.382971,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
152,-0.730311,-0.779855,-0.453863,-0.085147,-0.394734,-0.678852,-0.18627,-0.118160,1.897062,-0.221867,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
153,-0.717189,-0.779855,-0.344018,-0.185664,-0.394734,-0.749374,-0.18627,-0.210577,1.259747,1.643544,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
154,-0.717189,-0.779855,-0.344018,-0.185664,-0.394734,-0.749374,-0.18627,-0.210577,1.259747,0.626047,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
155,-0.717189,-0.779855,-0.344018,-0.185664,-0.394734,-0.749374,-0.18627,-0.210577,1.259747,0.117298,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [17]:
#Lists every column name of the transformed inputs (numerical names first, then the one-hot names)
df_input_scaled_encoded.columns

Index(['in_amount_mmol', 'p_amount_mmol', 'ligand_amount_mmol',
       'first_sol_amount_ml', 'second_sol_amount_ml', 'other_1_amount_mmol',
       'other_2_amount_mmol', 'total_volume_ml', 'temp_c', 'time_min',
       'x0_indium acetate', 'x0_indium bromide', 'x0_indium chloride',
       'x0_indium iodide', 'x0_indium myristate', 'x0_indium trifluoroacetate',
       'x1_bis(trimethylsilyl)phosphine', 'x1_phosphorus trichloride',
       'x1_tris(diethylamino)phosphine', 'x1_tris(dimethylamino)phosphine',
       'x1_tris(trimethylgermyl)phosphine', 'x1_tris(trimethylsilyl)phosphine',
       'x2_None', 'x2_lauric acid', 'x2_myristic acid', 'x2_oleic acid',
       'x2_palmitic acid', 'x2_stearic acid', 'x3_dodecylamine',
       'x3_octadecene', 'x3_oleylamine', 'x3_trioctylamine',
       'x3_trioctylphosphine', 'x4_None', 'x4_dioctyl ether',
       'x4_dioctylamine', 'x4_hexadecylamine', 'x4_octylamine',
       'x4_oleylamine', 'x4_toluene', 'x4_trioctylphosphine',
       'x4_trioctylphos

In [18]:
#Places the (unscaled) output columns to the right of the transformed input columns
frames_side_by_side = [df_input_scaled_encoded, df_output]
df_scaled_encoded = pd.concat(frames_side_by_side, axis='columns')
df_scaled_encoded

Unnamed: 0,in_amount_mmol,p_amount_mmol,ligand_amount_mmol,first_sol_amount_ml,second_sol_amount_ml,other_1_amount_mmol,other_2_amount_mmol,total_volume_ml,temp_c,time_min,...,x5_zinc stearate,x5_zinc undecylenate,x6_None,x6_copper bromide,x6_trioctylphosphine,x6_water,x6_zinc iodide,diameter_nm,abs_nm,emission_nm
0,0.713177,0.328895,-0.453863,-0.453709,-0.297521,-0.749374,-0.18627,-0.447720,-0.694687,6.731028,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,6.7,622,680
1,-0.336633,-0.477469,-0.453863,-0.453709,-0.076698,-0.749374,-0.18627,-0.426587,-0.694687,-0.357533,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.5,445,568
2,0.713177,0.328895,-0.453863,-0.453709,-0.076698,-0.749374,-0.18627,-0.426587,-0.694687,-0.120118,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.5,543,589
3,-0.599085,-0.679060,-0.197558,-0.219170,-0.266618,-0.749374,-0.18627,-0.229122,-1.714392,-0.137076,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,,412,
4,-0.599085,-0.679060,-0.197558,-0.219170,-0.266618,-0.749374,-0.18627,-0.229122,-1.714392,-0.382971,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,,419,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
152,-0.730311,-0.779855,-0.453863,-0.085147,-0.394734,-0.678852,-0.18627,-0.118160,1.897062,-0.221867,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,,500,550
153,-0.717189,-0.779855,-0.344018,-0.185664,-0.394734,-0.749374,-0.18627,-0.210577,1.259747,1.643544,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,8.3,532,590
154,-0.717189,-0.779855,-0.344018,-0.185664,-0.394734,-0.749374,-0.18627,-0.210577,1.259747,0.626047,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,,549,
155,-0.717189,-0.779855,-0.344018,-0.185664,-0.394734,-0.749374,-0.18627,-0.210577,1.259747,0.117298,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,,562,


In [19]:
#Writes the complete scaled/encoded dataset (inputs + outputs) to a CSV file in the working directory
#No index argument is given, so the row index is written as well.
df_scaled_encoded.to_csv('flo_dataset_scaled.csv')

# Making separate absorbance, emission and diameter datasets

In [20]:
#Saves into a list the row labels to drop for absorbance dataset
#A row is dropped when its abs_nm entry is missing: either the literal string 'None' or a real NaN
#(NaN is what 'None' becomes if the CSV reader parses it as a missing value).
#Index labels (not positions) are collected because DataFrame.drop() works on labels.
total_row_num = len(df_scaled_encoded)
abs_values = df_scaled_encoded['abs_nm']
abs_missing = abs_values.isna() | abs_values.eq('None')
drop_list_abs = df_scaled_encoded.index[abs_missing].tolist()

#number of entries
print(total_row_num-len(drop_list_abs))

152


In [21]:
#Keeps only the rows that have an absorbance output and saves them to their own CSV file
df_absorbance_scaled_encoded = df_scaled_encoded.drop(index=drop_list_abs)
df_absorbance_scaled_encoded.to_csv('dataset_scaled_abs.csv')

In [22]:
#Saves the row labels to drop for emission dataset
#A row is dropped when its emission_nm entry is missing: either the literal string 'None' or a real NaN
#(NaN is what 'None' becomes if the CSV reader parses it as a missing value).
#Index labels (not positions) are collected because DataFrame.drop() works on labels.
total_row_num = len(df_scaled_encoded)
emission_values = df_scaled_encoded['emission_nm']
emission_missing = emission_values.isna() | emission_values.eq('None')
drop_list_em = df_scaled_encoded.index[emission_missing].tolist()

#number of entries
print(total_row_num-len(drop_list_em))

49


In [24]:
#Keeps only the rows that have an emission output and saves them to their own CSV file
df_emission_scaled_encoded = df_scaled_encoded.drop(index=drop_list_em)
df_emission_scaled_encoded.to_csv('dataset_scaled_em.csv')

In [25]:
#Saves the row labels to drop for diameter dataset
#A row is dropped when its diameter_nm entry is missing: either the literal string 'None' or a real NaN
#(NaN is what 'None' becomes if the CSV reader parses it as a missing value).
#Index labels (not positions) are collected because DataFrame.drop() works on labels.
total_row_num = len(df_scaled_encoded)
diameter_values = df_scaled_encoded['diameter_nm']
diameter_missing = diameter_values.isna() | diameter_values.eq('None')
drop_list_diam = df_scaled_encoded.index[diameter_missing].tolist()

#number of entries
print(total_row_num-len(drop_list_diam))

72


In [26]:
#Keeps only the rows that have a diameter output and saves them to their own CSV file
df_diameter_scaled_encoded = df_scaled_encoded.drop(index=drop_list_diam)
df_diameter_scaled_encoded.to_csv('dataset_scaled_diam.csv')