In [1]:
import numpy as np
import pandas as pd
import os
import joblib
import sklearn
from sklearn.svm import SVR
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [3]:
#Load the raw Flo dataset from its CSV file and display the resulting DataFrame
df = pd.read_csv(filepath_or_buffer='../../dataset/flo/Flo_dataset.csv')
df

Unnamed: 0,in_source,in_amount_mmol,p_source,p_amount_mmol,ligand_source,ligand_amount_mmol,first_sol,first_sol_amount_ml,second_sol,second_sol_amount_ml,...,other_1,other_1_amount_mmol,other_2,other_2_amount_mmol,total_volume_ml,temp_c,time_min,diameter_nm,abs_nm,emission_nm
0,indium acetate,1.00,tris(trimethylsilyl)phosphine,1.000,dodecanethiol,0.5,,0.000000,,0.0,...,zinc stearate,2.00,,0.0,0.120,300,30.0,,539,480
1,chloroindium oxalate,1.05,tris(trimethylsilyl)phosphine,1.400,,0.0,trioctylphosphine oxide,0.222222,,0.0,...,,0.00,,0.0,0.223,270,4320.0,2.61,610,
2,indium chloride,0.30,white phosphorus,0.450,,0.0,oleylamine,0.264000,,0.0,...,zinc chloride,1.47,,0.0,0.264,180,30.0,,560,595
3,indium chloride,0.30,white phosphorus,0.450,,0.0,oleylamine,0.264000,,0.0,...,zinc chloride,1.47,,0.0,0.264,210,30.0,,590,635
4,indium acetate,1.00,tris(trimethylsilyl)phosphine,1.000,dodecanethiol,0.5,octadecene,1.000000,,0.0,...,zinc octanoate,2.00,,0.0,1.120,180,60.0,,,500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
217,indium acetate,1.00,tris(trimethylsilyl)phosphine,0.499,myristic acid,3.0,octadecene,75.000000,,0.0,...,,0.00,,0.0,75.000,270,180.0,4,585,630
218,indium chloride,10.00,sodium phosphide,11.300,,0.0,dimethylformamide,90.000000,,0.0,...,,0.00,,0.0,90.000,160,120.0,,465,550
219,indium acetate,4.00,tris(trimethylsilyl)phosphine,2.000,palmitic acid,12.0,octadecene,100.000000,trioctylphosphine,10.0,...,,0.00,,0.0,110.000,260,1.0,,465,
220,indium acetate,4.00,tris(trimethylsilyl)phosphine,2.000,palmitic acid,12.0,octadecene,100.000000,trioctylphosphine,10.0,...,,0.00,,0.0,110.000,260,20.0,,495,


In [4]:
#Counts the missing (NaN) entries in every column of the raw DataFrame
df.isnull().sum()

in_source               0
in_amount_mmol          0
p_source                0
p_amount_mmol           0
ligand_source           0
ligand_amount_mmol      0
first_sol               0
first_sol_amount_ml     0
second_sol              0
second_sol_amount_ml    0
third_sol               0
third_sol_amount_ml     0
other_1                 0
other_1_amount_mmol     0
other_2                 0
other_2_amount_mmol     0
total_volume_ml         0
temp_c                  0
time_min                0
diameter_nm             0
abs_nm                  0
emission_nm             0
dtype: int64

In [5]:
#Split the raw DataFrame in two: the three measured target columns become df_output,
#and everything that is left over becomes the model input df_input
df_output = df[['diameter_nm', 'abs_nm', 'emission_nm']]
df_input = df.drop(['diameter_nm', 'abs_nm', 'emission_nm'], axis='columns')

In [6]:
#Strip leading/trailing whitespace from the column names of both the input and the output frame
for frame in (df_input, df_output):
    frame.columns = frame.columns.str.strip()

In [7]:
#Print a summary of df_input (column names, non-null counts and dtypes)
df_input.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 222 entries, 0 to 221
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   in_source             222 non-null    object 
 1   in_amount_mmol        222 non-null    float64
 2   p_source              222 non-null    object 
 3   p_amount_mmol         222 non-null    float64
 4   ligand_source         222 non-null    object 
 5   ligand_amount_mmol    222 non-null    float64
 6   first_sol             222 non-null    object 
 7   first_sol_amount_ml   222 non-null    float64
 8   second_sol            222 non-null    object 
 9   second_sol_amount_ml  222 non-null    float64
 10  third_sol             222 non-null    object 
 11  third_sol_amount_ml   222 non-null    float64
 12  other_1               222 non-null    object 
 13  other_1_amount_mmol   222 non-null    float64
 14  other_2               222 non-null    object 
 15  other_2_amount_mmol   2

In [8]:
#Cast the temp_c column to float, then print the summary again to confirm the dtype
df_input = df_input.astype({'temp_c': float})
df_input.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 222 entries, 0 to 221
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   in_source             222 non-null    object 
 1   in_amount_mmol        222 non-null    float64
 2   p_source              222 non-null    object 
 3   p_amount_mmol         222 non-null    float64
 4   ligand_source         222 non-null    object 
 5   ligand_amount_mmol    222 non-null    float64
 6   first_sol             222 non-null    object 
 7   first_sol_amount_ml   222 non-null    float64
 8   second_sol            222 non-null    object 
 9   second_sol_amount_ml  222 non-null    float64
 10  third_sol             222 non-null    object 
 11  third_sol_amount_ml   222 non-null    float64
 12  other_1               222 non-null    object 
 13  other_1_amount_mmol   222 non-null    float64
 14  other_2               222 non-null    object 
 15  other_2_amount_mmol   2

In [9]:
#Initializes 2 lists to contain all of the numerical and categorical input columns.
#The dtypes are read from df_input (not the raw df) because df_input is the frame whose
#headers were stripped, whose temp_c was cast, and which is actually fed to the transformer.
#Object-dtype columns are treated as categorical; every other dtype is treated as numerical.
input_num_cols = [col for col in df_input.columns if df_input[col].dtypes != 'O']
input_cat_cols = [col for col in df_input.columns if df_input[col].dtypes == 'O']

In [10]:
#Display the categorical (object-dtype) input columns that will be one-hot encoded
input_cat_cols

['in_source',
 'p_source',
 'ligand_source',
 'first_sol',
 'second_sol',
 'third_sol',
 'other_1',
 'other_2']

In [11]:
#Initializes the ColumnTransformer object, and specifies what it will do with a passed in dataframe:
#  step1 - StandardScaler scales the numerical columns
#  step2 - OneHotEncoder creates a binary column for each category (dense output;
#          categories unseen during fit are ignored instead of raising)
#Columns listed in neither step are passed through unchanged (remainder='passthrough').
#scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output` and 1.4 removed
#the old name, so try the new keyword first and fall back to the legacy one on older releases.
try:
    onehot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    onehot_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

ct = ColumnTransformer([
    ('step1', StandardScaler(), input_num_cols),
    ('step2', onehot_encoder, input_cat_cols)
], remainder = 'passthrough')

In [14]:
#Display the (name, transformer, columns) steps the ColumnTransformer was configured with
ct.transformers

[('step1',
  StandardScaler(),
  ['in_amount_mmol',
   'p_amount_mmol',
   'ligand_amount_mmol',
   'first_sol_amount_ml',
   'second_sol_amount_ml',
   'third_sol_amount_ml',
   'other_1_amount_mmol',
   'other_2_amount_mmol',
   'total_volume_ml',
   'temp_c',
   'time_min']),
 ('step2',
  OneHotEncoder(handle_unknown='ignore', sparse=False),
  ['in_source',
   'p_source',
   'ligand_source',
   'first_sol',
   'second_sol',
   'third_sol',
   'other_1',
   'other_2'])]

In [15]:
#Fit the ColumnTransformer on the input frame and transform it in one pass,
#then wrap the resulting array in a DataFrame (columns are positional integers at this point)
encoded_matrix = ct.fit_transform(df_input)
df_input_scaled_encoded = pd.DataFrame(encoded_matrix)
df_input_scaled_encoded

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,66,67,68,69,70,71,72,73,74,75
0,0.141371,-0.004155,-0.272127,-0.674671,-0.372078,-0.095346,0.877867,-0.145236,-0.661429,1.587780,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.182693,0.289167,-0.457063,-0.659269,-0.372078,-0.095346,-0.668729,-0.145236,-0.654873,0.989364,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,-0.437142,-0.407474,-0.457063,-0.656373,-0.372078,-0.095346,0.468019,-0.145236,-0.652263,-0.805885,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,-0.437142,-0.407474,-0.457063,-0.656373,-0.372078,-0.095346,0.468019,-0.145236,-0.652263,-0.207469,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.141371,-0.004155,-0.272127,-0.605360,-0.372078,-0.095346,0.877867,-0.145236,-0.597780,-0.805885,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
217,0.141371,-0.371542,0.652557,4.523692,-0.372078,-0.095346,-0.668729,-0.145236,4.104599,0.989364,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
218,7.579398,7.548897,-0.457063,5.563365,-0.372078,-0.095346,-0.668729,-0.145236,5.059333,-1.204830,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
219,2.620713,0.729151,3.981417,6.256480,4.553404,-0.095346,-0.668729,-0.145236,6.332310,0.789892,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
220,2.620713,0.729151,3.981417,6.256480,4.553404,-0.095346,-0.668729,-0.145236,6.332310,0.789892,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [16]:
#Look into the fitted ct object to get the titles of the one-hot encoded columns.
#The fitted encoder is looked up by its step name rather than by position.
fitted_encoder = ct.named_transformers_['step2']
#get_feature_names() was removed in scikit-learn 1.2; keep using it where it still exists so the
#generated titles stay the same as before, otherwise fall back to get_feature_names_out().
#NOTE(review): the fallback is expected to prefix titles with the source column name instead of
#x0..x7 - confirm before relying on the exact titles downstream.
if hasattr(fitted_encoder, 'get_feature_names'):
    array_of_cat_titles = fitted_encoder.get_feature_names()
else:
    array_of_cat_titles = fitted_encoder.get_feature_names_out()
len(array_of_cat_titles) #Total number of encoded categorical columns (65 in the recorded run)

65

In [17]:
#Number of numerical columns in the dataset (these occupy the first positions of the transformed frame)
len(input_num_cols)

11

In [18]:
#Give the transformed frame readable column titles.
#The ColumnTransformer emits its outputs in step order, so the scaled numerical columns come
#first and the one-hot encoded categorical columns follow.
#Assigning all titles at once avoids depending on a loop index left over from an earlier loop,
#is safe to re-run, and raises a ValueError if the number of titles does not match the number
#of columns instead of silently mislabelling them.
new_column_titles = list(input_num_cols) + list(array_of_cat_titles)
df_input_scaled_encoded.columns = new_column_titles

In [19]:
#Display the transformed input frame to check the renamed columns
df_input_scaled_encoded

Unnamed: 0,in_amount_mmol,p_amount_mmol,ligand_amount_mmol,first_sol_amount_ml,second_sol_amount_ml,third_sol_amount_ml,other_1_amount_mmol,other_2_amount_mmol,total_volume_ml,temp_c,...,x6_zinc iodide,x6_zinc octanoate,x6_zinc oleate,x6_zinc stearate,x6_zinc undecylenate,x7_None,x7_copper bromide,x7_oleic acid,x7_water,x7_zinc iodide
0,0.141371,-0.004155,-0.272127,-0.674671,-0.372078,-0.095346,0.877867,-0.145236,-0.661429,1.587780,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.182693,0.289167,-0.457063,-0.659269,-0.372078,-0.095346,-0.668729,-0.145236,-0.654873,0.989364,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,-0.437142,-0.407474,-0.457063,-0.656373,-0.372078,-0.095346,0.468019,-0.145236,-0.652263,-0.805885,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,-0.437142,-0.407474,-0.457063,-0.656373,-0.372078,-0.095346,0.468019,-0.145236,-0.652263,-0.207469,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.141371,-0.004155,-0.272127,-0.605360,-0.372078,-0.095346,0.877867,-0.145236,-0.597780,-0.805885,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
217,0.141371,-0.371542,0.652557,4.523692,-0.372078,-0.095346,-0.668729,-0.145236,4.104599,0.989364,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
218,7.579398,7.548897,-0.457063,5.563365,-0.372078,-0.095346,-0.668729,-0.145236,5.059333,-1.204830,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
219,2.620713,0.729151,3.981417,6.256480,4.553404,-0.095346,-0.668729,-0.145236,6.332310,0.789892,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
220,2.620713,0.729151,3.981417,6.256480,4.553404,-0.095346,-0.668729,-0.145236,6.332310,0.789892,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [20]:
#Place the untouched output columns to the right of the scaled/encoded inputs (aligned on the index)
frames_to_merge = [df_input_scaled_encoded, df_output]
df_scaled_encoded = pd.concat(frames_to_merge, axis='columns')
df_scaled_encoded

Unnamed: 0,in_amount_mmol,p_amount_mmol,ligand_amount_mmol,first_sol_amount_ml,second_sol_amount_ml,third_sol_amount_ml,other_1_amount_mmol,other_2_amount_mmol,total_volume_ml,temp_c,...,x6_zinc stearate,x6_zinc undecylenate,x7_None,x7_copper bromide,x7_oleic acid,x7_water,x7_zinc iodide,diameter_nm,abs_nm,emission_nm
0,0.141371,-0.004155,-0.272127,-0.674671,-0.372078,-0.095346,0.877867,-0.145236,-0.661429,1.587780,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,,539,480
1,0.182693,0.289167,-0.457063,-0.659269,-0.372078,-0.095346,-0.668729,-0.145236,-0.654873,0.989364,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.61,610,
2,-0.437142,-0.407474,-0.457063,-0.656373,-0.372078,-0.095346,0.468019,-0.145236,-0.652263,-0.805885,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,,560,595
3,-0.437142,-0.407474,-0.457063,-0.656373,-0.372078,-0.095346,0.468019,-0.145236,-0.652263,-0.207469,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,,590,635
4,0.141371,-0.004155,-0.272127,-0.605360,-0.372078,-0.095346,0.877867,-0.145236,-0.597780,-0.805885,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,,,500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
217,0.141371,-0.371542,0.652557,4.523692,-0.372078,-0.095346,-0.668729,-0.145236,4.104599,0.989364,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,4,585,630
218,7.579398,7.548897,-0.457063,5.563365,-0.372078,-0.095346,-0.668729,-0.145236,5.059333,-1.204830,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,,465,550
219,2.620713,0.729151,3.981417,6.256480,4.553404,-0.095346,-0.668729,-0.145236,6.332310,0.789892,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,,465,
220,2.620713,0.729151,3.981417,6.256480,4.553404,-0.095346,-0.668729,-0.145236,6.332310,0.789892,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,,495,


In [21]:
#Save the combined frame to a CSV file in the working directory
#(no index argument is given, so the row index is written as the first, unnamed column)
df_scaled_encoded.to_csv(path_or_buf='flo_dataset_scaled.csv')