# Adicionando colunas de cálculo para a equação de Danielson, Young e Surko

In [20]:
import pandas as pd
import numpy as np

# Load the molecule table that the DYS columns below are computed from.
# NOTE(review): this same path is overwritten by the `to_csv` cell further down, so a
# re-run reads back the notebook's own output — consider a separate output file.
df_all = pd.read_csv("../../data/final/all_info.csv")
df_all.head()

Unnamed: 0,Molecule,Formula,Ei,Alpha,axx,ayy,azz,AlphaB,pi_bond,Dipole,Expt
0,Butane,C4H10,10.6,8.1,9.3,7.7,7.2,8.1,0,0.0,35.0
1,Pentane,C5H12,10.4,10.0,11.7,9.4,8.7,10.0,0,0.0,60.0
2,Hexane,C6H14,10.2,11.8,14.3,11.0,10.2,11.9,0,0.0,80.0
3,Heptane,C7H16,9.9,13.7,16.9,12.7,11.7,13.8,0,0.0,105.0
4,Octane,C8H18,10.0,15.5,19.6,14.3,13.2,15.7,0,0.0,115.0


In [50]:
def DYS(alpha, dipole, pi_bond, scale=12.4, dipole_coef=1.6, pi_coef=2.4, offset=5.6):
    """Evaluate the Danielson-Young-Surko linear fit, rounded to two decimals.

    Computes ``scale * (alpha + dipole_coef * dipole + pi_coef * pi_bond - offset)``.
    The default coefficients are the ones this notebook has always used (the
    notebook refers to the formula as equation (5) of Danielson, Young and Surko),
    so existing three-argument calls return exactly the same values.

    Parameters
    ----------
    alpha : float or pandas.Series
        Value(s) of the "Alpha" column.
    dipole : float or pandas.Series
        Value(s) of the "Dipole" column.
    pi_bond : int, float or pandas.Series
        Value(s) of the "pi_bond" column; pass 0 to ignore the pi-bond term.
    scale, dipole_coef, pi_coef, offset : float, optional
        Fit coefficients. Override them only to evaluate a re-fitted variant.

    Returns
    -------
    float or pandas.Series
        The fitted value(s), rounded to 2 decimal places.

    Notes
    -----
    NOTE(review): units are not stated anywhere in this notebook — presumably the
    result is in meV with alpha in cubic angstroms and dipole in debye; confirm
    against the paper.
    """
    e_b = scale * (alpha + dipole_coef * dipole + pi_coef * pi_bond - offset)

    # Built-in round() dispatches to __round__, so it handles scalars and Series alike.
    return round(e_b, 2)

def calculate_DYS(row, aromatic_molecules=("Benzene", "Naphthalene")):
    """Row-wise DYS prediction that counts pi bonds only for aromatic molecules.

    Intended for ``DataFrame.apply(calculate_DYS, axis=1)``.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas row Series)
        Must provide the keys "Molecule", "Alpha", "Dipole" and "pi_bond".
    aromatic_molecules : collection of str, optional
        Molecule names whose "pi_bond" value is kept. Every other molecule is
        evaluated with a pi-bond count of 0. Defaults to the two names this
        notebook originally hard-coded.

    Returns
    -------
    The value returned by ``DYS`` for this row.
    """
    # Only molecules listed as aromatic contribute their pi-bond term.
    pi_bond = row["pi_bond"] if row["Molecule"] in aromatic_molecules else 0
    return DYS(row["Alpha"], row["Dipole"], pi_bond)

In [45]:
# DYS_B: prediction from equation (5) of Danielson, Young and Surko, using the
# pi_bond value of every molecule.
df_b = df_all.assign(
    DYS_B=DYS(df_all["Alpha"], df_all["Dipole"], df_all["pi_bond"])
)
df_b

Unnamed: 0,Molecule,Formula,Ei,Alpha,axx,ayy,azz,AlphaB,pi_bond,Dipole,Expt,DYS_B
0,Butane,C4H10,10.6,8.1,9.3,7.7,7.2,8.1,0,0.0,35.0,31.0
1,Pentane,C5H12,10.4,10.0,11.7,9.4,8.7,10.0,0,0.0,60.0,54.56
2,Hexane,C6H14,10.2,11.8,14.3,11.0,10.2,11.9,0,0.0,80.0,76.88
3,Heptane,C7H16,9.9,13.7,16.9,12.7,11.7,13.8,0,0.0,105.0,100.44
4,Octane,C8H18,10.0,15.5,19.6,14.3,13.2,15.7,0,0.0,115.0,122.76
5,Nonane,C9H20,10.0,17.4,21.3,14.9,14.0,16.8,0,0.0,145.0,146.32
6,Dodecane,C12H26,9.9,22.9,,,,,0,0.0,220.0,214.52
7,Hexadecane,C16H34,9.9,30.3,,,,,0,0.0,310.0,306.28
8,Ethylene,C2H4,10.5,4.2,5.4,3.8,3.3,4.2,1,0.0,20.0,12.4
9,Cyclopropane,C3H6,9.9,5.7,5.8,5.8,5.0,5.5,0,0.0,10.0,1.24


In [51]:
# DYS_A: prediction from equation (5) of Danielson, Young and Surko, counting
# pi_bond only for the aromatic molecules (see calculate_DYS).

df_a = df_all.assign(DYS_A=lambda frame: frame.apply(calculate_DYS, axis=1))
df_a.head(13)

Unnamed: 0,Molecule,Formula,Ei,Alpha,axx,ayy,azz,AlphaB,pi_bond,Dipole,Expt,DYS_A
0,Butane,C4H10,10.6,8.1,9.3,7.7,7.2,8.1,0,0.0,35.0,31.0
1,Pentane,C5H12,10.4,10.0,11.7,9.4,8.7,10.0,0,0.0,60.0,54.56
2,Hexane,C6H14,10.2,11.8,14.3,11.0,10.2,11.9,0,0.0,80.0,76.88
3,Heptane,C7H16,9.9,13.7,16.9,12.7,11.7,13.8,0,0.0,105.0,100.44
4,Octane,C8H18,10.0,15.5,19.6,14.3,13.2,15.7,0,0.0,115.0,122.76
5,Nonane,C9H20,10.0,17.4,21.3,14.9,14.0,16.8,0,0.0,145.0,146.32
6,Dodecane,C12H26,9.9,22.9,,,,,0,0.0,220.0,214.52
7,Hexadecane,C16H34,9.9,30.3,,,,,0,0.0,310.0,306.28
8,Ethylene,C2H4,10.5,4.2,5.4,3.8,3.3,4.2,1,0.0,20.0,-17.36
9,Cyclopropane,C3H6,9.9,5.7,5.8,5.8,5.0,5.5,0,0.0,10.0,1.24


In [55]:
# Spot check: DYS for alpha = 8.2 with zero dipole and no pi bonds.
DYS(8.2, 0, 0)

32.24

In [52]:
# DYS_A: prediction from equation (5) of Danielson, Young and Surko, counting
#        pi_bond only for the aromatic molecules.
# DYS_B: same equation, using the pi_bond value of every molecule.
#
# df_a and df_b are both built from df_all with .assign, so they share its index.
# Attach DYS_B by index rather than with pd.merge and no `on`: that form joins on
# every shared column (including float columns that contain NaN) and would
# multiply any rows that happen to be duplicated.
df_ab = df_a.assign(DYS_B=df_b["DYS_B"])
df_ab

Unnamed: 0,Molecule,Formula,Ei,Alpha,axx,ayy,azz,AlphaB,pi_bond,Dipole,Expt,DYS_A,DYS_B
0,Butane,C4H10,10.6,8.1,9.3,7.7,7.2,8.1,0,0.0,35.0,31.0,31.0
1,Pentane,C5H12,10.4,10.0,11.7,9.4,8.7,10.0,0,0.0,60.0,54.56,54.56
2,Hexane,C6H14,10.2,11.8,14.3,11.0,10.2,11.9,0,0.0,80.0,76.88,76.88
3,Heptane,C7H16,9.9,13.7,16.9,12.7,11.7,13.8,0,0.0,105.0,100.44,100.44
4,Octane,C8H18,10.0,15.5,19.6,14.3,13.2,15.7,0,0.0,115.0,122.76,122.76
5,Nonane,C9H20,10.0,17.4,21.3,14.9,14.0,16.8,0,0.0,145.0,146.32,146.32
6,Dodecane,C12H26,9.9,22.9,,,,,0,0.0,220.0,214.52,214.52
7,Hexadecane,C16H34,9.9,30.3,,,,,0,0.0,310.0,306.28,306.28
8,Ethylene,C2H4,10.5,4.2,5.4,3.8,3.3,4.2,1,0.0,20.0,-17.36,12.4
9,Cyclopropane,C3H6,9.9,5.7,5.8,5.8,5.0,5.5,0,0.0,10.0,1.24,1.24


In [57]:
# NOTE(review): this writes to the same path df_all was read from, replacing the input
# file with the table that now carries DYS_A and DYS_B. index=False leaves the row
# index out of the file.
df_ab.to_csv("../../data/final/all_info.csv", index=False)

# Comparando o MAPE para DYS_A e DYS_B

In [71]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_percentage_error

# Reload the saved table, which includes the DYS_A and DYS_B prediction columns.
df = pd.read_csv("../../data/final/all_info.csv")
df.head()

Unnamed: 0,Molecule,Formula,Ei,Alpha,axx,ayy,azz,AlphaB,pi_bond,Dipole,Expt,DYS_A,DYS_B
0,Butane,C4H10,10.6,8.1,9.3,7.7,7.2,8.1,0,0.0,35.0,31.0,31.0
1,Pentane,C5H12,10.4,10.0,11.7,9.4,8.7,10.0,0,0.0,60.0,54.56,54.56
2,Hexane,C6H14,10.2,11.8,14.3,11.0,10.2,11.9,0,0.0,80.0,76.88,76.88
3,Heptane,C7H16,9.9,13.7,16.9,12.7,11.7,13.8,0,0.0,105.0,100.44,100.44
4,Octane,C8H18,10.0,15.5,19.6,14.3,13.2,15.7,0,0.0,115.0,122.76,122.76


## Moléculas Apolares

In [72]:
# Non-polar subset: rows whose Dipole value is exactly zero.
df_apol = df.query("Dipole == 0")
df_apol

Unnamed: 0,Molecule,Formula,Ei,Alpha,axx,ayy,azz,AlphaB,pi_bond,Dipole,Expt,DYS_A,DYS_B
0,Butane,C4H10,10.6,8.1,9.3,7.7,7.2,8.1,0,0.0,35.0,31.0,31.0
1,Pentane,C5H12,10.4,10.0,11.7,9.4,8.7,10.0,0,0.0,60.0,54.56,54.56
2,Hexane,C6H14,10.2,11.8,14.3,11.0,10.2,11.9,0,0.0,80.0,76.88,76.88
3,Heptane,C7H16,9.9,13.7,16.9,12.7,11.7,13.8,0,0.0,105.0,100.44,100.44
4,Octane,C8H18,10.0,15.5,19.6,14.3,13.2,15.7,0,0.0,115.0,122.76,122.76
5,Nonane,C9H20,10.0,17.4,21.3,14.9,14.0,16.8,0,0.0,145.0,146.32,146.32
6,Dodecane,C12H26,9.9,22.9,,,,,0,0.0,220.0,214.52,214.52
7,Hexadecane,C16H34,9.9,30.3,,,,,0,0.0,310.0,306.28,306.28
8,Ethylene,C2H4,10.5,4.2,5.4,3.8,3.3,4.2,1,0.0,20.0,-17.36,12.4
9,Cyclopropane,C3H6,9.9,5.7,5.8,5.8,5.0,5.5,0,0.0,10.0,1.24,1.24


In [73]:
# MAPE of each DYS variant against the experimental values, non-polar rows only.
# scikit-learn returns a fraction, not a percentage (0.31 means 31 %).
error_dysA, error_dysB = (
    mean_absolute_percentage_error(df_apol['Expt'], df_apol[prediction])
    for prediction in ('DYS_A', 'DYS_B')
)

error_dysA, error_dysB

(0.30705263874608785, 0.3294107157567964)

## Moléculas Polares

In [74]:
# Polar subset: rows whose Dipole value is non-zero.
df_pol = df.query("Dipole != 0")
df_pol.head()

Unnamed: 0,Molecule,Formula,Ei,Alpha,axx,ayy,azz,AlphaB,pi_bond,Dipole,Expt,DYS_A,DYS_B
20,Bromoform,CHBr3,10.5,11.3,13.2,13.2,8.6,11.7,0,0.9,130.0,88.54,88.54
21,Cyclopentanone,C5H8O,9.3,9.0,10.3,9.6,7.6,9.2,1,3.3,230.0,107.63,137.39
22,Ethyl Acetate(conformer1),C4H8O2,10.0,8.7,10.8,8.9,7.1,8.9,1,2.1,160.0,80.1,109.86
23,1-Butanol,C4H10O,10.0,8.5,10.4,8.3,7.7,8.8,0,1.6,102.0,67.7,67.7
24,Chloroform,CHCl3,11.4,8.2,9.4,9.3,6.5,8.4,0,1.1,50.0,54.06,54.06


In [75]:
# MAPE of each DYS variant against the experimental values, polar rows only.
# scikit-learn returns a fraction, not a percentage.
error_dysA, error_dysB = (
    mean_absolute_percentage_error(df_pol['Expt'], df_pol[prediction])
    for prediction in ('DYS_A', 'DYS_B')
)

error_dysA, error_dysB

(0.4079286971272076, 0.33281287518233754)

## Todas as Moléculas

In [76]:
# MAPE of each DYS variant against the experimental values, over every row.
# scikit-learn returns a fraction, not a percentage.
error_dysA, error_dysB = (
    mean_absolute_percentage_error(df['Expt'], df[prediction])
    for prediction in ('DYS_A', 'DYS_B')
)

error_dysA, error_dysB

(0.37190153341966486, 0.33159781824464435)