In [59]:
import pandas as pd
import numpy as np

# Snapshots of the existing "final" datasets (apolar, polar and combined).
# NOTE(review): none of the visible cells reads these three frames again;
# presumably they are kept for a before/after comparison — confirm before removing.
df_apol_old = pd.read_csv("data/final/final_apolar.csv")
df_pol_old = pd.read_csv("data/final/final_polar.csv")
df_pol_apol_old = pd.read_csv("data/final/final_polar_apolar.csv")

# Raw molecule table, read from the "moléculas" sheet of the collection workbook.
df_raw = pd.read_excel(
    "data/raw/coletando_moléculas2024.xlsx",
    sheet_name="moléculas",
)

In [60]:
# Positional rename of the 13 spreadsheet columns to short identifiers.
df_raw.columns = [
    "ord", "Molecule", "Formula", "Expt", "Dipole", "Alpha", "pi_bond",
    "Ei", "axx", "ayy", "azz", "base_anisotropia", "pontos_atencao",
]

df_raw = (
    df_raw
    # remove rows that have no experimental binding-energy value
    # (rows missing Ei or Dipole are removed as well)
    .dropna(subset=["Expt", "Ei", "Dipole"])
    # remove ethane and other molecules whose binding values are undetermined
    .query("Expt > 3")
)

In [61]:
# Drop the columns that are not needed from here on.
df_raw = df_raw.drop(columns=["ord", "base_anisotropia", "pontos_atencao"])

In [62]:
def treat_alpha(x):
    """Convert one raw alpha-component cell into a float, or None if missing.

    Rules, applied in this order:

    * If the text form of ``x`` contains a ``'-'`` the value is treated as
      missing and ``None`` is returned. Note that this also matches negative
      numbers and scientific notation with a negative exponent.
    * Otherwise ``x`` is converted with ``float``; values strictly greater
      than 100 are divided by 1000, all others are returned unchanged.

    Raises whatever ``float(x)`` raises for input that is not numeric.
    """
    # Missing-value marker check comes first, before any numeric conversion.
    if '-' in str(x):
        return None

    value = float(x)
    # Values above 100 are scaled down by a factor of 1000.
    return value / 1000 if value > 100 else value
        

In [63]:
# Clean up the odd molecule names coming from Excel: remove every
# non-breaking space (U+00A0) from the Molecule column.
# regex=False states the literal match explicitly; the default value of
# `regex` is not the same in pandas 1.x and pandas 2.x.
df_raw["Molecule"] = df_raw["Molecule"].str.replace("\xa0", "", regex=False)

In [64]:
# Clean the alpha component columns (axx, ayy, azz) cell by cell with treat_alpha.
for component in ("axx", "ayy", "azz"):
    df_raw[component] = df_raw[component].apply(treat_alpha)

In [65]:
# Create the AlphaB column: row-wise mean of axx, ayy and azz.
df_raw["AlphaB"] = df_raw[["axx", "ayy", "azz"]].mean(axis="columns")

In [66]:
# Zero-filled placeholder column; it is dropped later in the data-preparation flow.
df_raw["Molecule Type"] = np.zeros(len(df_raw))

In [67]:
# Molecules to remove from the dataset (matched on the exact Molecule name).
molecules_to_get_off = [
    '1,1-dicloroetileno/Vinilideno',
    '2,2-difl.propano',
    'Etanol',
    'Tricloroetileno',
    'Dimetilcarbonato',
    '1,2-cis-dicloroetileno',
    '1-fl-hexano',
]

# Keep every row whose name is not in the list and renumber the index from 0.
df_raw = (
    df_raw.loc[~df_raw["Molecule"].isin(molecules_to_get_off)]
    .reset_index(drop=True)
)

In [68]:
# Print the column dtypes and non-null counts of the cleaned frame.
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75 entries, 0 to 74
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Molecule       75 non-null     object 
 1   Formula        75 non-null     object 
 2   Expt           75 non-null     float64
 3   Dipole         75 non-null     float64
 4   Alpha          75 non-null     float64
 5   pi_bond        75 non-null     float64
 6   Ei             75 non-null     float64
 7   axx            64 non-null     float64
 8   ayy            64 non-null     float64
 9   azz            64 non-null     float64
 10  AlphaB         64 non-null     float64
 11  Molecule Type  75 non-null     float64
dtypes: float64(10), object(2)
memory usage: 7.2+ KB


In [69]:
# Split apolar (Dipole == 0) from polar (Dipole != 0) molecules.
# .copy() makes each subset an independent frame, so adding columns to it
# afterwards does not raise pandas' SettingWithCopyWarning.
df_raw_apolar = df_raw.query("Dipole == 0").copy()
df_raw_polar = df_raw.query("Dipole != 0").copy()

In [70]:
# Add zero-filled placeholder columns to the apolar table.
# assign() builds a new frame rather than writing into the frame returned by
# query(), which is what raised SettingWithCopyWarning; re-running this cell
# simply overwrites the same four columns.
df_raw_apolar = df_raw_apolar.assign(
    **{
        name: np.zeros(len(df_raw_apolar))
        for name in ("N of Carbon", "DYS", "SG", "PauloPred")
    }
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_raw_apolar["N of Carbon"] = np.zeros(df_raw_apolar["Molecule"].shape)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_raw_apolar["DYS"] = np.zeros(df_raw_apolar["Molecule"].shape)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_raw_apolar["SG"] = np.zeros(df_raw_apolar["Molecule"].shape)
A v

In [71]:
# Okay, I believe we are ready to run data preparation.py:
# write the apolar and polar tables to CSV without the index column.
df_raw_apolar.to_csv(path_or_buf="data/raw/expd_apolar_molecules.CSV", index=False)
df_raw_polar.to_csv(path_or_buf="data/raw/polar_molecules.CSV", index=False)

Neste momento, você deve modificar os `data_paths` em `config` e rodar o `general/creating_data`.

# Analisando os novos dados salvos e processados

Aqui está acontecendo a junção das moléculas polares e apolares também.

In [72]:
# Load the processed apolar and polar tables.
# NOTE(review): presumably these files are written by the external
# data-preparation run mentioned above — confirm it has been executed first.
df_apol_processed = pd.read_csv(filepath_or_buffer="data/processed/processed_apolar.csv")
df_pol_processed = pd.read_csv(filepath_or_buffer="data/processed/processed_polar.csv")

# Cell output: (polar shape, apolar shape).
(df_pol_processed.shape, df_apol_processed.shape)

((48, 7), (27, 7))

In [73]:
# Build the mixed dataset: apolar rows first, then polar rows, stacked
# row-wise with a fresh 0..n-1 index, and save it without the index column.
polar_apolar_processed_data = pd.concat(
    [df_apol_processed, df_pol_processed],
    ignore_index=True,
)
polar_apolar_processed_data.to_csv(
    'data/processed/processed_polar_apolar.csv',
    index=False,
)

In [74]:
# Load the final apolar and polar tables.
df_apol_final = pd.read_csv(filepath_or_buffer="data/final/final_apolar.csv")
df_pol_final = pd.read_csv(filepath_or_buffer="data/final/final_polar.csv")

# Cell output: (polar shape, apolar shape).
(df_pol_final.shape, df_apol_final.shape)

((41, 10), (23, 10))

In [75]:
# Build the mixed final dataset: apolar rows first, then polar rows, stacked
# row-wise with a fresh 0..n-1 index, and save it without the index column.
polar_apolar_final_data = pd.concat(
    [df_apol_final, df_pol_final],
    ignore_index=True,
)
polar_apolar_final_data.to_csv(
    'data/final/final_polar_apolar.csv',
    index=False,
)