In [1]:
import pandas as pd

In [2]:
unit_text="""
absolute molar magnetic susceptibility	[10^-6 cm3/mol]
absolute standard enthalpy of formation	[kJ/mol]
amorphous density	[g/cm3]
amorphous thermal conductivity	[W/m/K]
band gap	[eV]
boiling temperature	[oC]
converted oxidation/reduction potential (V vs SHE)	[V vs SHE]
crystalline density	[g/cm3]
crystalline thermal conductivity	[W/m/K]
decomposition temperature	[oC]
density	[g/cm3]
dipole moment	[debye]
electric conductivity	[S/cm]
flash temperature	[oC]
glass expansivity	[10^-4 cm3/g/K]
glass transition temperature	[oC]
heat capacity	[J/g/K]
ionic conductivity	[S/cm]
ionization energy	[eV]
liquid expansivity	[10^-4 cm3/g/K]
liquid heat capacity	[J/g/K]
melting enthalpy	[kJJ/mol]
melting temperature	[oC]
molar heat capacity	[J/mol/K]
molar volume	[cm3/mol]
oxidation potential (V vs Ag/Ag+)	[V vs Ag/Ag+]
oxidation potential (V vs Fc/Fc+)	[V vs Fc/Fc+]
oxidation potential (V vs SCE)	[V vs SCE]
oxidation/reduction potential (V vs Li/Li+)	[V vs Li/Li+]
oxidation/reduction potential (V vs SHE)	[V vs SHE]
partition coefficient	[-]
permittivity	[-]
pKa	[-]
polarizability	[Å3]
reduction potential (V vs Ag/Ag+)	[V vs Ag/Ag+]
reduction potential (V vs SCE)	[V vs SCE]
reduction potential (V vs SHE)	[V vs SHE]
refractive index	[-]
solid heat capacity	[J/g/K]
solubility parameter	[J^(1/2)/cm^(3/2)]
surface tension	[mN/m]
thermal conductivity	[W/m/K]
time	[minutes]
UV cut-off	[nm]
vapor pressure	[kPa]
viscosity	[mPas]
"""

unit_dict={}
for line in unit_text.split("\n"):
    if len(line)<4:
        continue
    k,v=line.split("	")
    unit_dict[k]=v

unit_dict

{'absolute molar magnetic susceptibility': '[10^-6 cm3/mol]',
 'absolute standard enthalpy of formation': '[kJ/mol]',
 'amorphous density': '[g/cm3]',
 'amorphous thermal conductivity': '[W/m/K]',
 'band gap': '[eV]',
 'boiling temperature': '[oC]',
 'converted oxidation/reduction potential (V vs SHE)': '[V vs SHE]',
 'crystalline density': '[g/cm3]',
 'crystalline thermal conductivity': '[W/m/K]',
 'decomposition temperature': '[oC]',
 'density': '[g/cm3]',
 'dipole moment': '[debye]',
 'electric conductivity': '[S/cm]',
 'flash temperature': '[oC]',
 'glass expansivity': '[10^-4 cm3/g/K]',
 'glass transition temperature': '[oC]',
 'heat capacity': '[J/g/K]',
 'ionic conductivity': '[S/cm]',
 'ionization energy': '[eV]',
 'liquid expansivity': '[10^-4 cm3/g/K]',
 'liquid heat capacity': '[J/g/K]',
 'melting enthalpy': '[kJJ/mol]',
 'melting temperature': '[oC]',
 'molar heat capacity': '[J/mol/K]',
 'molar volume': '[cm3/mol]',
 'oxidation potential (V vs Ag/Ag+)': '[V vs Ag/Ag+]',
 '

In [12]:


noise_columns=[
'Unnamed: 13',
'Unnamed: 16',
'Unnamed: 17',
'Unnamed: 18',
'Unnamed: 21',
'Notes',
'Compounds without SMILES were not used for machine learning',
"CASNo",
'ID',
]

key_columns=["SMILES","CompName"]



parsed_records=[]
for csv_path in ["wiki/wiki1.csv","wiki/wiki2.csv"]:
    df=pd.read_csv(csv_path)
    records=df.to_dict(orient="records")

    for compound_data in records:
        if "CompName" not in compound_data:
            compound_data["CompName"]="unknown"
        for k,v in compound_data.items():
            if k in noise_columns:
                continue
            if k in key_columns:
                continue
            n_dict={
                "CompName":compound_data["CompName"],
                "SMILES":compound_data["SMILES"],
            }
            if v==v:
                if k.lower() in unit_dict:
                    n_dict["Property"]=k
                    try:
                        n_dict["Value"]=float(v)
                    except:
                        continue
                    n_dict["unit"]=unit_dict[k.lower()]
                    n_dict["Source"]="Wikipedia/Wikidata"
                    parsed_records.append(n_dict)

In [13]:
compound_data

{'ID': 1283,
 'SMILES': 'CCC(C)N(C)C',
 'Refractive index': nan,
 'pKa': 10.4,
 'Density': nan,
 'Melting enthalpy': nan,
 'Melting temperature': nan,
 'Boiling temperature': nan,
 'Decomposition temperature': nan,
 'Vapor pressure': nan,
 'Flash temperature': nan,
 'Ionization Energy': nan,
 'Absolute standard enthalpy of formation': nan,
 'Unnamed: 13': nan,
 'Notes': nan,
 'wikidataID': 'Q24004478',
 'Unnamed: 16': nan,
 'Unnamed: 17': nan,
 'Unnamed: 18': nan,
 'Compounds without SMILES were not used for machine learning': nan,
 'CompName': 'unknown'}

In [14]:
len(parsed_records)

7241

In [15]:
df_2=pd.DataFrame(parsed_records)
df_2

Unnamed: 0,CompName,SMILES,Property,Value,unit,Source
0,Ammonia,N,Viscosity,0.2760,[mPas],Wikipedia/Wikidata
1,Ammonia,N,Vapor pressure,857.3000,[kPa],Wikipedia/Wikidata
2,Ammonia,N,Refractive index,1.3327,[-],Wikipedia/Wikidata
3,Ammonia,N,Melting temperature,-77.7300,[oC],Wikipedia/Wikidata
4,Ammonia,N,Absolute molar magnetic susceptibility,18.0000,[10^-6 cm3/mol],Wikipedia/Wikidata
...,...,...,...,...,...,...
7236,unknown,F[Zr-2](F)(F)(F)(F)F.[K+].[K+],Density,3.4800,[g/cm3],Wikipedia/Wikidata
7237,unknown,C1(=NC(=NC(=N1)N)N)N,Density,1.5740,[g/cm3],Wikipedia/Wikidata
7238,unknown,C1(C(C(C(C(C1O)O)O)O)O)O,Density,1.7520,[g/cm3],Wikipedia/Wikidata
7239,unknown,[Li+].[O-][Nb](=O)=O,Density,4.6440,[g/cm3],Wikipedia/Wikidata


In [16]:
import datasets
dataset=datasets.Dataset.from_pandas(df_2)#.astype(str)
dataset

Dataset({
    features: ['CompName', 'SMILES', 'Property', 'Value', 'unit', 'Source'],
    num_rows: 7241
})

In [17]:
dataset.push_to_hub("kanhatakeyama/material-properties")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/kanhatakeyama/material-properties/commit/511329b355c38d9721ae88d937c788aec55630b3', commit_message='Upload dataset', commit_description='', oid='511329b355c38d9721ae88d937c788aec55630b3', pr_url=None, pr_revision=None, pr_num=None)