In [16]:
import pandas as pd
import numpy as np

In [None]:
# Load the raw compound/name table. With on_bad_lines="skip" malformed rows are
# dropped silently, so the printed shape may be smaller than the number of
# lines in the file.
df = pd.read_csv(
    "data/finalData.csv",
    engine="python",
    on_bad_lines="skip",
    encoding="utf-8",
)

print(df.shape)

# Normalise names: lowercase, turn brackets/commas/hyphens into spaces,
# collapse runs of whitespace, and trim both ends.
raw_names = df["ascii_name_clean"]
normalised_names = (
    raw_names
    .astype(str)
    .str.lower()
    .str.replace(r"[\(\)\[\],\-]", " ", regex=True)
    .str.replace(r"\s+", " ", regex=True)
    .str.strip()
)

# astype(str) converts missing values into the literal text "nan", which would
# then be counted and sampled as if it were a real compound name. Put the
# missing markers back so downstream nunique()/sampling treat them as missing.
df["ascii_name_clean"] = normalised_names.where(raw_names.notna())

(387828, 2)


In [18]:
# Show the cleaned frame as the cell output.
df

Unnamed: 0,compound_id,ascii_name_clean
0,3,r 3 hydroxybutanoyl n 2
1,7,s + 3 carene
2,7,1s 3 7 7 trimethylbicyclo 4.1.0 hept 3 ene
3,7,+ 3 carene
4,7,1s 6r 3 7 7 trimethylbicyclo 4.1.0 hept 3 ene
...,...,...
387823,746994,2 amp uridineresidue
387824,746995,5 methylamino methyl 2 thiouridine5' phosphate...
387825,746996,1 4 7 methoxy 6 sulfooxyquinolin 4 yl oxypheny...
387826,746997,3 2 acetamidoethyl 5 methoxy 1h indol 6 yl sul...


In [19]:
# Number of distinct compound_id values in the cleaned frame.
compound_ids = df["compound_id"]
num_unique_compounds = compound_ids.nunique()
print("Unique compound_ids:", num_unique_compounds)

Unique compound_ids: 154224


In [None]:
# Count the distinct cleaned names recorded for each compound_id.
name_counts = df.groupby("compound_id")["ascii_name_clean"].nunique()

# Boolean mask over compound_ids: True where more than one distinct name exists.
has_multiple_names = name_counts > 1

multi_name_cids = name_counts[has_multiple_names].index
num_with_multiple_names = has_multiple_names.sum()

# The two figures come from different routes (index length vs. mask sum)
# and are expected to agree.
print("sanity check:", len(multi_name_cids))
print("Compound_ids with multiple names:", num_with_multiple_names)

sanity check: 64570
Compound_ids with multiple names: 64570


In [None]:
# Draw 25,000 distinct compound_ids from those that have more than one name.
# A seeded Generator makes the draw reproducible across kernel restarts; the
# seed matches the random_state used for the per-compound row sampling below.
rng = np.random.default_rng(42)
sampled_cids = rng.choice(
    np.asarray(multi_name_cids),
    size=25_000,
    replace=False,  # each compound_id may be picked at most once
)

print(sampled_cids)

[ 15447 152536  59924 ...   4450  21634 233480]


In [None]:
# Keep only the rows whose compound_id was drawn into the sample.
in_sample = df["compound_id"].isin(sampled_cids)
df_sampled = df[in_sample]

In [None]:
# For every sampled compound_id keep exactly one of its rows, chosen at random
# with a fixed seed.
per_compound = df_sampled.groupby("compound_id", group_keys=False)
df_25k_unique = per_compound.sample(n=1, random_state=42)

# Row count and distinct compound_id count of the result.
print(df_25k_unique.shape)
print(df_25k_unique["compound_id"].nunique())

(25000, 2)
25000


In [24]:
# Preview the first five rows of the one-row-per-compound sample.
df_25k_unique.head(n=5)

Unnamed: 0,compound_id,ascii_name_clean
6,7,1alpha 6alpha car 3 ene
28,20,1r 4s 2 2 dimethyl 3 methylenebicyclo 2.2.1 he...
49,31,+ menthone
88,38,+ o methylthalicberine
90,39,dextropimaricacid


In [None]:
# Persist the one-row-per-compound sample as UTF-8 CSV, omitting the index column.
df_25k_unique.to_csv("data/testData.csv", encoding="utf-8", index=False)