In [1]:
from pathlib import Path

In [2]:
# Load the molecule ids of every group file into `group_dict`,
# keyed by the group index taken from the file name (`<index>.txt`).
folder_name = "groups_new/0.1"
group_folder = Path(folder_name)
group_dict = {}

for i in range(8):
    file_path = group_folder / f"{i}.txt"
    if not file_path.exists():
        # Group files that are absent are skipped without an error.
        continue
    with file_path.open("r") as f:
        raw_ids = f.read().split()
        # Ids are whitespace-separated; a set keeps each id once per group.
        group_dict[i] = {int(raw_id) for raw_id in raw_ids}
        print(f"File {i}.txt contents: {group_dict[i]}")


File 0.txt contents: {1019269, 7326982, 1636752, 36119, 5973402, 5759258, 7977115, 2245668, 2310185, 3151666, 5123639, 8337722, 9838013, 313406, 5079234, 7610437, 3414356, 1903702, 2864987, 3546460, 7227357, 1650157, 3378420, 7983227}
File 1.txt contents: {3867265, 9407874, 9617923, 1827204, 1875719, 8573194, 6632459, 4013838, 5346580, 8739734, 2146331, 1881249, 3682850, 7375018, 3968043, 2390199, 4694328, 8337977, 7859387, 1967551, 5627459, 1976156, 1502181, 2487143, 3573480, 8492526, 3318135}
File 2.txt contents: {299266, 5494918, 9112978, 5310099, 9913368, 3211679, 2143011, 2493732, 4043951, 1662128, 1899443, 766666, 6474572, 8467917, 5952846, 6733657, 7099614, 3709920, 8885088, 4762983, 6571751, 6091882, 778352, 9197172, 303222, 4465023}
File 3.txt contents: {8754702, 1722522, 7573149, 7943327, 1781152, 6854178, 6973347, 6619554, 3982371, 3690931, 820789, 2929847, 8514745, 4924862, 7758918, 1592519, 859464, 4792268, 2364370, 2294995, 1735893, 5471704, 6981465, 7455579, 1717215, 303

In [3]:
from itertools import combinations

# Compare every unordered pair of groups and report the ids they share.
# Nothing is printed for pairs that have no id in common.
for (i, ids_i), (j, ids_j) in combinations(group_dict.items(), 2):
    intersection = ids_i & ids_j
    if not intersection:
        continue
    print(f"Intersection between file {i} and file {j}: {intersection}")

In [4]:
# Each file contains a list of space separated ids of the molecules in the group
# Create a df with the columns molecule_id and group_id

from pathlib import Path

import pandas as pd

folder_name = "groups_new/0.1"
group_folder = Path(folder_name)
# Without `parents=True` only the last path component is created; the folder
# has to hold the group files already, since they are read right below.
group_folder.mkdir(exist_ok=True)

# Gather every (molecule_id, group_id) pair first and build the frame in one
# go: growing a DataFrame one row at a time is quadratic in the row count.
group_records = []
for i in range(8):
    with open(group_folder / f"{i}.txt", "r") as f:
        contents = f.read().split()
    # Ids are kept as strings on purpose: `get_group_id` compares them with
    # the string suffix of `mobleyID`.
    group_records.extend((molecule_id, i) for molecule_id in contents)

group_df = pd.DataFrame(group_records, columns=["molecule_id", "group_id"])

group_df.to_csv(group_folder / "group_df.csv", index=False)
group_df.head()

Unnamed: 0,molecule_id,group_id
0,313406,0
1,7227357,0
2,2245668,0
3,7977115,0
4,2310185,0


In [5]:
# CSV file holding the per-molecule descriptors.
descFile = "sasa_pol_desc_newRot.csv"

# Subset of descriptor column names (not used in the cells shown here).
features = [
    "pol",
    "psa",
    "n_donors",
    "nrotb",
    "n_acceptors",
    "logP",
]

df = pd.read_csv(descFile)

df.head()

Unnamed: 0,mobleyID,sasa,ele,born,pol,wgt,logP,psa,num_atoms,n_acceptors,n_donors,volume,nrotb,rand1,rand2,rand3,rand4,rand5,dG_exp
0,mobley_7532833,191.81246,-0.3045,-7.186908,-7.491408,41.053,0.52988,23.79,3,1,0,58.432,0,0.470332,0.536774,0.855347,0.499103,0.95608,-3.88
1,mobley_2198613,213.24034,-0.083095,-1.414853,-1.497948,64.515,1.2451,0.0,3,0,0,68.632,0,0.285356,0.76077,0.58094,0.472225,0.7027,-0.63
2,mobley_9257453,301.361333,-1.277946,-7.817131,-9.095077,163.003,2.699,20.23,9,1,1,146.728,0,0.552854,0.876311,0.052245,0.06561,0.940141,-7.29
3,mobley_755351,306.791942,-3.184943,-10.224205,-13.409148,123.155,1.2774,35.25,9,2,1,138.736,1,0.467881,0.11659,0.646711,0.15964,0.332809,-7.29
4,mobley_9729792,281.826444,-0.264551,-3.091874,-3.356425,92.141,2.0587,0.0,7,0,0,113.608,0,0.057479,0.547507,0.846293,0.333114,0.89313,-0.99


In [6]:
def get_group_id(molecule_id):
    """Return the group of a molecule, looked up in the global ``group_df``.

    Args:
        molecule_id: Identifier string such as ``"mobley_8057732"``; only the
            part after the last underscore is used for the lookup.

    Returns:
        The ``group_id`` of the first row of ``group_df`` whose
        ``molecule_id`` equals that suffix, or ``None`` when no row matches.
    """
    suffix = molecule_id.split("_")[-1]
    try:
        matching_groups = group_df.loc[group_df["molecule_id"] == suffix, "group_id"]
        return matching_groups.iloc[0]
    except IndexError:
        # An empty selection means the molecule belongs to no group.
        return None


# Spot-check the lookup on a single molecule id.
get_group_id("mobley_8057732")

5

In [7]:
def add_group_id_column(df, group_df):
    """Add a ``group_id`` column to ``df`` based on the passed ``group_df``.

    The lookup uses the ``group_df`` argument (not a module-level variable)
    and is done with a single vectorized mapping instead of one scan of
    ``group_df`` per row.

    Args:
        df: DataFrame with a ``mobleyID`` column; the part after the last
            underscore of each value is the molecule id.
        group_df: DataFrame with ``molecule_id`` and ``group_id`` columns.
            Ids are compared as strings, so they may be stored as strings
            or as integers.

    Returns:
        The same ``df`` object, modified in place, with ``group_id`` set to
        the matching group or to a missing value when the molecule is in no
        group.
    """
    # Normalise ids to strings and keep the first group listed for an id.
    lookup = group_df.assign(
        molecule_id=group_df["molecule_id"].astype(str)
    ).drop_duplicates(subset="molecule_id", keep="first")
    id_to_group = dict(zip(lookup["molecule_id"], lookup["group_id"]))

    molecule_ids = df["mobleyID"].str.split("_").str[-1]
    df["group_id"] = molecule_ids.map(id_to_group)
    return df


# Annotate the descriptor table `df` with the group of each molecule.
# The function also modifies `df` in place; the reassignment keeps the name bound.
df = add_group_id_column(df, group_df)

In [8]:
# Preview the first rows to check the newly added group_id column.
df.head()

Unnamed: 0,mobleyID,sasa,ele,born,pol,wgt,logP,psa,num_atoms,n_acceptors,n_donors,volume,nrotb,rand1,rand2,rand3,rand4,rand5,dG_exp,group_id
0,mobley_7532833,191.81246,-0.3045,-7.186908,-7.491408,41.053,0.52988,23.79,3,1,0,58.432,0,0.470332,0.536774,0.855347,0.499103,0.95608,-3.88,7.0
1,mobley_2198613,213.24034,-0.083095,-1.414853,-1.497948,64.515,1.2451,0.0,3,0,0,68.632,0,0.285356,0.76077,0.58094,0.472225,0.7027,-0.63,4.0
2,mobley_9257453,301.361333,-1.277946,-7.817131,-9.095077,163.003,2.699,20.23,9,1,1,146.728,0,0.552854,0.876311,0.052245,0.06561,0.940141,-7.29,5.0
3,mobley_755351,306.791942,-3.184943,-10.224205,-13.409148,123.155,1.2774,35.25,9,2,1,138.736,1,0.467881,0.11659,0.646711,0.15964,0.332809,-7.29,5.0
4,mobley_9729792,281.826444,-0.264551,-3.091874,-3.356425,92.141,2.0587,0.0,7,0,0,113.608,0,0.057479,0.547507,0.846293,0.333114,0.89313,-0.99,


In [9]:
# List the available column names (the next cell selects a subset of them).
df.columns

Index(['mobleyID', 'sasa', 'ele', 'born', 'pol', 'wgt', 'logP', 'psa',
       'num_atoms', 'n_acceptors', 'n_donors', 'volume', 'nrotb', 'rand1',
       'rand2', 'rand3', 'rand4', 'rand5', 'dG_exp', 'group_id'],
      dtype='object')

In [10]:
# Columns kept in the exported table, in output order.
selected_columns = [
    "mobleyID",
    "pol",
    "psa",
    "n_donors",
    "nrotb",
    "group_id",
    "dG_exp",
    "n_acceptors",
    "logP",
]
df = df[selected_columns]

# Write the reduced table to disk without the index column.
df.to_csv("groups_new/0.1/grouped_data.csv", index=False)

In [11]:
# Count molecules per group. value_counts() drops NaN by default, so molecules
# without an assigned group are not included in these totals.
df["group_id"].value_counts()

group_id
5.0    250
4.0     85
3.0     35
1.0     27
2.0     26
0.0     24
7.0     12
6.0     12
Name: count, dtype: int64