In [1]:
# set working directory
import os
os.chdir("..")
print(os.getcwd())

/mnt/c/Users/jtbvd/Documents/Github Repositories/Research-proj-2-TNO


In [3]:
import pandas as pd

# Load the datasets
df = pd.read_csv("./Data/csv data/4.papyrus_present_genes&compounds.csv")
pdb_df = pd.read_csv("./Data/csv data/3.pdbligands&genes_combined.csv")

# Classify based on pchembl_value_Mean
df['Active'] = df['pchembl_value_Mean'] > 6.5
df['Inactive'] = df['pchembl_value_Mean'] <= 6.5

# Count the active, inactive, and datapoints (formerly Total) for each Symbol
summary = df.groupby('Symbol').agg(
    Datapoints=('SMILES', 'count'),
    Active=('Active', 'sum'),
    Inactive=('Inactive', 'sum'),
    pchembl_min=('pchembl_value_Mean', 'min'),
    pchembl_max=('pchembl_value_Mean', 'max')
).reset_index()

# Round the min and max values to 3 decimals
summary['pchembl_min'] = summary['pchembl_min'].round(3)
summary['pchembl_max'] = summary['pchembl_max'].round(3)

# Add the Ratio column (Active / Datapoints)
summary['Ratio'] = summary['Active'] / summary['Datapoints']
summary['Ratio'] = summary['Ratio'].round(3)

# Create the Range column with the min and max values for pchembl_value_Mean
summary['Activity range'] = summary['pchembl_min'].astype(str) + ' to ' + summary['pchembl_max'].astype(str)

# Count the number of PDB ligands per Symbol
pdb_counts = pdb_df.groupby('Symbol')['pdb_ligand_ID'].nunique().reset_index()
pdb_counts.rename(columns={'pdb_ligand_ID': 'PDB_ligands'}, inplace=True)

# Merge with the summary table
summary = summary.merge(pdb_counts, on='Symbol', how='left')
summary['PDB_ligands'] = summary['PDB_ligands'].fillna(0).astype(int)  # Fill missing values with 0

# Sort the table by Datapoints in descending order
summary = summary.sort_values(by='Datapoints', ascending=False)

# Show the summary table with the counts, ratio, range, and PDB ligands
summary

Unnamed: 0,Symbol,Datapoints,Active,Inactive,pchembl_min,pchembl_max,Ratio,Activity range,PDB_ligands
168,EGFR,11072,6011,5061,3.036,11.000,0.543,3.036 to 11.0,160
329,MAPK14,9344,5708,3636,3.020,13.939,0.611,3.02 to 13.939,190
305,KDR,6787,3759,3028,4.000,10.700,0.554,4.0 to 10.7,41
425,PIK3CD,6463,4837,1626,4.000,11.585,0.748,4.0 to 11.585,15
162,DRD2,5125,3114,2011,3.740,10.830,0.608,3.74 to 10.83,6
...,...,...,...,...,...,...,...,...,...
217,GALNT3,1,0,1,5.790,5.790,0.000,5.79 to 5.79,0
396,NT5C2,1,0,1,3.020,3.020,0.000,3.02 to 3.02,9
40,AMPD1,1,0,1,6.300,6.300,0.000,6.3 to 6.3,0
581,TRPM6,1,1,0,8.102,8.102,1.000,8.102 to 8.102,0


In [4]:
# Filter symbols with less than 30 actives or 30 inactives
symbols_with_less_than_30 = summary[(summary['Active'] < 30) & (summary['Inactive'] < 30)]

symbols_list_less = symbols_with_less_than_30['Symbol'].tolist()

# Print the number of such symbols
print(f"Number of targets with less than 30 actives or 30 inactives: {len(symbols_list_less)}")

Number of targets with less than 30 actives or 30 inactives: 218


In [5]:
# Filter symbols with more than 30 Active or more than 30 Inactive
symbols_with_more_than_30_active_or_inactive = summary[(summary['Active'] > 30) & (summary['Inactive'] > 30)]

# Get the list of Symbols
symbols_list_more = symbols_with_more_than_30_active_or_inactive['Symbol'].tolist()

# Print the number of such Symbols
print(f"Number of Symbols with more than 30 active and inactive: {len(symbols_list_more)}")

Number of Symbols with more than 30 active and inactive: 259


In [6]:
# Load the raw data
df_raw = pd.read_csv("./Data/csv data/1.raw_genes.csv")

# Filter the raw data for symbols with more than 60 datapoints
filtered_df = df_raw[df_raw['Symbol'].isin(symbols_list_more)].copy()

# Count the number of PDB IDs in each row (split by commas) and store it in a new column 'PDB-ID Count'
filtered_df.loc[:, 'PDB-ID Count'] = filtered_df['PDB-ID'].apply(lambda x: len(x.split(',')) if pd.notnull(x) else 0)

# Filter to only rows where there's at least one PDB ID
filtered_with_pdb_count = filtered_df[filtered_df['PDB-ID Count'] > 0]

# Merge the datapoints and additional columns (Active, Inactive, Ratio) with the filtered data based on Symbol
final_df = pd.merge(filtered_with_pdb_count, summary[['Symbol', 'Datapoints', 'Active', 'Inactive', 'Ratio', 'Activity range', 'PDB_ligands']], on='Symbol', how='left')

# Filter for rows where PDB_ligands > 0 and order
final_df = final_df[final_df['PDB_ligands'] > 0]
final_df = final_df.sort_values(by='Datapoints', ascending=False)

# Show the resulting DataFrame
final_df

Unnamed: 0,Symbol,Uniprot,PDB-ID,PDB-ID Count,Datapoints,Active,Inactive,Ratio,Activity range,PDB_ligands
4,EGFR,P00533,"1IVO,1M14,1M17,1MOX,1NQL,1XKK,1YY9,1Z9I,2EB2,2...",321,11072,6011,5061,0.543,3.036 to 11.0,160
38,MAPK14,Q16539,"1A9U,1BL6,1BL7,1BMK,1DI9,1IAN,1KV1,1KV2,1M7Q,1...",246,9344,5708,3636,0.611,3.02 to 13.939,190
33,KDR,P35968,"1VR2,1Y6A,1Y6B,1YWN,2M59,2MET,2MEU,2OH4,2P2H,2...",54,6787,3759,3028,0.554,4.0 to 10.7,41
90,PIK3CD,O00329,"5DXU,5M6U,5T8F,5UBT,5VLR,6G6W,6OCO,6OCU,6PYR,6...",15,6463,4837,1626,0.748,4.0 to 11.585,15
150,DRD2,P14416,"5AER,6CM4,6LUQ,6VMS,7DFP,7JVR,8IRS,8TZQ,8U02",9,5125,3114,2011,0.608,3.74 to 10.83,6
...,...,...,...,...,...,...,...,...,...,...
214,EPAS1,Q99814,"1P97,2A24,3F1N,3F1O,3F1P,3H7W,3H82,4GHI,4GS9,4...",36,89,46,43,0.517,4.28 to 8.05,21
0,ERBB3,P21860,"1M6B,2L9U,3KEX,3LMG,3P11,4LEO,4P59,4RIW,4RIX,4...",21,73,42,31,0.575,4.2 to 9.114,3
112,GSTO1,P78417,"1EEM,3LFL,3VLN,4IS0,4YQM,4YQU,4YQV,5UEH,5V3Q,5...",17,72,31,41,0.431,4.21 to 9.66,14
78,STAT6,P42226,"1OJ5,4Y5U,4Y5W,5D39,5NWM,5NWX",6,70,36,34,0.514,4.05 to 9.15,1


In [7]:
# Load the raw data
df_raw = pd.read_csv("./Data/csv data/1.raw_genes.csv")

# Filter the raw data for symbols with more than 60 datapoints
filtered_df = df_raw[df_raw['Symbol'].isin(symbols_list_less)].copy()

# Count the number of PDB IDs in each row (split by commas) and store it in a new column 'PDB-ID Count'
filtered_df.loc[:, 'PDB-ID Count'] = filtered_df['PDB-ID'].apply(lambda x: len(x.split(',')) if pd.notnull(x) else 0)

# Filter to only rows where there's at least one PDB ID
filtered_with_pdb_count = filtered_df[filtered_df['PDB-ID Count'] > 0]

# Merge the datapoints and additional columns (Active, Inactive, Ratio) with the filtered data based on Symbol
final_df_alt = pd.merge(filtered_with_pdb_count, summary[['Symbol', 'Datapoints', 'Active', 'Inactive', 'Ratio', 'Activity range', 'PDB_ligands']], on='Symbol', how='left')

# filter for more than 0 ligands and order
final_df_alt = final_df_alt[final_df_alt['PDB_ligands'] > 0]
final_df_alt = final_df_alt.sort_values(by='Datapoints', ascending=False)

# Show the resulting DataFrame
final_df_alt

Unnamed: 0,Symbol,Uniprot,PDB-ID,PDB-ID Count,Datapoints,Active,Inactive,Ratio,Activity range,PDB_ligands
154,WNT3A,P56704,"7DRT,7URD,7URE,8TZR",4,51,24,27,0.471,4.39 to 8.89,3
8,NOTCH1,P46531,"1PB5,1TOZ,1YYH,2F8X,2F8Y,2HE0,2VJ3,3ETO,3I08,3...",27,46,23,23,0.500,4.37 to 9.0,3
175,GAA,P10253,"5KZW,5KZX,5NN3,5NN4,5NN5,5NN6,5NN8,7P2Z,7P32",9,37,9,28,0.243,4.09 to 8.21,7
153,DCPS,Q96C86,"1ST0,1ST4,1XML,1XMM,3BL7,3BL9,3BLA,4QDE,4QDV,4...",11,36,27,9,0.750,4.62 to 10.7,10
158,HSF1,Q00613,"2LDU,5D5U,5D5V,5HDG,5HDN,7DCJ,7DCS,7DCT",8,35,18,17,0.514,4.75 to 8.15,1
...,...,...,...,...,...,...,...,...,...,...
168,PDCD1,Q15116,"2M2D,3RRQ,4ZQK,5B8C,5GGR,5GGS,5IUS,5JXE,5WT9,6...",34,1,1,0,1.000,8.48 to 8.48,2
13,MUC1,P15941,"1SM3,2ACM,2FO4,5T6P,5T78,6FZQ,6FZR,6KX1,6TGG,7...",18,1,0,1,0.000,4.12 to 4.12,8
170,IL33,O95760,"2KLL,4KC3",2,1,0,1,0.000,4.375 to 4.375,1
14,ADIPOR1,Q96A54,"5LXG,6KRZ,6KS0",3,1,0,1,0.000,5.75 to 5.75,1


In [9]:
# Save to Excel
final_df.to_excel("./Data/2.TOI_sufficient_data.xlsx", index=False)
final_df_alt.to_excel("./Data/3.TOI_insufficient_data.xlsx", index=False)

print("Saved")

Saved
