In [1]:
import os
import pandas as pd

def calculate_ranks(smiles_str, dir, files=None, total=50000):
    """Rank one compound by ``mixed_score`` in each result file and average the ranks.

    Each file is read as CSV (first column used as the index), sorted by
    ``mixed_score`` in descending order, and the 1-based position of the first
    row whose ``smiles`` column equals ``smiles_str`` is recorded. Files that
    do not contain the SMILES string are reported on stdout and skipped.

    Parameters
    ----------
    smiles_str : str
        SMILES string to look up; compared for exact equality with the
        ``smiles`` column.
    dir : str
        Directory containing the result files. The name shadows the builtin
        ``dir`` but is kept because callers pass it by keyword.
    files : sequence of str, optional
        File names (relative to ``dir``) to process. Defaults to
        ``seed1_all.out`` ... ``seed5_all.out``.
    total : int or None, optional
        Population size used as the denominator of ``top_percent``. Defaults
        to 50000. Pass ``None`` to use the number of rows of each file.

    Returns
    -------
    pandas.DataFrame
        Columns ``file``, ``rank`` and ``top_percent`` with one row per file
        in which the SMILES string was found, plus a final row with index
        label ``'Average'`` holding the column means (NaN when the SMILES
        string was not found in any file).

    Raises
    ------
    ValueError
        If ``total`` is given and is not positive.
    """
    if total is not None and total <= 0:
        raise ValueError(f"total must be a positive number or None, got {total!r}")

    if files is None:
        files = [f'seed{i}_all.out' for i in range(1, 6)]

    # One [file, rank, top_percent] record per file that contains the compound.
    results = []

    for file in files:
        df = pd.read_csv(os.path.join(dir, file), index_col=0)

        # Best score first. A stable sort is used so that rows with tied
        # scores are ordered reproducibly from run to run.
        df_sorted_dock = df.sort_values(
            by='mixed_score', ascending=False, kind='mergesort'
        ).reset_index(drop=True)

        # After reset_index the index label is the 0-based position in the ranking.
        matches = df_sorted_dock.index[df_sorted_dock['smiles'] == smiles_str]
        if len(matches) == 0:
            print(f"SMILES string not found in file: {file}")
            continue

        dock_rank = int(matches[0]) + 1  # 1-based
        population = len(df_sorted_dock) if total is None else total
        results.append([file, dock_rank, (dock_rank / population) * 100])

    df_results = pd.DataFrame(results, columns=['file', 'rank', 'top_percent'])

    if results:
        average_dock_rank = df_results['rank'].mean()
        average_dock_top_percent = df_results['top_percent'].mean()
    else:
        # Nothing was found anywhere: report NaN averages explicitly instead
        # of relying on the mean of an empty, object-typed column.
        average_dock_rank = float('nan')
        average_dock_top_percent = float('nan')

    # Append the averages as a row labelled 'Average'.
    df_results.loc['Average'] = ['Average', average_dock_rank, average_dock_top_percent]

    return df_results



In [6]:
# 9cpd: look up the rank of this SMILES string in the result files stored
# under the `case_study_9cpd` directory and display the per-file table.
smiles_str = '[H]/N=C(\\c1ccc2c(c1)c(cc(n2)C)Nc3cccc(c3)OC)/N'
results_df = calculate_ranks(smiles_str, dir='case_study_9cpd')
results_df


Unnamed: 0,file,rank,top_percent
0,seed1_all.out,4182.0,8.364
1,seed2_all.out,3192.0,6.384
2,seed3_all.out,5295.0,10.59
3,seed4_all.out,3102.0,6.204
4,seed5_all.out,3037.0,6.074
Average,Average,3761.6,7.5232


In [6]:
# 9cpd: same lookup as the previous cell (identical code and execution count).
# NOTE(review): the output recorded below lists `1M_all_ex.out` and a
# top_percent that does not equal rank / 50000 * 100, yet this call only reads
# seed1_all.out ... seed5_all.out. The output presumably comes from a locally
# edited version of calculate_ranks -- re-run with Restart & Run All, or make
# the file list / population size explicit, before trusting it.
smiles_str = '[H]/N=C(\\c1ccc2c(c1)c(cc(n2)C)Nc3cccc(c3)OC)/N'
results_df = calculate_ranks(smiles_str, dir='case_study_9cpd')
results_df


Unnamed: 0,file,rank,top_percent
0,1M_all_ex.out,109355.0,10.935489
Average,Average,109355.0,10.935489
