<div style="display: flex; align-items: center;">
    <span style="font-size: 24px; color: #422057FF; font-weight: 500;">Index-Based Nearest-Neighbour Search</span>
    <img src="logo.svg" style="height: 50px; width: auto; margin-left: auto;"/>
</div>

In [None]:
import os
import bz2
import csv
import zlib
import time
import faiss
import pickle
import psycopg2
import subprocess
import numpy as np
import pandas as pd

from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs, rdFingerprintGenerator

<div style="background-color:#4B6300; color:#F0E5CF; padding: 1px; border-radius: 5px;">
    <h2 style="font-size: 16px; margin-left: 10px;"> Spark-Based Fingerprint Generation and Database Insertion
 </h2>
</div>

In [None]:
def generate_database(input_file_path, batch_size):
    """Run the ``generate_db.py`` Spark job and record its output and duration.

    Launches ``/home/z_main_files/generate_db.py`` through ``spark-submit``
    (with the PostgreSQL JDBC jar on the classpath), writes the job's stdout
    to ``simulation_files/db_out.txt`` and the elapsed wall-clock time, in
    minutes, to ``simulation_files/db_time.txt``.
    NOTE(review): the script presumably fills the fingerprint database;
    confirm against ``generate_db.py``.

    Args:
        input_file_path: Path of the input file, forwarded to the Spark script.
        batch_size: Batch size, forwarded to the Spark script as a string.

    Raises:
        RuntimeError: If ``spark-submit`` exits with a non-zero status. The
            message carries the tail of the job's stderr. The timing file is
            still written before the error is raised.
    """
    os.makedirs("simulation_files", exist_ok=True)
    db_out = os.path.join("simulation_files", "db_out.txt")
    db_time = os.path.join("simulation_files", "db_time.txt")

    command = [
        "spark-submit",
        "--jars", "/spark/jars/postgresql-42.2.27.jar",
        "/home/z_main_files/generate_db.py",
        input_file_path,
        str(batch_size),
    ]

    start_time = time.time()
    with open(db_out, "w") as log_file:
        # stderr is captured (not discarded) so a failure can be reported below.
        result = subprocess.run(command, stdout=log_file, stderr=subprocess.PIPE)
    elapsed_time_minutes = round((time.time() - start_time) / 60, 2)

    with open(db_time, "w") as time_file:
        time_file.write(f"{elapsed_time_minutes} minutes")

    # Previously a failed job was indistinguishable from a successful one.
    if result.returncode != 0:
        stderr_tail = (result.stderr or b"").decode(errors="replace")[-2000:]
        raise RuntimeError(
            f"spark-submit exited with status {result.returncode}:\n{stderr_tail}"
        )

# Batch size forwarded to the database-generation Spark job.
batch_size = 100_000
generate_database(
    input_file_path='simulation_files/Enamine_REAL_HAC_11_21_1M_CXSMILES.gz',
    batch_size=batch_size,
)

To verify the database and count the number of rows, follow these steps:

1. Go to the Enamine folder and enter the PostgreSQL container: ```make enter-pg-container```
2. Connect to the PostgreSQL database: ```psql -U kailash fingerprint_db;```
3. Count the number of rows in the fingerprints_table: ```SELECT COUNT(*) FROM fingerprints_table;```

<div style="background-color:#4B6300; color:#F0E5CF; padding: 1px; border-radius: 5px;">
    <h2 style="font-size: 16px; margin-left: 10px;"> Batch Processing and Indexing with Faiss
 </h2>
</div>

In [None]:
def generate_index(batch_size):
    """Run the ``generate_index.py`` Spark job and record its output and duration.

    Launches ``/home/z_main_files/generate_index.py`` through ``spark-submit``,
    writes the job's stdout to ``simulation_files/index_out.txt`` and the
    elapsed wall-clock time, in minutes, to ``simulation_files/index_time.txt``.

    Args:
        batch_size: Batch size, forwarded to the Spark script as a string.

    Raises:
        RuntimeError: If ``spark-submit`` exits with a non-zero status. The
            message carries the tail of the job's stderr. The timing file is
            still written before the error is raised.
    """
    os.makedirs("simulation_files", exist_ok=True)
    index_out = os.path.join("simulation_files", "index_out.txt")
    index_time = os.path.join("simulation_files", "index_time.txt")

    command = ["spark-submit", "/home/z_main_files/generate_index.py", str(batch_size)]

    start_time = time.time()
    with open(index_out, "w") as log_file:
        # stderr is captured (not discarded) so a failure can be reported below.
        result = subprocess.run(command, stdout=log_file, stderr=subprocess.PIPE)
    elapsed_time_minutes = round((time.time() - start_time) / 60, 2)

    with open(index_time, "w") as time_file:
        time_file.write(f"{elapsed_time_minutes} minutes")

    # Previously a failed job was indistinguishable from a successful one.
    if result.returncode != 0:
        stderr_tail = (result.stderr or b"").decode(errors="replace")[-2000:]
        raise RuntimeError(
            f"spark-submit exited with status {result.returncode}:\n{stderr_tail}"
        )
        
# Batch size forwarded to the index-generation Spark job.
batch_size = 10_000
generate_index(batch_size=batch_size)

In [None]:
# Load the first per-batch index file and report its size and dimensionality.
index_path = 'simulation_files/batch_indexes/lsh_index_batch_1.faiss'
index = faiss.read_index(index_path)
print(
    f"Number of vectors: {round(index.ntotal/1e6, 1)} million",
    f"Dimensionality of vectors: {index.d}",
    sep="\n",
)

<div style="background-color:#4B6300; color:#F0E5CF; padding: 1px; border-radius: 5px;">
    <h2 style="font-size: 16px; margin-left: 10px;"> Check the Streamlit output
 </h2>
</div>

In [None]:
import os
import time
import faiss
import psycopg2
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator, DataStructs
from concurrent.futures import ProcessPoolExecutor, as_completed

def query_index(file_path, dense_fp, k):
    """Return the ids of up to ``k`` nearest neighbours from one FAISS index file.

    Args:
        file_path: Path of the ``.faiss`` index file to load.
        dense_fp: 1-D NumPy array holding the query vector; it is reshaped to
            a single-row matrix before searching.
        k: Number of neighbours requested from the index.

    Returns:
        A 1-D NumPy array of neighbour ids for the query. It may hold fewer
        than ``k`` entries (or none) when the index cannot supply ``k``
        neighbours.
    """
    lsh_index = faiss.read_index(file_path)
    _, query_indices = lsh_index.search(dense_fp.reshape(1, -1), k)
    neighbour_ids = query_indices[0]
    # FAISS pads the label row with -1 when fewer than k neighbours exist;
    # drop the padding so callers never treat -1 as a real id.
    return neighbour_ids[neighbour_ids >= 0]

def compute_tanimoto_similarity(query_mol, mol):
    """Return the Tanimoto similarity of two molecules, rounded to 4 decimals.

    Both molecules are fingerprinted with a Morgan generator (radius 2,
    1024 bits) before the similarity is computed.

    Args:
        query_mol: RDKit molecule of the query compound.
        mol: RDKit molecule of the candidate compound, or ``None`` when its
            SMILES could not be parsed.

    Returns:
        The rounded similarity, or ``0.0`` when ``mol`` is ``None``.

    Raises:
        ValueError: If ``query_mol`` is ``None``.
    """
    if query_mol is None:
        raise ValueError("query_mol is None; the query SMILES could not be parsed")
    if mol is None:
        # Chem.MolFromSmiles returns None for unparsable SMILES; rank such a
        # candidate last instead of aborting the whole query.
        return 0.0

    gen = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=1024)
    fp1 = gen.GetFingerprint(query_mol)
    fp2 = gen.GetFingerprint(mol)
    tanimoto = DataStructs.TanimotoSimilarity(fp1, fp2)
    return round(tanimoto, 4)

def process_single_index(file_path, dense_fp, k, query_mol, cursor, batch_num, total_batches):
    """Search one index file and score the matching database rows.

    The ids returned by ``query_index`` are looked up in ``fingerprints_table``
    through its ``db_index`` column, each returned SMILES is scored against
    ``query_mol`` with ``compute_tanimoto_similarity``, and the ``k`` best rows
    are kept.
    NOTE(review): this relies on the index ids being ``db_index`` values;
    confirm against the index-building script.

    Args:
        file_path: Path of the index file to search.
        dense_fp: Query vector handed to ``query_index``.
        k: Number of neighbours to request and rows to keep.
        query_mol: RDKit molecule of the query compound.
        cursor: Open database cursor used for the lookup.
        batch_num: 1-based position of this index file, used in the progress line.
        total_batches: Total number of index files, used in the progress line.

    Returns:
        A DataFrame with columns ``ID``, ``SMILES`` and ``Score`` sorted by
        descending score, or an empty column-less DataFrame when the index
        returned no ids.
    """
    step_start_time = time.time()

    neighbour_ids = query_index(file_path, dense_fp, k)
    if not neighbour_ids.size > 0:
        return pd.DataFrame()

    # Plain Python ints are passed as query parameters.
    id_params = tuple(int(idx) for idx in neighbour_ids)
    placeholders = ','.join('%s' for _ in id_params)
    cursor.execute(f'SELECT id, smiles FROM fingerprints_table WHERE db_index IN ({placeholders})', id_params)
    hits = pd.DataFrame(cursor.fetchall(), columns=['id', 'smiles'])

    def _score(smiles):
        return compute_tanimoto_similarity(query_mol, Chem.MolFromSmiles(smiles))

    ranked = (
        hits.assign(Tanimoto_similarity=hits['smiles'].apply(_score))
        .sort_values(by='Tanimoto_similarity', ascending=False)
        .head(k)
        .reset_index(drop=True)
        .rename(columns={'id': 'ID', 'smiles': 'SMILES', 'Tanimoto_similarity': 'Score'})
    )

    total_step_time = int(time.time() - step_start_time)
    print(f"Processed batch {batch_num}/{total_batches} in {total_step_time} s")

    return ranked

def process_similarity_query(query_smiles: str, k: int, conn, index_dir: str = 'simulation_files/batch_indexes'):
    """Search every ``.faiss`` index in ``index_dir`` for compounds similar to a query.

    The query SMILES is converted to a 1024-bit Morgan fingerprint (radius 2),
    copied into a float32 vector, and handed to ``process_single_index`` once
    per index file, all through a single cursor opened on ``conn``.

    Args:
        query_smiles: SMILES string of the query compound.
        k: Number of neighbours requested from each index file.
        conn: Open database connection providing ``cursor()``.
        index_dir: Directory holding the ``.faiss`` files; defaults to the
            notebook's ``simulation_files/batch_indexes``.

    Returns:
        The concatenated per-index results (columns ``ID``, ``SMILES``,
        ``Score``), not yet globally sorted. When nothing is found, an empty
        DataFrame that still has those three columns.

    Raises:
        ValueError: If ``query_smiles`` cannot be parsed.
    """
    query_mol = Chem.MolFromSmiles(query_smiles)
    if query_mol is None:
        # MolFromSmiles signals a parse failure by returning None.
        raise ValueError(f"Could not parse query SMILES: {query_smiles!r}")

    gen = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=1024)
    query_fp = gen.GetFingerprint(query_mol)

    dense_fp = np.zeros((1024,), dtype=np.float32)
    DataStructs.ConvertToNumpyArray(query_fp, dense_fp)

    # os.listdir order is arbitrary; sort (lexicographically) so batch
    # numbering and output order are reproducible between runs.
    index_files = sorted(
        os.path.join(index_dir, name)
        for name in os.listdir(index_dir)
        if name.endswith('.faiss')
    )
    total_batches = len(index_files)

    # Collect the per-index frames and concatenate once, instead of growing a
    # DataFrame inside the loop.
    batch_results = []
    with conn.cursor() as cursor:
        for batch_num, file_path in enumerate(index_files, start=1):
            result = process_single_index(file_path, dense_fp, k, query_mol, cursor, batch_num, total_batches)
            if not result.empty:
                batch_results.append(result)

    if not batch_results:
        # Keep the expected columns so callers can still sort by 'Score'.
        return pd.DataFrame(columns=['ID', 'SMILES', 'Score'])
    return pd.concat(batch_results, ignore_index=True)

query_smiles = "Cc1cnc2c(cccc2c1)S(=O)(=O)N1CCN(CC1)C(=O)Nc1ccc(F)cc1"
k = 20

# Connection settings can be overridden through the standard libpq environment
# variables. The fallbacks are the values this notebook has always used, so it
# runs unchanged; prefer supplying PGPASSWORD from the environment rather than
# relying on the fallback.
conn = psycopg2.connect(
    host=os.environ.get("PGHOST", "db"),
    dbname=os.environ.get("PGDATABASE", "fingerprint_db"),
    user=os.environ.get("PGUSER", "kailash"),
    password=os.environ.get("PGPASSWORD", "enamine"),
    port=int(os.environ.get("PGPORT", 5432)),
)
try:
    # `with conn` only commits or rolls back the transaction; psycopg2 does not
    # close the connection on exit, hence the explicit close() in `finally`.
    with conn:
        overall_start_time = time.time()
        all_results = process_similarity_query(query_smiles, k, conn)
        if not all_results.empty:
            # An empty result may lack the 'Score' column, so only sort when
            # there is something to rank.
            all_results = all_results.sort_values(by='Score', ascending=False).head(k).reset_index(drop=True)
        all_results.to_csv('simulation_files/vector_similarity_results.csv', index=False)

        overall_end_time = time.time()
        elapsed_time = overall_end_time - overall_start_time

        print(f"Top {k} Similar Compounds")
        display(all_results)
        print(f"Total Execution Time: {elapsed_time:.2f} seconds")
finally:
    conn.close()

<div style="background-color:#4B6300; color:#F0E5CF; padding: 1px; border-radius: 5px;">
    <h2 style="font-size: 16px; margin-left: 10px;"> Verify the output
 </h2>
</div>

In [None]:
# Morgan fingerprint generator (radius 2, 1024 bits) used by the re-scoring below.
gen = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=1024)

# Reload the results written by the similarity query.
results_csv_path = "simulation_files/vector_similarity_results.csv"
df = pd.read_csv(results_csv_path)

def compute_tanimoto_similarity(query_mol, mol):
    """Return the Tanimoto similarity of two molecules, rounded to 4 decimals.

    Both molecules are fingerprinted with the module-level generator ``gen``.

    Args:
        query_mol: RDKit molecule of the query compound.
        mol: RDKit molecule of the candidate compound.

    Returns:
        The similarity of the two fingerprints, rounded to four decimal places.
    """
    query_fingerprint = gen.GetFingerprint(query_mol)
    candidate_fingerprint = gen.GetFingerprint(mol)
    return round(DataStructs.TanimotoSimilarity(query_fingerprint, candidate_fingerprint), 4)

# Re-score every saved hit against the same query and list them best-first.
query_smiles = "Cc1cnc2c(cccc2c1)S(=O)(=O)N1CCN(CC1)C(=O)Nc1ccc(F)cc1"
query_mol = Chem.MolFromSmiles(query_smiles)


def _rescore(smiles):
    return compute_tanimoto_similarity(query_mol, Chem.MolFromSmiles(smiles))


df = (
    df.assign(Tanimoto_similarity=df['SMILES'].apply(_rescore))
    .sort_values(by='Tanimoto_similarity', ascending=False)
    .reset_index(drop=True)
)
display(df.head(20))
print(df.shape)