In [2]:
#Data retrieval and description

import pandas as pd
import gzip

In [3]:
import gzip
import pandas as pd
with gzip.open('drug.target.interaction (1).tsv.gz', 'rt') as f:
    df = pd.read_csv(f, sep='\t')

print(df.describe())
list(df.columns)

          STRUCT_ID     ACT_VALUE  ACT_UNIT     MOA
count  19378.000000  17774.000000       0.0  2733.0
mean    2371.957426      6.525474       NaN     1.0
std     1541.293557      1.471356       NaN     0.0
min        4.000000      1.200000       NaN     1.0
25%     1057.000000      5.428000       NaN     1.0
50%     2105.000000      6.280000       NaN     1.0
75%     3524.000000      7.500000       NaN     1.0
max     5462.000000     13.000000       NaN     1.0


['DRUG_NAME',
 'STRUCT_ID',
 'TARGET_NAME',
 'TARGET_CLASS',
 'ACCESSION',
 'GENE',
 'SWISSPROT',
 'ACT_VALUE',
 'ACT_UNIT',
 'ACT_TYPE',
 'ACT_COMMENT',
 'ACT_SOURCE',
 'RELATION',
 'MOA',
 'MOA_SOURCE',
 'ACT_SOURCE_URL',
 'MOA_SOURCE_URL',
 'ACTION_TYPE',
 'TDL',
 'ORGANISM']

In [4]:
#data normalization and cleaning 
#remove duplicates 
df = df.drop_duplicates()
print(df.describe())

          STRUCT_ID     ACT_VALUE  ACT_UNIT     MOA
count  19149.000000  17545.000000       0.0  2733.0
mean    2359.020262      6.531527       NaN     1.0
std     1534.188516      1.468356       NaN     0.0
min        4.000000      1.200000       NaN     1.0
25%     1053.000000      5.430000       NaN     1.0
50%     2090.000000      6.290000       NaN     1.0
75%     3514.000000      7.510000       NaN     1.0
max     5462.000000     13.000000       NaN     1.0


In [5]:
#converting text to lower case for consistency 
text_columns = ['DRUG_NAME', 'TARGET_NAME', 'TARGET_CLASS']
df[text_columns] = df[text_columns].apply(lambda x: x.str.lower())

#Remove leading/trailing whitespace
df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)

print(df.describe())

          STRUCT_ID     ACT_VALUE  ACT_UNIT     MOA
count  19149.000000  17545.000000       0.0  2733.0
mean    2359.020262      6.531527       NaN     1.0
std     1534.188516      1.468356       NaN     0.0
min        4.000000      1.200000       NaN     1.0
25%     1053.000000      5.430000       NaN     1.0
50%     2090.000000      6.290000       NaN     1.0
75%     3514.000000      7.510000       NaN     1.0
max     5462.000000     13.000000       NaN     1.0


In [6]:
#structuring the data 
df['interaction_id'] = df.index
df['interaction_text'] = df.apply(lambda row: f"Drug: {row['DRUG_NAME']} | Target: {row['TARGET_NAME']} | Type: {row['TARGET_CLASS']} | Action: {row['ACTION_TYPE']}", axis=1)

In [7]:
# Select relevant columns for your model
columns_to_keep = ['interaction_id', 'DRUG_NAME', 'TARGET_NAME', 'TARGET_CLASS', 'ACTION_TYPE', 'interaction_text']
df_processed = df[columns_to_keep]

In [8]:
df_processed.to_csv('processed_drug_interactions.csv', index=False)

In [9]:
import pandas as pd
from collections import defaultdict
import re

# Load the processed data
df = pd.read_csv('processed_drug_interactions.csv')

# Create an inverted index
inverted_index = defaultdict(set)

def tokenize(text):
    return re.findall(r'\w+', text.lower())

for idx, row in df.iterrows():
    tokens = tokenize(row['interaction_text'])
    for token in tokens:
        inverted_index[token].add(idx)

In [10]:
def search(query, inverted_index, df):
    query_tokens = tokenize(query)
    result_indices = set.intersection(*[inverted_index[token] for token in query_tokens if token in inverted_index])
    results = df.loc[list(result_indices)]
    return results

query = "aspirin enzyme"
search_results = search(query, inverted_index, df)
print(search_results['interaction_text'].head())

4     Drug: levobupivacaine | Target: cytochrome p45...
16    Drug: aminopterin | Target: dihydrofolate redu...
17    Drug: aminopterin | Target: dihydrofolate redu...
18    Drug: aminopterin | Target: dihydrofolate redu...
19    Drug: aminopterin | Target: folylpoly-gamma-gl...
Name: interaction_text, dtype: object


In [11]:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Create TF-IDF vectorizer
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['interaction_text'])

def search_tfidf(query, vectorizer, tfidf_matrix, df):
    query_vector = vectorizer.transform([query])
    cosine_similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()
    related_docs_indices = cosine_similarities.argsort()[-10:][::-1]  # Top 10 results
    results = df.iloc[related_docs_indices]
    return results

In [12]:
query = "aspirin enzyme"
search_results = search_tfidf(query, vectorizer, tfidf_matrix, df)
print(search_results['interaction_text'].tolist())

['Drug: losartan | Target: angiotensin-converting enzyme | Type: enzyme | Action: nan', 'Drug: captopril | Target: angiotensin-converting enzyme | Type: enzyme | Action: nan', 'Drug: captopril | Target: angiotensin-converting enzyme | Type: enzyme | Action: nan', 'Drug: diiodohydroxyquinoline | Target: angiotensin-converting enzyme | Type: enzyme | Action: nan', 'Drug: sitagliptin | Target: angiotensin-converting enzyme | Type: enzyme | Action: nan', 'Drug: racecadotril | Target: angiotensin-converting enzyme | Type: enzyme | Action: nan', 'Drug: racecadotril | Target: angiotensin-converting enzyme | Type: enzyme | Action: nan', 'Drug: telmisartan | Target: angiotensin-converting enzyme | Type: enzyme | Action: nan', 'Drug: proline | Target: angiotensin-converting enzyme | Type: enzyme | Action: nan', 'Drug: captopril | Target: angiotensin-converting enzyme | Type: enzyme | Action: INHIBITOR']


In [13]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

class HybridSearch:
    def __init__(self, df):
        self.df = df
        self.documents = df['DRUG_NAME'] + ' ' + df['TARGET_NAME'] + ' ' + df['interaction_text']
        self.documents = self.documents.fillna('')
        self.inverted_index = self.build_inverted_index()
        self.tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2))
        self.tfidf_matrix = self.tfidf_vectorizer.fit_transform(self.documents)

    def build_inverted_index(self):
        inverted_index = {}
        for idx, doc in enumerate(self.documents):
            for word in doc.lower().split():
                if word not in inverted_index:
                    inverted_index[word] = set()
                inverted_index[word].add(idx)
        return inverted_index

    def search(self, query, top_k=5):
        query_words = query.lower().split()
        candidate_docs = set()
        for word in query_words:
            candidate_docs.update(self.inverted_index.get(word, set()))

        if not candidate_docs:
            return []

        candidate_docs = list(candidate_docs)
        candidate_matrix = self.tfidf_matrix[candidate_docs]
        query_vector = self.tfidf_vectorizer.transform([query])

        similarities = cosine_similarity(query_vector, candidate_matrix).flatten()

        ranked_results = sorted(zip(candidate_docs, similarities), key=lambda x: x[1], reverse=True)

        return [(self.df.iloc[doc_id], score) for doc_id, score in ranked_results[:top_k]]

# Load and preprocess the data
df = pd.read_csv('processed_drug_interactions.csv')

# Initialize the hybrid search
hybrid_search = HybridSearch(df)

# Example search
query = "levobupivacaine ion channel"
results = hybrid_search.search(query)

print(f"Search results for: {query}")
for result, score in results:
    print(f"Score: {score:.4f}")
    print(f"Drug: {result['DRUG_NAME']}")
    print(f"Target: {result['TARGET_NAME']}")
    print(f"Interaction: {result['interaction_text']}")
    print("---")

Search results for: levobupivacaine ion channel
Score: 0.5097
Drug: levobupivacaine
Target: sodium channel protein type 1 subunit alpha
Interaction: Drug: levobupivacaine | Target: sodium channel protein type 1 subunit alpha | Type: ion channel | Action: nan
---
Score: 0.5072
Drug: levobupivacaine
Target: 5-hydroxytryptamine receptor 3a
Interaction: Drug: levobupivacaine | Target: 5-hydroxytryptamine receptor 3a | Type: ion channel | Action: nan
---
Score: 0.5019
Drug: levobupivacaine
Target: sodium channel protein type 4 subunit alpha
Interaction: Drug: levobupivacaine | Target: sodium channel protein type 4 subunit alpha | Type: ion channel | Action: BLOCKER
---
Score: 0.4773
Drug: levobupivacaine
Target: potassium voltage-gated channel subfamily h member 2
Interaction: Drug: levobupivacaine | Target: potassium voltage-gated channel subfamily h member 2 | Type: ion channel | Action: nan
---
Score: 0.4773
Drug: levobupivacaine
Target: potassium voltage-gated channel subfamily d member

In [None]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.model_selection import train_test_split

# Load the data
df = pd.read_csv('processed_drug_interactions.csv')

# Fill missing values in ACTION_TYPE with a default category, e.g., 'Unknown'
df['ACTION_TYPE'] = df['ACTION_TYPE'].fillna('Unknown')  # Fixed assignment without inplace=True

# Prepare the dataset
texts = df['interaction_text'].tolist()

# Convert ACTION_TYPE to categorical codes (including 'Unknown' as a new category)
df['ACTION_TYPE'] = df['ACTION_TYPE'].astype('category')
labels = df['ACTION_TYPE'].cat.codes.tolist()

# Split the dataset
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2)

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the data
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)

# Create PyTorch datasets
train_dataset = Dataset.from_dict({
    'input_ids': train_encodings['input_ids'],
    'attention_mask': train_encodings['attention_mask'],
    'labels': train_labels
})
val_dataset = Dataset.from_dict({
    'input_ids': val_encodings['input_ids'],
    'attention_mask': val_encodings['attention_mask'],
    'labels': val_labels
})

# Initialize the model
num_labels = len(df['ACTION_TYPE'].cat.categories)  # Number of unique labels, including 'Unknown'
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

# Set up training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained('./fine_tuned_bert')
tokenizer.save_pretrained('./fine_tuned_bert')


  from .autonotebook import tqdm as notebook_tqdm
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


In [None]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the fine-tuned model and tokenizer
model = BertForSequenceClassification.from_pretrained('./fine_tuned_bert')
tokenizer = BertTokenizer.from_pretrained('./fine_tuned_bert')

# Load the data
df = pd.read_csv('processed_drug_interactions.csv')

# Retrieval Component
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['interaction_text'])

def retrieve_relevant_passages(query, top_k=5):
    query_vec = vectorizer.transform([query])
    similarities = cosine_similarity(query_vec, tfidf_matrix)[0]
    top_indices = similarities.argsort()[-top_k:][::-1]
    return df.iloc[top_indices]['interaction_text'].tolist()

# Generation Component
def generate_response(query, passages):
    combined_input = query + " " + " ".join(passages)
    inputs = tokenizer(combined_input, truncation=True, padding=True, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Get the predicted class (ACTION_TYPE)
    predicted_class = torch.argmax(outputs.logits, dim=1).item()
    action_type = df['ACTION_TYPE'].cat.categories[predicted_class]
    
    # Construct a more informative response
    response = f"Based on the query '{query}' and the retrieved drug interaction information, "
    response += f"the predicted action type is: {action_type}.\n\n"
    response += "Relevant passages:\n"
    for i, passage in enumerate(passages, 1):
        response += f"{i}. {passage}\n"
    
    return response

# RAG Pipeline
def rag_pipeline(query):
    relevant_passages = retrieve_relevant_passages(query)
    response = generate_response(query, relevant_passages)
    return response

# Example usage
def process_user_query():
    while True:
        user_query = input("Enter your drug interaction query (or 'quit' to exit): ")
        if user_query.lower() == 'quit':
            break
        result = rag_pipeline(user_query)
        print("\nRAG System Response:")
        print(result)
        print("\n" + "="*50 + "\n")

# Run the interactive query processor
if __name__ == "__main__":
    print("Welcome to the Drug Interaction RAG System!")
    print("You can ask questions about drug interactions.")
    print("Type 'quit' to exit the program.\n")
    process_user_query()