In [1]:
import pandas as pd

  from pandas.core import (


In [None]:
# One-off conversion: read the passages corpus from Parquet and write it out as CSV.
# NOTE(review): to_csv is called with its defaults, so the DataFrame index is
# written as an extra unnamed first column — confirm that this is intended.
pd.read_parquet('passages.parquet').to_csv('passages.csv')

In [None]:
# Load the exported passages back so that rows with missing values can be
# dropped in the following cell.
df2 = pd.read_csv('passages.csv')

In [None]:
# Drop every row that has a missing value in any column and overwrite the CSV.
# index=False keeps the file's columns stable: this cell rewrites the file it
# was loaded from, so writing the row index here would add one more
# "Unnamed: 0"-style column each time the read/clean cells are re-run.
df2.dropna().to_csv('passages.csv', index=False)

In [2]:
pip install langchain langchain-google-genai faiss-cpu pandas sentence-transformers



Note: you may need to restart the kernel to use updated packages.


In [3]:
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAI

from  langchain.embeddings import HuggingFaceEmbeddings

from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
import pandas as pd

In [25]:
class LangChainRAG:
    """
    Retrieval-augmented question answering over a CSV of text passages.

    Passages are chunked, embedded with a local sentence-transformers model,
    stored in a FAISS index, and the retrieved chunks are "stuffed" into a
    prompt that is answered by a Gemini LLM.

    Typical use:
        rag = LangChainRAG(google_api_key)
        rag.load_and_index_passages("passages.csv")
        answer, sources = rag.get_answer("...")
    """

    def __init__(self, google_api_key: str ):
        """
        Initialize the RAG system with LangChain components

        Args:
            google_api_key: Google API key for Gemini
        """
        # Initialize the Gemini LLM
        self.llm = GoogleGenerativeAI(
            model="gemini-pro",
            google_api_key=google_api_key,
            temperature=0.3
        )

        # Initialize embeddings (using a free model to avoid additional API costs)
        self.embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        )

        # Initialize text splitter
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=500,
            chunk_overlap=50,
            length_function=len
        )

        # Both are populated by load_and_index_passages()
        self.vectorstore = None
        self.qa_chain = None

    def load_and_index_passages(self, csv_path: str, passage_column: str = 'passage', k: int = 3):
        """
        Load passages from CSV, split them, and create a vector store and QA chain

        Args:
            csv_path: Path to CSV file containing passages
            passage_column: Name of the column containing passages
            k: Number of chunks the retriever returns per query (default 3)

        Raises:
            KeyError: If passage_column is not a column of the CSV
            ValueError: If the column contains no non-missing passages
        """
        # Load passages from CSV
        df = pd.read_csv(csv_path)
        if passage_column not in df.columns:
            raise KeyError(
                f"Column {passage_column!r} not found in {csv_path!r}; "
                f"available columns: {list(df.columns)}"
            )

        # read_csv yields float NaN for empty cells and numbers for numeric
        # cells; str.join() rejects both, so drop the former and cast the rest.
        passages = df[passage_column].dropna().astype(str).tolist()
        if not passages:
            raise ValueError(
                f"Column {passage_column!r} of {csv_path!r} contains no passages to index"
            )

        # Split texts into chunks
        texts = self.text_splitter.split_text('\n'.join(passages))

        # Create vector store
        self.vectorstore = FAISS.from_texts(
            texts,
            self.embeddings
        )

        # Create prompt template
        prompt_template = """Use the following pieces of context to answer the question at the end. 
        If you don't know the answer or can't find it in the context, just say that you don't know, 
        don't try to make up an answer.

        Context: {context}

        Question: {question}

        Answer:"""

        PROMPT = PromptTemplate(
            template=prompt_template,
            input_variables=["context", "question"]
        )

        # Create QA chain
        self.qa_chain = RetrievalQA.from_chain_type(
            llm=self.llm,
            chain_type="stuff",
            retriever=self.vectorstore.as_retriever(
                search_kwargs={"k": k}
            ),
            return_source_documents=True,
            chain_type_kwargs={"prompt": PROMPT}
        )

    def get_answer(self, query: str) -> tuple:
        """
        Get answer for a query using the RAG system

        Args:
            query: User question

        Returns:
            Tuple containing (answer, source_documents)

        Raises:
            ValueError: If load_and_index_passages() has not been called yet
        """
        # Compare against None explicitly rather than relying on the chain
        # object's truthiness.
        if self.qa_chain is None:
            raise ValueError("Please load and index passages first using load_and_index_passages()")

        result = self.qa_chain({"query": query})
        return result["result"], result["source_documents"]

In [26]:
# Initialize the RAG system.
# The Gemini API key is a secret and must never be stored in the notebook:
# it is read from the GOOGLE_API_KEY environment variable, falling back to an
# interactive (non-echoing) prompt when the variable is unset or empty.
import os
from getpass import getpass

google_api_key = os.environ.get("GOOGLE_API_KEY") or getpass("Google API key: ")
rag = LangChainRAG(google_api_key)

In [27]:
# Load and index passages: reads passages.csv (default 'passage' column),
# builds the FAISS vector store and prepares the retrieval QA chain on `rag`.
rag.load_and_index_passages("passages.csv")

In [28]:
# Load the evaluation set; the cells below use its first column as the question text.
df3= pd.read_parquet('test.parquet')

In [29]:
# Smoke test: answer the first question of the test set and show the
# retrieved chunks the answer was based on.
question = df3.iloc[0, 0]
answer, sources = rag.get_answer(question)

# One print per section; sep="\n" reproduces the line-by-line layout.
print(f"Question: {question}", f"\nAnswer: {answer}", "\nSources:", sep="\n")
for i, doc in enumerate(sources, start=1):
    print(f"\nSource {i}:", doc.page_content, sep="\n")

Question: Is Hirschsprung disease a mendelian or a multifactorial disorder?

Answer: Multifactorial

Sources:

Source 1:
other syndromes related to cholesterol and fat metabolism among others. The 
genetics of Hirschsprung's disease are highly complex with the majority of known 
genetic sites relating to the main susceptibility pathways (RET an EDNRB). 
Non-syndromic non-familial, short-segment HSCR appears to represent a 
non-Mendelian condition with variable expression and sex-dependent penetrance. 
Syndromic and familial forms, on the other hand, have complex patterns of

Source 2:
variable pattern of inheritance. Although Hirschsprung's disease occurs as an 
isolated phenotype in at least 70% of cases, it is not infrequently associated 
with a number of congenital abnormalities and associated syndromes, 
demonstrating a spectrum of congenital anomalies. Certain of these syndromic 
phenotypes have been linked to distinct genetic sites, indicating underlying 
genetic associations of 

In [30]:
# Every question of the test set (its first column), plus an empty list that
# will receive one predicted answer per question.
questions = list(df3.iloc[:, 0])
pred_answers = []


In [None]:
# Answer every question that does not have a prediction yet.
# Starting at len(pred_answers) makes this cell safe to re-run: after an
# interruption (e.g. an API timeout) it resumes where it stopped instead of
# restarting from the first question and appending duplicate answers, which
# would leave pred_answers misaligned with questions.
for q in questions[len(pred_answers):]:
    answer, sources = rag.get_answer(q)
    pred_answers.append(answer)

Retrying langchain_google_genai.llms._completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised DeadlineExceeded: 504 Deadline Exceeded.
E1022 19:43:01.667337000 8606249536 ssl_transport_security_utils.cc:105] Corruption detected.
E1022 19:43:01.667936000 8606249536 ssl_transport_security_utils.cc:61] error:1e000065:Cipher functions:OPENSSL_internal:BAD_DECRYPT
E1022 19:43:01.668658000 8606249536 ssl_transport_security_utils.cc:61] error:1000008b:SSL routines:OPENSSL_internal:DECRYPTION_FAILED_OR_BAD_RECORD_MAC
E1022 19:43:01.668670000 8606249536 secure_endpoint.cc:305]            Decryption error: TSI_DATA_CORRUPTED
E1022 19:43:01.670568000 8606249536 ssl_transport_security_utils.cc:78] SSL_write failed with error SSL_ERROR_SSL.
Retrying langchain_google_genai.llms._completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised Unknown: None Stream removed.
Retrying langchain_google_genai.llms._completion_with_retry.<locals>._completion_with_

In [None]:
# Pair each question with its predicted answer (two columns: question, answer).
ans_df = pd.DataFrame(data=dict(question=questions, answer=pred_answers))

In [None]:
# Write the question/answer table to the submission file.
# NOTE(review): to_csv is called with its defaults, so the row index is written
# as an unnamed first column — confirm the expected submission format.
ans_df.to_csv('final_submission.csv')