In [7]:
import google.generativeai as genai
import dotenv
import os
dotenv.load_dotenv()
import pandas as pd
import time

In [8]:
df = pd.read_csv('../dataset/cleaned_dataset.csv')
df.head()

Unnamed: 0,Question,Answer
0,How do I apply for CPT?,You must get approval from your Designated Sch...
1,What documents are required for CPT?,Typically you need:\n- CPT Request Form from y...
2,How long does CPT processing take?,Processing usually takes 5-10 business days af...
3,Can I work before receiving CPT authorization?,No you must wait for approval and receive an u...
4,Do I need to pay any fees for CPT?,No CPT does not require a separate application...


In [9]:
# drop Na
df = df.dropna()

In [15]:
import os
import google.generativeai as genai
import pandas as pd

class FinetunePreprocessor:
    def __init__(self) -> None:
        # Retrieve the API key from environment variables
        api_key = os.getenv("GOOGLE_API_KEY")
        if not api_key:
            raise ValueError("GOOGLE_API_KEY environment variable is not set.")
        
        # Configure the generative AI model
        model_name = 'gemini-2.0-flash-exp'
        genai.configure(api_key=api_key)

        # Define the finetuning prompt
        self.finetuning_prompt = """Act as a legal expert analyzing a complex question. Generate detailed reasoning that leads to the provided answer. Follow these steps:

1. **Question Understanding**:
   - Break down the key legal concepts, jurisdictions, and implied context in the question
   - Identify potential ambiguities or multiple interpretations

2. **Core Analysis**:
   - Explain the primary legal framework(s) applicable
   - Reference relevant statutes, case law, and legal principles
   - Outline logical steps connecting facts to legal conclusions

3. **Alternative Perspectives**:
   - Present 2-3 plausible counterarguments or different interpretations
   - Consider opposing viewpoints and conflicting precedents
   - Discuss edge cases or exceptional circumstances

4. **Self-Reflection**:
   - Evaluate the strength of each perspective
   - Identify potential weaknesses in the main answer's reasoning
   - Explain why the provided answer is preferable despite alternatives

5. **Conclusion Synthesis**:
   - Clearly restate how the reasoning supports the final answer
   - Acknowledge any remaining uncertainties or limitations

Format Requirements:
- Use clear section headers without markdown
- Maintain academic tone but avoid unnecessary jargon
- Keep paragraphs concise (2-3 sentences)
- Reference specific legal concepts when possible

Input:
Question: {question}
Answer: {answer}

Output: 
[Only provide the reasoning text using the specified structure]"""

        # Initialize the generative model with the finetuning prompt
        self.chat_model = genai.GenerativeModel(model_name, system_instruction=self.finetuning_prompt)

    def generate_bulk_reasoning(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Generates reasoning for each question-answer pair in the DataFrame.

        Args:
            df (pd.DataFrame): A DataFrame containing 'question' and 'answer' columns.

        Returns:
            pd.DataFrame: The updated DataFrame with an additional 'reasoning' column.
        """
        for index, row in df.iterrows():
            time.sleep(5)
            if index%10 == 0:
                time.sleep(15)
            question = row['Question']
            answer = row['Answer']
            try:
                response = self.chat_model.generate_content(f"Question: {question}\nAnswer: {answer}")
                reasoning = response.text
                df.at[index, 'reasoning'] = reasoning
            except Exception as e:
                print(f"Error generating reasoning for row {index}: {e}")
                df.at[index, 'reasoning'] = None

        return df

In [16]:

df = df.head(5)
preprocessor = FinetunePreprocessor()
df = preprocessor.generate_bulk_reasoning(df)
print(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.at[index, 'reasoning'] = reasoning


                                         Question  \
0                         How do I apply for CPT?   
1            What documents are required for CPT?   
2              How long does CPT processing take?   
3  Can I work before receiving CPT authorization?   
4              Do I need to pay any fees for CPT?   

                                              Answer  \
0  You must get approval from your Designated Sch...   
1  Typically you need:\n- CPT Request Form from y...   
2  Processing usually takes 5-10 business days af...   
3  No you must wait for approval and receive an u...   
4  No CPT does not require a separate application...   

                                           reasoning  
0  Question Understanding\nThe question asks abou...  
1  Question Understanding\nThe question concerns ...  
2  Question Understanding\nThe question asks abou...  
3  Question Understanding\nThe question concerns ...  
4  Question Understanding\nThe question concerns ...  


In [17]:
df.head()

Unnamed: 0,Question,Answer,reasoning
0,How do I apply for CPT?,You must get approval from your Designated Sch...,Question Understanding\nThe question asks abou...
1,What documents are required for CPT?,Typically you need:\n- CPT Request Form from y...,Question Understanding\nThe question concerns ...
2,How long does CPT processing take?,Processing usually takes 5-10 business days af...,Question Understanding\nThe question asks abou...
3,Can I work before receiving CPT authorization?,No you must wait for approval and receive an u...,Question Understanding\nThe question concerns ...
4,Do I need to pay any fees for CPT?,No CPT does not require a separate application...,Question Understanding\nThe question concerns ...


In [18]:
df.to_csv('reasoning.csv', index=False)