In [1]:
import google.generativeai as genai
import dotenv
import os
# Load variables from a .env file into the process environment before anything
# reads them (presumably this is where GOOGLE_API_KEY is defined — confirm).
dotenv.load_dotenv()
import pandas as pd
import time

In [2]:
# Load the cleaned question/answer dataset; `df` is the frame that is later
# passed to FinetunePreprocessor.generate_reasoning_new.
df = pd.read_csv('../dataset/cleaned_dataset_answer_improved.csv')
# Preview the first rows to confirm the expected columns are present.
df.head()

Unnamed: 0,Question,Answer
0,Is a high school diploma required for an F-1 v...,A high school diploma or its equivalent is gen...
1,Is it important to memorize my SEVIS ID?,"It's crucial to know your SEVIS ID, as it's yo..."
2,Is proof of housing required at the port of en...,While proof of housing is not always required ...
3,What document does a school provide for an F-1...,"A school provides Form I-20, a Certificate of ..."
4,What if I plan to do research collaboration wi...,If asked about potential research collaboratio...


In [3]:
# Dimensions of the loaded dataset as (rows, columns).
df.shape

(1000, 2)

In [4]:
class FinetunePreprocessor:
    """Generate model-written reasoning for question/answer pairs and save it to CSV.

    For every row of an input DataFrame the question and answer are sent to a
    Gemini chat model configured with a fixed system instruction, and the text
    that comes back is stored in a new ``reasoning`` column. Rows are appended
    to the output CSV in chunks so that a long run does not lose everything if
    it is interrupted.
    """

    # Default location of the generated dataset.
    DEFAULT_OUTPUT_FILE = '../dataset/cleaned_dataset_answer_improved_reasoned.csv'
    # Pacing (seconds) used to stay under the API rate limit.
    REQUEST_PAUSE_SECONDS = 3
    RETRY_PAUSE_SECONDS = 10
    # Every EXTRA_PAUSE_EVERY rows, pause for an additional EXTRA_PAUSE_SECONDS.
    EXTRA_PAUSE_EVERY = 15
    EXTRA_PAUSE_SECONDS = 10
    # Number of buffered rows that triggers a write to disk.
    SAVE_EVERY = 50

    def __init__(self, model_name: str = 'gemini-2.0-flash-exp') -> None:
        """Configure the Gemini client and build the chat model.

        Args:
            model_name: Name of the generative model to use.

        Raises:
            ValueError: If the GOOGLE_API_KEY environment variable is not set.
        """
        # Retrieve the API key from environment variables
        api_key = os.getenv("GOOGLE_API_KEY")
        if not api_key:
            raise ValueError("GOOGLE_API_KEY environment variable is not set.")

        # Configure the generative AI client
        genai.configure(api_key=api_key)

        # Define the finetuning prompt.
        # NOTE: the {question}/{answer} placeholders below are never formatted;
        # the text is passed verbatim as the system instruction and the actual
        # question/answer are supplied in each request.
        self.finetuning_prompt = """Act as a legal expert analyzing a complex question. Generate detailed reasoning that leads to the provided answer. Follow these steps:

1. **Question Understanding**:
   - Break down the key legal concepts, jurisdictions, and implied context in the question
   - Identify potential ambiguities or multiple interpretations

2. **Core Analysis**:
   - Explain the primary legal framework(s) applicable
   - Reference relevant statutes, case law, and legal principles
   - Outline logical steps connecting facts to legal conclusions

3. **Alternative Perspectives**:
   - Present 2-3 plausible counterarguments or different interpretations
   - Consider opposing viewpoints and conflicting precedents
   - Discuss edge cases or exceptional circumstances

4. **Self-Reflection**:
   - Evaluate the strength of each perspective
   - Identify potential weaknesses in the main answer's reasoning
   - Explain why the provided answer is preferable despite alternatives

5. **Conclusion Synthesis**:
   - Clearly restate how the reasoning supports the final answer
   - Acknowledge any remaining uncertainties or limitations

Format Requirements:
- Use clear section headers without markdown
- Maintain academic tone but avoid unnecessary jargon
- Keep paragraphs concise (2-3 sentences)
- Reference specific legal concepts when possible

Input:
Question: {question}
Answer: {answer}

Output: 
[Only provide the reasoning text using the specified structure]"""

        # Initialize the generative model with the finetuning prompt
        self.chat_model = genai.GenerativeModel(model_name, system_instruction=self.finetuning_prompt)

    def generate_reasoning_new(
        self,
        df: pd.DataFrame,
        output_file: str = DEFAULT_OUTPUT_FILE,
        max_retries: int = 3,
    ) -> None:
        """
        Generates reasoning for each question-answer pair in the DataFrame and saves the results to a CSV file.

        Rows are appended to ``output_file``; an existing file is extended, not
        replaced. A row whose generation fails ``max_retries`` times is still
        written, with ``reasoning`` left empty.

        Args:
            df (pd.DataFrame): A DataFrame containing 'Question' and 'Answer' columns.
            output_file (str): Path of the CSV file the processed rows are appended to.
            max_retries (int): Maximum number of attempts per row before giving up.
        """
        pending_rows = []

        # Use the positional counter (not the index label) for pacing, so frames
        # with a non-integer or non-contiguous index are handled correctly.
        for position, (index, row) in enumerate(df.iterrows()):
            question = row['Question']
            answer = row['Answer']
            reasoning = None

            for attempt in range(1, max_retries + 1):
                try:
                    # Generate reasoning using the chat model
                    response = self.chat_model.generate_content(f"Question: {question}\nAnswer: {answer}")
                    time.sleep(self.REQUEST_PAUSE_SECONDS)  # Rate limiting
                    # `.text` raises when the response carries no usable part
                    # (e.g. blocked output), which is handled as a failed attempt.
                    reasoning = response.text
                    break  # Exit retry loop on success
                except Exception as e:
                    print(f"Error generating reasoning for row {index}: {e}")
                    if attempt < max_retries:
                        print(f"Retrying ({attempt}/{max_retries}) after {self.RETRY_PAUSE_SECONDS} seconds...")
                        time.sleep(self.RETRY_PAUSE_SECONDS)
                    else:
                        print(f"Max retries reached. Setting reasoning to None for row {index}.")

            # Prepare row for CSV saving
            new_row = row.to_dict()
            new_row['reasoning'] = reasoning
            pending_rows.append(new_row)

            # Rate limiting
            time.sleep(self.REQUEST_PAUSE_SECONDS)
            if position % self.EXTRA_PAUSE_EVERY == 0:
                time.sleep(self.EXTRA_PAUSE_SECONDS)  # Extra sleep every 15 rows

            # Save progress every 50 rows. The buffer is only cleared after a
            # successful write; otherwise the rows stay queued and the write is
            # attempted again after the next row.
            if len(pending_rows) >= self.SAVE_EVERY and self._save_chunk(pending_rows, output_file):
                pending_rows = []

        # Save remaining rows
        if pending_rows and not self._save_chunk(pending_rows, output_file):
            print(f"Warning: {len(pending_rows)} rows could not be saved to {output_file}")

    def _save_chunk(self, chunk: list, output_file: str) -> bool:
        """Helper function to save a chunk of processed rows to CSV.

        The chunk is appended to ``output_file``. The header row is written only
        when the file does not exist yet or is empty.

        Args:
            chunk: List of row dictionaries to write.
            output_file: Path of the CSV file to append to.

        Returns:
            True if the chunk was written (or was empty), False if writing failed.
        """
        if not chunk:
            return True

        df_chunk = pd.DataFrame(chunk)
        header = not os.path.exists(output_file) or os.path.getsize(output_file) == 0
        try:
            df_chunk.to_csv(output_file, mode='a', header=header, index=False)
        except Exception as e:
            print(f"Error saving chunk: {e}")
            return False
        print(f"Saved {len(chunk)} rows to {output_file}")
        return True


In [5]:
# Build the preprocessor (raises ValueError if GOOGLE_API_KEY is not set) and run
# the full generation pass over `df`.
# NOTE: results are appended to the output CSV, so re-running this cell against an
# existing output file adds duplicate rows — remove the file first for a clean run.
preprocessor = FinetunePreprocessor()
preprocessor.generate_reasoning_new(df)

Saved 50 rows to ../dataset/cleaned_dataset_answer_improved_reasoned.csv
Saved 50 rows to ../dataset/cleaned_dataset_answer_improved_reasoned.csv
Saved 50 rows to ../dataset/cleaned_dataset_answer_improved_reasoned.csv
Saved 50 rows to ../dataset/cleaned_dataset_answer_improved_reasoned.csv
Error generating reasoning for row 226: Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai.google.dev/api/generate-content#finishreason) is 4. Meaning that the model was reciting from copyrighted material.
Retrying (1/3) after 10 seconds...
Saved 50 rows to ../dataset/cleaned_dataset_answer_improved_reasoned.csv
Saved 50 rows to ../dataset/cleaned_dataset_answer_improved_reasoned.csv
Saved 50 rows to ../dataset/cleaned_dataset_answer_improved_reasoned.csv
Error generating reasoning for row 399: 504 Deadline Exceeded
Retrying (1/3) after 10 seconds...
Saved 50 rows to ../dataset/clea

In [7]:
# Reload the generated file and check its dimensions: the row count should match
# the input dataset, with one extra column for the generated reasoning.
result = pd.read_csv('../dataset/cleaned_dataset_answer_improved_reasoned.csv')
result.shape

(1000, 3)