# Detect PII with Gemini API Call

**Author:** Lang Min  
**Date:** 2 December 2025  
**Email:** min.la@northeastern.edu  

## 1. Setup

In [None]:
import google.generativeai as genai
import pandas as pd
import time
import os
from typing import List, Dict

In [None]:
# Configure the Gemini API.
# The GOOGLE_API_KEY environment variable takes precedence so a real key never
# has to be saved inside the notebook; the placeholder literal is used only
# when that variable is unset.
GOOGLE_API_KEY = os.environ.get(
    "GOOGLE_API_KEY",
    "APIAPIAPIAPIAPIAPIAPIAPIAPIAPIAPIAPI",  # Placeholder: replace it, or set the environment variable
)
genai.configure(api_key=GOOGLE_API_KEY)

# Initialize the model that the processing loop sends every prompt to
model = genai.GenerativeModel('gemini-2.5-flash')


## 2. Load Data

In [None]:
# Load the filtered complaints (point this path at the filtered input file)
df = pd.read_csv('Complaints_0925_filtered.csv')
print("File read successfully.")
print(f'Number of complaints from filtered file: {len(df)}')
print()

# Results are appended to this CSV, one row per processed complaint
output_file = 'result.csv'

if not os.path.exists(output_file):
    # First run: write a header-only file so later appends can omit the header
    existing_results = pd.DataFrame(
        columns=[
            'Complaint ID',
            'Date',
            'Product',
            'Issue',
            'Complaint',
            'Answer',
            'Explanation',
        ]
    )
    existing_results.to_csv(output_file, index=False)
    start_index = 0
    print("Starting fresh processing")
else:
    # Resume: the number of rows already saved is the position of the next
    # complaint to process
    existing_results = pd.read_csv(output_file)
    start_index = existing_results.shape[0]
    print(f"Found existing results. Resuming from row {start_index + 1}")


## 3. Call Gemini API

In [None]:
# Prepare lists to store results
# NOTE: nothing in this cell appends to this list; the processing loop writes
# each result row straight to the output CSV instead.
result_data = []

# Define your detection criteria here
# Prompt template sent to the model once per complaint. `{complaint}` is the
# only placeholder and is filled in with str.format() by the processing loop.
# The template asks for a reply that starts with "Yes" or "No"; the loop relies
# on that when it parses the first word of the response.
DETECTION_PROMPT = """
Analyze the following redacted complaint and answer the question.

Does this redacted complaint still contain any of these four types of personal information? (Note that things like XXXXX means info has been redacted)
1. Basic (names, ages, legal representation)
2. Physical characteristics or personal descriptors (for example: race, ethnicity, sexual orientation, nationality, immigration status, character traits, number of children, religion, and disabilities) 
3. Detailed military/veteran information (for example: ranks, awards, and degrees of disability)
4. Medical conditions (for example: “high blood pressure”, “fibromyalgia”, and “broke my arm”)
Answer with "Yes" or "No". If "Yes", which of the four categories does it belong to? And explain with one short sentence. 

Complaint: {complaint}

"""

# Characters stripped from both ends of the reply's first token before it is
# compared with "Yes"/"No": sentence punctuation, quotes and Markdown emphasis
# markers (so '**Yes**' or '"No."' are still recognised).
_ANSWER_EDGE_CHARS = '.,;:!?*"\''


def _parse_answer(full_response: str) -> tuple:
    """Split a model reply into an ``(answer, explanation)`` pair of strings.

    The first whitespace-delimited token decides the answer; it is matched
    case-insensitively against "Yes"/"No" after edge punctuation is removed.
    The remainder of the reply, minus leading punctuation and spaces, becomes
    the explanation. A reply that is empty or begins with any other word is
    returned as ``("INVALID", full_response)`` so it is stored as-is.
    """
    parts = full_response.split(None, 1)
    if not parts:
        # Empty reply: record it as invalid instead of failing on parts[0]
        return "INVALID", full_response

    first_word = parts[0].strip(_ANSWER_EDGE_CHARS).upper()
    answer = {"YES": "Yes", "NO": "No"}.get(first_word)
    if answer is None:
        return "INVALID", full_response

    explanation = parts[1] if len(parts) > 1 else ""
    return answer, explanation.lstrip('.,;:!? ')


# Process each complaint case, starting after the rows already saved
total_rows = len(df)
for index in range(start_index, total_rows):
    row = df.iloc[index]
    date = row['Date received']
    product = row['Product']
    issue = row['Issue']
    complaint_id = row['Complaint ID']
    complaint = row['Consumer complaint narrative']
    print('A case start!')

    # Create the prompt
    prompt = DETECTION_PROMPT.format(complaint=complaint)

    try:
        # Generate response from Gemini
        response = model.generate_content(prompt)
        full_response = response.text.strip()

        # Parse the response into a Yes/No/INVALID answer plus explanation
        answer, explanation = _parse_answer(full_response)

        print(f"Processed row {index + 1}/{total_rows}: {answer}")

        # Key order must match the header row of the output file, because the
        # row is appended without a header
        result_temp = {
            "Complaint ID": complaint_id,
            "Date": date,
            "Product": product,
            "Issue": issue,
            "Complaint": complaint,
            "Answer": answer,
            "Explanation": explanation,
        }
        result_df = pd.DataFrame([result_temp])
        result_df.to_csv(output_file, mode='a', header=False, index=False)

        # Add a small delay to avoid rate limiting
        time.sleep(2)

    except Exception as e:
        # Report which row failed, then stop. The failed row is not written,
        # and resuming starts from the count of rows already saved, so the
        # same row is attempted again on the next run.
        print(f"Error processing row {index + 1}: {e}")
        raise


## 4. Output

In [None]:
# Final summary: reload every row saved so far (including rows written by
# earlier runs) and tally the answers.
final_results = pd.read_csv(output_file)
answer_column = final_results['Answer']

# Report the path actually written to rather than a hard-coded file name
print(f"\nProcessing complete! Results saved to '{output_file}'")
print(f"Total rows processed: {len(final_results)}")
print(f"Yes count: {(answer_column == 'Yes').sum()}")
print(f"No count: {(answer_column == 'No').sum()}")
# Anything that is neither 'Yes' nor 'No' (e.g. 'INVALID' or a missing value)
print(f"Invalid/Error count: {(~answer_column.isin(['Yes', 'No'])).sum()}")