In [16]:
import pandas as pd
import requests
import json
import time
from google.colab import files
import os

# Constants
# SECURITY: the API key must never be committed to source or left in a shared
# notebook. It is read from the GEMINI_API_KEY environment variable instead;
# set it before running this cell (e.g. os.environ["GEMINI_API_KEY"] = "...").
# The key that was previously hard-coded here should be treated as compromised
# and rotated.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")
# The key is passed as a query parameter, so this URL is itself a secret:
# avoid printing or logging it.
GEMINI_API_URL = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key={GEMINI_API_KEY}"

# Function to upload CSV file
def upload_csv():
    """Ask the user to upload a CSV through Colab and return its file name.

    Calls ``files.upload()`` and returns the first key of the mapping it
    yields; any additional uploaded files are ignored. An ``IndexError``
    propagates if the mapping is empty.
    """
    print("Please upload your CSV file...")
    uploaded_files = files.upload()
    # Only the first uploaded entry is used.
    chosen_name = tuple(uploaded_files.keys())[0]
    print(f"Uploaded file: {chosen_name}")
    return chosen_name

# Function to call Gemini API

# Closed set of answers the prompt asks the model to choose from.
_INSURANCE_LABELS = (
    "Life", "Health", "Accident", "Motor", "Credit",
    "Liability", "Travel", "Home", "Other",
)
_LABEL_LOOKUP = {label.casefold(): label for label in _INSURANCE_LABELS}

# HTTP statuses treated as transient (rate limiting / server-side failures).
_RETRYABLE_STATUS = frozenset({429, 500, 502, 503, 504})


def _parse_label(result):
    """Pull the label text out of a decoded generateContent response.

    Returns "Other" when the response carries no usable text. When the text
    matches one of the known labels (ignoring case and surrounding
    punctuation/markdown) the canonical spelling is returned; otherwise the
    stripped text is returned unchanged.
    """
    try:
        text = result["candidates"][0]["content"]["parts"][0]["text"]
    except (KeyError, IndexError, TypeError):
        return "Other"
    if not isinstance(text, str):
        return "Other"

    cleaned = text.strip()
    # Tolerate answers such as "health.", "**Health**" or '"Health"'.
    key = cleaned.strip("\"'`*. \t\r\n").casefold()
    return _LABEL_LOOKUP.get(key, cleaned)


def get_insurance_label(category, withdrawal, deposit, ref_no,remark, timeout=30, max_retries=3):
    """Ask the Gemini API which insurance type best fits one transaction.

    Args:
        category: Transaction category text.
        withdrawal: Withdrawal amount (interpolated into the prompt as-is).
        deposit: Deposit amount (interpolated into the prompt as-is).
        ref_no: Transaction reference number.
        remark: Free-text remark of the transaction.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled connection cannot hang the whole batch.
        max_retries: How many extra attempts to make when the API answers
            with a transient status (429 or 5xx). Attempts are separated by
            an exponential backoff of 1s, 2s, 4s, ...

    Returns:
        The label returned by the model (canonicalised when it matches one of
        the known labels), "Other" when the response contains no text,
        "API Error" when the API keeps answering with a non-200 status, or
        "Error" when the request itself fails or the body is not valid JSON.
    """
    # NOTE(review): the prompt maps "medical" to both Health and Life
    # ("...restaurants,smoke ,medical = Life"); this looks contradictory and
    # may skew labels. Left unchanged here so results stay comparable -
    # confirm the intended mapping before editing the prompt.
    # Creating a prompt for the Gemini API
    prompt = f"""

    You are an expert financial assistant. Based on the following transaction, determine the most suitable type of insurance the person might need based on their spending behavior and context. Analyze the remark and category fields like a human would, considering what kind of activity the person is doing.

Choose ONLY from the following insurance types:
- Life
- Health
- Accident
- Motor
- Credit
- Liability
- Travel
- Home

If none of the above applies, respond with "Other".

### Considerations:
- Medical, hospital, pharmacy, diagnostic lab = Health
- Food delivery, dining out, party, snacks, restaurants,smoke ,medical = Life (general wellbeing)
- Bus, train, toll, fuel, ride services (e.g., Uber) , Uber = Travel
- Driving-related, vehicle repairs, fuel station = Motor
- School fees, tuition, courses, educational services = Liability
- Loans, EMI payments, credit cards, finance-related , bond , fund , deposite ,stock,Dividend , related to any finance event = Credit
- Insurance-related payments (home, property, car) = Use exact match: Home, Motor, etc.
- Gym, fitness, sports injuries, risky activities = Accident
- Real estate, property purchases, house repairs = Home

### Transaction:
- Category: {category}
- Remark: {remark}
- Withdrawal amount: {withdrawal}
- Deposit amount: {deposit}
- Reference No: {ref_no}

Respond with only one of the 8 categories or "Other". Do not explain your reasoning.
"""

    # Prepare the payload for the API request
    payload = {
        "contents": [{
            "parts": [{"text": prompt}]
        }]
    }

    attempts = max(0, max_retries) + 1
    for attempt in range(attempts):
        try:
            # json= serialises the payload and sets the JSON Content-Type header.
            response = requests.post(GEMINI_API_URL, json=payload, timeout=timeout)
        except requests.RequestException as e:
            # The request URL embeds the API key and requests includes the URL
            # in many error messages, so redact it before printing.
            message = str(e)
            if GEMINI_API_KEY:
                message = message.replace(GEMINI_API_KEY, "***")
            print(f"Exception during API call: {message}")
            return "Error"

        if response.status_code == 200:
            try:
                result = response.json()
            except ValueError as e:
                print(f"Exception during API call: {e}")
                return "Error"
            return _parse_label(result)

        # Back off and retry on rate limiting / transient server errors
        # instead of recording a permanent "API Error" for the row.
        if response.status_code in _RETRYABLE_STATUS and attempt < attempts - 1:
            time.sleep(2 ** attempt)
            continue

        print(f"API Error: {response.status_code} - {response.text}")
        return "API Error"

    return "API Error"
# Main function to process the CSV file
def _row_field(row, column, default):
    """Return ``row[column]`` as a string, or ``str(default)`` if missing/NaN."""
    value = row.get(column, default)
    return str(value) if pd.notna(value) else str(default)


def process_csv(file_name):
    """Label every transaction in a CSV and save/download the result.

    Reads ``file_name`` with pandas, calls ``get_insurance_label`` once per
    row using the 'Category', 'Withdrawal', 'Deposit', 'RefNo' and 'Remark'
    columns (missing or NaN values fall back to '' / '0'), stores the answer
    in a new 'InsuranceLabel' column, writes the frame to
    ``labeled_<original name>`` next to the input file and triggers a Colab
    download of it.

    Args:
        file_name: Path of the CSV file to process.

    Returns:
        The ``value_counts()`` of the 'InsuranceLabel' column.
    """
    # Read the CSV file
    df = pd.read_csv(file_name)

    # Print the columns to verify
    print(f"Columns in the CSV: {df.columns.tolist()}")

    # Initialize the new column
    df['InsuranceLabel'] = 'Unknown'

    total_rows = len(df)
    print(f"Processing {total_rows} transactions...")

    # Count processed rows separately from the index label so progress stays
    # correct even when the frame's index is not 0..n-1.
    for done, (idx, row) in enumerate(df.iterrows(), start=1):
        label = get_insurance_label(
            _row_field(row, 'Category', ''),
            _row_field(row, 'Withdrawal', 0),
            _row_field(row, 'Deposit', 0),
            _row_field(row, 'RefNo', ''),
            _row_field(row, 'Remark', ''),
        )
        df.at[idx, 'InsuranceLabel'] = label

        if done % 10 == 0 or done == total_rows:
            print(f"Processed {done}/{total_rows} transactions")

        # Small delay between calls to avoid hitting API rate limits; there is
        # nothing to wait for after the final row.
        if done < total_rows:
            time.sleep(0.5)

    # Prefix only the file's base name so inputs given with a directory
    # ("data/tx.csv") produce "data/labeled_tx.csv" rather than the broken
    # "labeled_data/tx.csv".
    directory, base_name = os.path.split(file_name)
    output_file = os.path.join(directory, f"labeled_{base_name}")
    df.to_csv(output_file, index=False)
    print(f"Finished processing. Output saved to {output_file}")

    # Download the processed file
    files.download(output_file)

    # Return a summary
    return df['InsuranceLabel'].value_counts()

# Script entry point: upload a CSV, label it, then show how many
# transactions received each label.
if __name__ == "__main__":
    file_name = upload_csv()
    results = process_csv(file_name)
    # One call emits the heading and the summary on separate lines.
    print("\nLabel Distribution:", results, sep="\n")

Please upload your CSV file...


Saving transaction_dataset.csv to transaction_dataset (2).csv
Uploaded file: transaction_dataset (2).csv
Columns in the CSV: ['Date', 'Remark', 'RefNo', 'ValueDate', 'Withdrawal', 'Deposit', 'Balance', 'Category']
Processing 500 transactions...
Processed 10/500 transactions
Processed 20/500 transactions
Processed 30/500 transactions
Processed 40/500 transactions
Processed 50/500 transactions
Processed 60/500 transactions
Processed 70/500 transactions
Processed 80/500 transactions
Processed 90/500 transactions
Processed 100/500 transactions
Processed 110/500 transactions
Processed 120/500 transactions
Processed 130/500 transactions
Processed 140/500 transactions
Processed 150/500 transactions
Processed 160/500 transactions
Processed 170/500 transactions
Processed 180/500 transactions
Processed 190/500 transactions
Processed 200/500 transactions
Processed 210/500 transactions
Processed 220/500 transactions
Processed 230/500 transactions
Processed 240/500 transactions
Processed 250/500 tr

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Label Distribution:
InsuranceLabel
Other        162
Credit        70
Health        46
Life          45
Home          44
Travel        41
API Error     38
Motor         34
Liability     14
Accident       6
Name: count, dtype: int64
