In [1]:
import json
import os
import re  # Importing the re module for regular expressions

import google.generativeai as genai
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

# Step 1: Load the CSV file into a DataFrame
file_path = 'Extended_Transaction_Dataset.csv'
transactions_df = pd.read_csv(file_path)

# Step 2: Configure the Gemini API.
# The key is read from the environment instead of being hard-coded, so the
# credential is never stored in (or committed with) this notebook.
# NOTE: any key that was ever saved in a copy of this file should be revoked.
api_key = os.environ.get("GOOGLE_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the GOOGLE_API_KEY environment variable to your Gemini API key."
    )
genai.configure(api_key=api_key)

# Step 3: Extract unique transaction types.
# Missing values are dropped and the rest coerced to str, because the
# str.join() call below raises TypeError on NaN or any non-string entry.
unique_transaction_types = (
    transactions_df['Transaction_Type'].dropna().astype(str).unique()
)

# Step 4: Send all unique transaction types in a single prompt with a structured format request
bulk_prompt = """
Categorize the following transaction types and return the results in a JSON format where each category is a key and the transaction types that belong to that category are listed as an array of values. For example:
{
  "Transportation": ["Uber", "Taxi"],
  "Food": ["Restaurant", "Grocery Store"],
  ...
}
do not leave out any transaction type
Transaction types:
"""
# One transaction type per line, appended after the instructions.
bulk_prompt += "\n".join(unique_transaction_types)

response = genai.generate_text(
    model="models/text-bison-001",  # Use the appropriate model name
    prompt=bulk_prompt
)

# Step 5: Print the raw response for debugging
print("Raw response from the model:")
print(response.result)

# Step 6: Parse the model response into {category: [transaction types]}
category_pattern = r'\"(.*?)\":\s*\[(.*?)\]'


def _parse_category_mapping(raw_text):
    """Return a ``{category: [transaction types]}`` dict parsed from *raw_text*.

    The span from the first ``{`` to the last ``}`` is tried as JSON first;
    slicing that span also discards any Markdown code fence around the
    object. If the span is missing, is not valid JSON (for example the
    output was cut off or contains a literal ``...``), or is not an object
    whose values are all arrays, the text is scanned with
    ``category_pattern`` for ``"name": [ ... ]`` pairs instead.
    """
    start, end = raw_text.find('{'), raw_text.rfind('}')
    if start != -1 and end > start:
        try:
            parsed = json.loads(raw_text[start:end + 1])
        except json.JSONDecodeError:
            parsed = None
        if isinstance(parsed, dict) and all(
            isinstance(items, list) for items in parsed.values()
        ):
            return {
                str(category): [str(item) for item in items]
                for category, items in parsed.items()
            }

    # Fallback: tolerant regex scan; a repeated category keeps its last list.
    mapping = {}
    for category, array_body in re.findall(category_pattern, raw_text, re.DOTALL):
        mapping[category] = re.findall(r'\"(.*?)\"', array_body)
    return mapping


# response.result is None when the model returns no text; treat that as an
# empty response instead of failing on None.strip().
raw_response = (response.result or "").strip()
category_mapping = _parse_category_mapping(raw_response)

# Step 7: Create a reverse mapping from transaction type to category
# (if a type is listed under several categories, the last one wins)
transaction_to_category = {
    transaction: category
    for category, transactions in category_mapping.items()
    for transaction in transactions
}

# Step 8: Assign categories to the dataset
# Step 9: Transaction types missing from the mapping come back as NaN from
# map(); label them "other". The result is assigned in one statement because
# calling fillna(inplace=True) on a column selection is chained assignment,
# which does not update the DataFrame under pandas Copy-on-Write.
transactions_df['Gemini_Category'] = (
    transactions_df['Transaction_Type'].map(transaction_to_category).fillna('other')
)

# Step 10: Fit a bag-of-words Naive Bayes classifier that predicts the
# Gemini-assigned category from the transaction type text.
# 80/20 train/test split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    transactions_df['Transaction_Type'],
    transactions_df['Gemini_Category'],
    test_size=0.2,
    random_state=42,
)

# fit() returns the pipeline itself, so build and train in one expression.
model = make_pipeline(CountVectorizer(), MultinomialNB()).fit(X_train, y_train)

# Evaluate on the held-out split and report the score as a percentage.
accuracy = model.score(X_test, y_test)
print(f"Model accuracy: {accuracy * 100:.2f}%")

# Step 11: Run the trained pipeline on a couple of example transaction types
new_transaction_types = [
    "New Transaction 1",
    "New Transaction 2",
]
predicted_categories = model.predict(new_transaction_types)
print("Predicted categories for new transactions:", predicted_categories)

# Step 12: Write the DataFrame (including the new category column) to CSV,
# omitting the row index.
output_file_path = 'final_categorized_transactions.csv'
transactions_df.to_csv(path_or_buf=output_file_path, index=False)
print(f"Updated dataset saved to '{output_file_path}'")


  from .autonotebook import tqdm as notebook_tqdm


Raw response from the model:
```json
{
  "Food": ["Bakery", "Butcher", "Coffee Shop", "Fast Food", "Grocery Store", "Restaurant"],
  "Health": ["Doctor's Visit", "Dentist", "Pharmacy", "Vet Visit"],
  "Home": ["Home Improvement", "House Cleaning", "Home Insurance", "Property Tax"],
  "Personal Care": ["Beauty Salon", "Florist", "Gift Shop", "Hair Salon", "Spa"],
  "Transportation": ["Airline Ticket", "Car Rental", "Courier Service", "Gas Station", "Parking", "Public Transport", "Taxi", "Uber"],
  "Entertainment": ["Concert Ticket", "Movie Theater", "Netflix", "Online Course", "Subscription Box", "Streaming Service"],
  "Shopping": ["Amazon", "Book Store", "Clothing Store", "Electronics Store", "Furniture Store", "Hardware Purchase", "Music Instrument Store"],
  "Services": ["Accounting Service", "Business Service", "Childcare Service", "Consulting Fee", "Education Fee", "Freelance Payment", "Gardening Service", "Legal Service", "Maintenance Fee"],
  "Investments": ["Investment", "Mortg