In [1]:
# ! pip install -q google-generativeai pandas

In [2]:
import google.generativeai as genai
import import_ipynb
import pandas as pd
from Utils import build_training_set
from sklearn.metrics import accuracy_score
import json
import time

# Configure GOOGLE_API_KEY

In [3]:
# Read the credentials from the local secrets file (expects a JSON object)
with open('secrets.json', 'r') as secrets_fh:
    secrets = json.load(secrets_fh)

# Pull out the Google API key and register it with the Gemini SDK
GEMINI_API_KEY = secrets["GOOGLE_API_KEY"]
genai.configure(api_key=GEMINI_API_KEY)


# Classify and evaluate with Gemini

#### Chunk size = 512, Sentence alignment = False

In [4]:
# Experiment settings for this run
num_cases, chunk_size = 300, 512
use_title = False
respect_sentence_boundaries = False

# Build the evaluation frame with the project helper from Utils
df_sampled = build_training_set(
    num_cases,
    chunk_size,
    use_title,
    respect_sentence_boundaries,
)

# Distinct labels, rendered as a comma-separated list for the prompt
categories = df_sampled["Category"].unique()
categories_str = str.join(", ", categories)

# Gemini client that classify_text sends its prompts to
model = genai.GenerativeModel("gemini-pro")

# Define classification function
def classify_text(text):
    """Ask Gemini to assign ``text`` to one of the dataset categories.

    The prompt lists the categories held in the module-level ``categories_str``
    and is sent to the module-level ``model``.

    Args:
        text: The text chunk to classify.

    Returns:
        The model's reply with surrounding whitespace removed, or ``"Unknown"``
        when the reply is empty, whitespace-only, or carries no usable text
        (for example a blocked response).

    Raises:
        Whatever ``model.generate_content`` raises; request errors are not
        swallowed here.
    """
    prompt = f"""
    You are an AI assistant trained for text classification. Categorize the given text into one of the following categories:
    {categories_str}.
    
    Text: "{text}"
    
    Respond only with the category name.
    """
    response = model.generate_content(prompt)
    # Fixed pause after every request -- presumably to stay under the API rate limit.
    time.sleep(6)

    try:
        # Read the accessor once: `response.text` raises ValueError when the
        # response holds no valid text part (e.g. the answer was blocked).
        reply = response.text
    except ValueError:
        return "Unknown"

    label = reply.strip() if reply else ""
    return label if label else "Unknown"

# Label every chunk with Gemini (classify_text sends one request per row)
df_sampled["Predicted_Category"] = df_sampled["Chunk"].map(classify_text)

# Fraction of rows whose predicted label equals the true label
accuracy = accuracy_score(
    y_true=df_sampled["Category"],
    y_pred=df_sampled["Predicted_Category"],
)
print(f"Classification Accuracy: {accuracy:.2%}")

# Write the labelled rows to a CSV in the working directory
df_sampled.to_csv("gemini_classified_texts.csv", index=False)
print("Classification complete! Results saved as 'gemini_classified_texts.csv'.")

Classification Accuracy: 57.00%
Classification complete! Results saved as 'gemini_classified_texts.csv'.


Classification Accuracy: 57.00%
Classification complete! Results saved as 'gemini_classified_texts.csv'.


#### Chunk size = 512, Sentence alignment = True

In [7]:
# Experiment settings for this run
num_cases, chunk_size = 300, 512
use_title = False
respect_sentence_boundaries = True

# Build the evaluation frame with the project helper from Utils
df_sampled = build_training_set(
    num_cases,
    chunk_size,
    use_title,
    respect_sentence_boundaries,
)

# Distinct labels, rendered as a comma-separated list for the prompt
categories = df_sampled["Category"].unique()
categories_str = str.join(", ", categories)

# Gemini client that classify_text sends its prompts to
model = genai.GenerativeModel("gemini-pro")

# Define classification function
def classify_text(text):
    """Ask Gemini to assign ``text`` to one of the dataset categories.

    The prompt lists the categories held in the module-level ``categories_str``
    and is sent to the module-level ``model``.

    Args:
        text: The text chunk to classify.

    Returns:
        The model's reply with surrounding whitespace removed, or ``"Unknown"``
        when the reply is empty, whitespace-only, or carries no usable text
        (for example a blocked response).

    Raises:
        Whatever ``model.generate_content`` raises; request errors are not
        swallowed here.
    """
    prompt = f"""
    You are an AI assistant trained for text classification. Categorize the given text into one of the following categories:
    {categories_str}.
    
    Text: "{text}"
    
    Respond only with the category name.
    """
    response = model.generate_content(prompt)
    # Fixed pause after every request -- presumably to stay under the API rate limit.
    time.sleep(6)

    try:
        # Read the accessor once: `response.text` raises ValueError when the
        # response holds no valid text part (e.g. the answer was blocked).
        reply = response.text
    except ValueError:
        return "Unknown"

    label = reply.strip() if reply else ""
    return label if label else "Unknown"

# Label every chunk with Gemini (classify_text sends one request per row)
df_sampled["Predicted_Category"] = df_sampled["Chunk"].map(classify_text)

# Fraction of rows whose predicted label equals the true label
accuracy = accuracy_score(
    y_true=df_sampled["Category"],
    y_pred=df_sampled["Predicted_Category"],
)
print(f"Classification Accuracy: {accuracy:.2%}")

# Write the labelled rows to a CSV in the working directory
df_sampled.to_csv("gemini_classified_texts_sentence_alignment_true.csv", index=False)
print("Classification complete! Results saved as 'gemini_classified_texts_sentence_alignment_true.csv'.")

Classification Accuracy: 60.00%
Classification complete! Results saved as 'gemini_classified_texts_sentence_alignment_true.csv'.


Classification Accuracy: 60.00%
Classification complete! Results saved as 'gemini_classified_texts_sentence_alignment_true.csv'.