In [1]:
import json

with open('data/trainingData/labels.json', 'r') as file:
        file = json.load(file)

file

{'study_types': {'Quantitative': {'Experimental': ['Randomized (controlled) trial',
    'Natural experiments'],
   'Quasi-experimental': ['Regression discontinuity',
    'Difference-in-difference',
    'Instrumental variable design',
    'Natural experiments'],
   'Non-experimental': ['Observational',
    'Cross-sectional',
    'Longitudinal / Panel'],
   'Meta-analysis': ['Meta-analysis']},
  'Qualitative': {'Methods': ['Case-study',
    'Ethnography',
    'First-hand observations',
    'Interviews',
    'Focus groups',
    'Recordings'],
   'Mixed Method': ['Systematic review']}},
 'poverty_contexts': {'Low resource level (Difficulty in meeting basic needs)': 'Low resource level',
  'Resource volatility (Income fluctuations, emergency expenses)': 'Resource volatility',
  'Physical environment (Neighborhood quality, noise, violence, crime)': 'Physical environment',
  'Human capital inputs (School quality, health/nutrition, social services)': 'Human capital inputs',
  'Social environme

In [2]:
study_types = [item for subcat in file['study_types'].values() for method_list in subcat.values() for item in method_list]
poverty_context = list(file['poverty_contexts'].values())
mechanisms = [item for subcat in file['mechanisms'].values() for item in subcat]
behaviors = [item for subcat in file['Behaviors'].values() for item in subcat]


In [3]:
import pycountry 

countries = [country.name for country in pycountry.countries]

In [4]:
# import pandas as pd

# excels_dict = {
# }
# excels = []

# for year in range(18, 24):
#     excel = pd.read_csv(f'data/extractedPapers20{year:02}.csv')  # Format year with leading zeros
#     excels_dict[year] = excel
#     excels.append(excel)


In [5]:
all_df = pd.concat(excels, ignore_index=True)


In [6]:
all_df.shape


(87257, 19)

In [7]:
sample_df = all_df.sample(1000, random_state=42)  
sample_df_dict = sample_df.to_dict(orient='records')

In [8]:
all_df_dict = all_df.to_dict(orient='records')

In [9]:
import time
import google.generativeai as genai
import json
import logging
import pycountry
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
import tiktoken
# from collections import defaultdict # Not used, can be removed
# import pandas as pd # Not used, can be removed

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

class GeminiModel:
    def __init__(self, api_keys, max_abstracts_per_batch=5, max_input_tokens=2000, max_output_tokens=1000, delay=2, model_name="gemma-3-27b-it", temperature=0.3,
                 retry_model_name='gemini-2.0-flash', retry_max_input_tokens=10000, retry_max_output_tokens=1000, retry_delay=5, retry_temperature=None, max_retries=10, 
                 retry_model_max_abstracts_per_batch=10):
        """
        Initializes the GeminiModel for parallel abstract classification with numeric optimizations.

        Args:
            api_keys: List of API keys or a single API key string.
            max_input_tokens: Maximum input tokens allowed per batch for initial attempts.
            max_output_tokens: Maximum output tokens allowed per batch for initial attempts.
            delay: Delay in seconds between API calls per key to manage rate limiting for initial attempts.
            model_name: Gemini model name to use for initial attempts (e.g., "gemini-2.0-flash").
            temperature: Controls the randomness of the model's output for initial attempts. Lower values (e.g., 0.1-0.3)
                         result in more deterministic output.
            retry_model_name: Model name to use for retries. If None, uses the initial model_name.
            retry_max_input_tokens: Max input tokens for retry batches. If None, uses initial max_input_tokens.
            retry_max_output_tokens: Max output tokens for retry batches. If None, uses initial max_output_tokens.
            retry_delay: Delay for retry API calls. If None, uses initial delay.
            retry_temperature: Temperature for retry API calls. If None, uses initial temperature.
            max_retries: Maximum number of retry attempts for a single API call.
        """
        # Ensure api_keys is a list
        self.api_keys = [api_keys] if isinstance(api_keys, str) else api_keys
        self.max_input_tokens = max_input_tokens
        self.max_output_tokens = max_output_tokens
        self.max_abstracts_per_batch = max_abstracts_per_batch
        self.model_name = model_name
        self.delay = delay
        self.temperature = temperature
        self.max_retries = max_retries # Max retries for _generate method

        # Retry-specific configurations
        self.retry_model_name = retry_model_name if retry_model_name is not None else self.model_name
        self.retry_max_input_tokens = retry_max_input_tokens if retry_max_input_tokens is not None else self.max_input_tokens
        self.retry_max_output_tokens = retry_max_output_tokens if retry_max_output_tokens is not None else self.max_output_tokens
        self.retry_delay = retry_delay if retry_delay is not None else self.delay
        self.retry_temperature = retry_temperature if retry_temperature is not None else self.temperature
        self.retry_model_max_abstracts_per_batch = retry_model_max_abstracts_per_batch if retry_model_max_abstracts_per_batch is not None else self.max_abstracts_per_batch


        # Initialize tiktoken encoder
        self.tokenizer = tiktoken.get_encoding("o200k_base")

        # Initialize caches and GenerativeModel instances for each API key
        self.caches = {key: {} for key in self.api_keys}
        # Initialize models for both initial and retry configurations
        self.models = {}
        for api_key in self.api_keys:
            self.models[api_key] = {
                'initial': self.model_name,
                'retry': self.retry_model_name
            }
        # Thread locks for each API key to manage per-key rate limiting
        self.locks = {key: threading.Lock() for key in self.api_keys}
        # Shared rate limit tracking across threads
        self.api_key_cooldowns = {key: 0 for key in self.api_keys} # Track cooldown end times
        self.cooldown_lock = threading.Lock()
        # Initialize label mappings; these will be populated by _create_label_mappings
        self.label_mappings = {}
        self.reverse_mappings = {}
        # Output field mappings - converts field names to numbers
        self.field_mappings = {
            'study_type': 0,
            'poverty_context': 1,
            'mechanism': 2,
            'behavior': 3
        }
        self.reverse_field_mappings = {v: k for k, v in self.field_mappings.items()}
        # Pre-calculate token overhead for the static part of the prompt
        self.base_prompt_tokens = self._calculate_base_prompt_tokens()
        logger.info(f"Base prompt token overhead: {self.base_prompt_tokens} tokens")
        logger.info(f"Initialized with {len(self.api_keys)} API keys for parallel processing")

    def _calculate_base_prompt_tokens(self) -> int:
        """
        Calculates the token count of the static part of the prompt using tiktoken.
        This helps in determining how many abstract tokens can fit into a batch.
        """
        # Generate a prompt with an empty abstracts_batch to get the base prompt size
        # This includes all instructions and label mappings but no abstract content.
        dummy_abstracts_batch = {}
        # Dummy data for prompt creation to ensure all static parts are counted
        study_types = ["Type A"]
        poverty_contexts = ["Context X"]
        mechanisms = ["Mechanism M"]
        behaviors = ["Behavior P"]

        base_prompt = self._create_prompt(
            dummy_abstracts_batch, study_types, poverty_contexts, mechanisms, behaviors
        )
        return len(self.tokenizer.encode(base_prompt))

    def _count_tokens(self, text: str) -> int:
        """
        Counts tokens for a given text using tiktoken.
        """
        return len(self.tokenizer.encode(text))

    def _estimate_output_tokens(self, num_abstracts: int) -> int:
        """
        Estimates the output token count for a given number of abstracts.
        Based on the optimized JSON structure with numeric keys and DOI mappings.
        """
        # Optimized structure: {"0": {"0": [999], "1": [999], "2": [999], "3": [999]}}
        # Numeric DOI (1-3 digits) + numeric field keys (1 digit) + values (1-3 digits each)
        # Plus JSON formatting characters
        tokens_per_abstract = 20
        # Array wrapper tokens
        array_overhead = 10
        return (num_abstracts * tokens_per_abstract) + array_overhead

    def _create_label_mappings(self, study_types: list, poverty_contexts: list, mechanisms: list, behaviors: list):
        """
        Creates numeric mappings for all label categories.
        These mappings are used in the prompt and for decoding results.
        """
        self.label_mappings = {
            'study_type': {label: i for i, label in enumerate(study_types)},
            'poverty_context': {label: i for i, label in enumerate(poverty_contexts)},
            'mechanism': {label: i for i, label in enumerate(mechanisms)},
            'behavior': {label: i for i, label in enumerate(behaviors)}
        }
        # Add a special code for "Insufficient info" for all categories
        insufficient_code = 999
        for category in self.label_mappings:
            self.label_mappings[category]["Insufficient info"] = insufficient_code
        # Create reverse mappings for decoding numeric codes back to text labels
        self.reverse_mappings = {
            category: {num: label for label, num in mapping.items()}
            for category, mapping in self.label_mappings.items()
        }

    def _create_doi_mapping(self, abstracts_batch: dict) -> tuple[dict, dict]:
        """
        Creates numeric mapping for DOIs in current batch to save tokens.
        """
        doi_to_num = {doi: i for i, doi in enumerate(abstracts_batch.keys())}
        num_to_doi = {i: doi for doi, i in doi_to_num.items()}
        return doi_to_num, num_to_doi
    
    def _get_available_api_keys(self) -> list:
        """
        Returns list of API keys not currently in cooldown.
        """
        with self.cooldown_lock:
            current_time = time.time()
            return [key for key in self.api_keys if current_time >= self.api_key_cooldowns[key]]

    def _generate(self, prompt: str, api_key: str, attempt_type: str = 'initial') -> str:
        """
        Generates content using a specific API key with comprehensive retry logic.
        Uses model and parameters based on `attempt_type` ('initial' or 'retry').
        """
        cache = self.caches[api_key]
        if prompt in cache:
            return cache[prompt]

        # Determine which model and parameters to use based on attempt_type
        model_name = self.models[api_key][attempt_type]
        current_delay = self.delay if attempt_type == 'initial' else self.retry_delay
        current_temperature = self.temperature if attempt_type == 'initial' else self.retry_temperature
        current_max_retries = self.max_retries

        # Use lock to ensure rate limiting per API key AND fix race condition
        with self.locks[api_key]:
            # Check cooldown INSIDE the API key lock to prevent race conditions
            with self.cooldown_lock:
                current_time = time.time()
                if current_time < self.api_key_cooldowns[api_key]:
                    cooldown_remaining = self.api_key_cooldowns[api_key] - current_time
                    logger.info(f"API key {api_key[:6]}... is in cooldown. Waiting for {cooldown_remaining:.2f} seconds.")
                    time.sleep(cooldown_remaining)

            # Configure API key for this specific request (fixes the global configuration issue)
            genai.configure(api_key=api_key)
            
            # Create model instance with the correct configuration
            model_instance = genai.GenerativeModel(model_name)
            
            for attempt in range(current_max_retries):
                try:
                    time.sleep(current_delay)  # Rate limit management
                    generation_config = {
                        "temperature": current_temperature,
                    }
                    response = model_instance.generate_content(
                        prompt,
                        generation_config=generation_config
                    )
                    output = response.text
                    cache[prompt] = output
                    return output
                    
                except Exception as e:
                    # More robust error handling for different types of rate limiting
                    is_rate_limited = False
                    retry_seconds = 10  # Default fallback
                    
                    # Check for rate limiting in various ways the API might indicate it
                    if hasattr(e, 'status_code') and e.status_code == 429:
                        is_rate_limited = True
                    elif hasattr(e, 'retry_delay'):
                        is_rate_limited = True
                        if hasattr(e.retry_delay, 'seconds'):
                            retry_seconds = e.retry_delay.seconds
                        else:
                            retry_seconds = e.retry_delay
                    elif 'rate limit' in str(e).lower() or 'quota' in str(e).lower():
                        is_rate_limited = True
                    
                    if is_rate_limited:
                        logger.warning(f"Rate limited on attempt {attempt + 1} ({attempt_type}) with API key {api_key[:6]}... Waiting {retry_seconds} seconds...")
                        with self.cooldown_lock:
                            self.api_key_cooldowns[api_key] = time.time() + retry_seconds
                        time.sleep(retry_seconds)
                    else:
                        logger.error(f"Error on attempt {attempt + 1} ({attempt_type}) with API key {api_key[:6]}...: {e}")
                    
                    if attempt == current_max_retries - 1:
                        raise e  # Re-raise if all retries are exhausted
                
    def _create_prompt(self, abstracts_batch: dict, study_types: list, poverty_contexts: list, mechanisms: list, behaviors: list) -> str:
        """
        Creates a prompt for the Gemini model to process a batch of abstracts,
        including instructions and numeric mappings for both DOIs and field keys.
        """
        doi_to_num, num_to_doi = self._create_doi_mapping(abstracts_batch)

        formatted_abstracts = ""
        for doi, abstract in abstracts_batch.items():
            numeric_id = doi_to_num[doi]
            formatted_abstracts += f"ID: {numeric_id}\nAbstract: {abstract}\n\n"

        study_type_mapping = "\n".join([f"{i}: {label}" for i, label in enumerate(study_types)])
        poverty_context_mapping = "\n".join([f"{i}: {label}" for i, label in enumerate(poverty_contexts)])
        mechanism_mapping = "\n".join([f"{i}: {label}" for i, label in enumerate(mechanisms)])
        behavior_mapping = "\n".join([f"{i}: {label}" for i, label in enumerate(behaviors)])

        prompt = f"""
You will analyze {len(abstracts_batch)} research abstracts. For each abstract, assign numeric codes in the following four categories:

**Field Key Mappings:**
0: Study Type (can assign multiple numbers if applicable)
1: Poverty Context (can assign multiple numbers if applicable)
2: Mechanism (can assign multiple numbers if applicable)
3: Behavior (can assign multiple numbers if applicable)

**Critical Instructions:**
- You must assign **at least one** numeric code for **each** of the four fields (0–3).
- Use ONLY the numeric codes provided below for all categories.
- Assign multiple codes if the abstract supports more than one. Format as: `[1, 3, 5]`. A single label may be written as `2` or `[2]`.
- Do **not** leave any field blank. Return a full set of labels for every abstract.

**On Using '999' (Insufficient Info):**
- Use code `999` **only if no reasonable inference is possible** after full consideration of the abstract and plausible background knowledge.
- You are expected to **infer beyond the literal abstract** where appropriate, based on:
    - Field knowledge
    - The study setting or goals
    - Known practices in similar research

**Labeling Guidance by Category:**
- **Study Type:** Infer from references to interventions, design, or data (e.g., control group implies experiment).
- **Poverty Context:** Infer from setting, economic indicators, demographic focus, or target population.
- **Mechanisms:** Identify cognitive, psychological, or behavioral processes mentioned or implied.
- **Behavior:** Identify actions, decisions, or patterns studied or influenced.

**Numeric Label Codes:**

**Study Type Codes (Field 0):** {study_type_mapping}
999: Insufficient info

**Poverty Context Codes (Field 1):** {poverty_context_mapping}
999: Insufficient info

**Mechanism Codes (Field 2):** {mechanism_mapping}
999: Insufficient info

**Behavior Codes (Field 3):** {behavior_mapping}
999: Insufficient info

**Input Abstracts:** {formatted_abstracts}

**IMPORTANT:** Return a valid JSON array. Each entry must include all four fields (0, 1, 2, 3), with one or more numeric codes per field.

**Output Format Example:**
[
{{"0": [0, 2], "1": [1], "2": [2, 4], "3": [3]}},
{{"0": [1], "1": [0], "2": [5], "3": [2]}}
]
"""
        return prompt

    def _parse_json_response(self, json_str_to_parse: str, api_key_snippet: str, doi_mapping: dict) -> dict:
        """
        Robustly parses a JSON string response from the Gemini model, handling the array format
        and converting numeric DOI/field keys back to original format.
        Uses simplified JSON extraction approach.
        """
        numeric_results = {}
        parsed_json = None
        try:
            # Try direct JSON parsing first
            parsed_json = json.loads(json_str_to_parse)
            logger.info("Successfully parsed JSON directly.")
        except json.JSONDecodeError as e:
            # Use the simpler extraction method from the second version
            start_idx = json_str_to_parse.find('{')
            if json_str_to_parse.find('[') >= 0 and (start_idx == -1 or json_str_to_parse.find('[') < start_idx):
                # Array format detected
                start_idx = json_str_to_parse.find('[')
                end_idx = json_str_to_parse.rfind(']') + 1
            else:
                # Object format
                end_idx = json_str_to_parse.rfind('}') + 1
            if start_idx >= 0 and end_idx > start_idx:
                json_str_extracted = json_str_to_parse[start_idx:end_idx]
                try:
                    parsed_json = json.loads(json_str_extracted)
                    logger.info("Successfully parsed JSON after extraction.")
                except json.JSONDecodeError as e_extract:
                    raise ValueError(f"Could not extract valid JSON from response for API key {api_key_snippet}: {e_extract}")
            else:
                raise ValueError(f"Could not find valid JSON object in response for API key {api_key_snippet}")

        # Convert array format to flat dictionary or handle nested format
        if isinstance(parsed_json, list):
            # Handle array format where each item corresponds to a DOI by index
            for idx, fields in enumerate(parsed_json):
                if isinstance(fields, dict):
                    numeric_results[str(idx)] = fields
        elif isinstance(parsed_json, dict):
            # Handle nested format directly
            numeric_results = parsed_json
        else:
            raise ValueError(f"Unexpected JSON structure: {type(parsed_json)}")

        # Convert numeric DOI and field keys back to original format
        converted_results = {}
        for numeric_doi, fields in numeric_results.items():
            # Convert numeric DOI back to original DOI
            try:
                numeric_doi_int = int(numeric_doi)
                original_doi = doi_mapping[numeric_doi_int]
            except (ValueError, KeyError):
                logger.warning(f"Could not map numeric DOI {numeric_doi} back to original DOI")
                continue

            # Convert numeric field keys back to original field names
            converted_fields = {}
            for numeric_field, values in fields.items():
                try:
                    numeric_field_int = int(numeric_field)
                    original_field = self.reverse_field_mappings[numeric_field_int]
                    converted_fields[original_field] = values
                except (ValueError, KeyError):
                    logger.warning(f"Could not map numeric field {numeric_field} back to original field name")
                    continue
            converted_results[original_doi] = converted_fields
        return converted_results

    def _decode_numeric_labels(self, numeric_results: dict) -> tuple[dict, dict]:
        """
        Converts numeric codes in the classification results back to text labels.
        Handles both single numeric values and lists of numeric values.
        Returns decoded results and a dictionary of papers with invalid codes.
        """
        decoded_results = {}
        invalid_papers = {} # Track papers with invalid codes
        for doi, labels in numeric_results.items():
            decoded_labels = {}
            has_invalid_code = False
            # Decode each category
            for category in ['study_type', 'poverty_context', 'mechanism', 'behavior']:
                if category in labels:
                    numeric_codes = labels[category]
                    # Ensure numeric_codes is iterable (handle single int vs list)
                    code_list = [numeric_codes] if not isinstance(numeric_codes, list) else numeric_codes
                    # Decode each numeric code to text
                    decoded_texts = []
                    for code in code_list:
                        if code in self.reverse_mappings[category]:
                            decoded_texts.append(self.reverse_mappings[category][code])
                        else:
                            logger.warning(f"Invalid numeric code {code} for category {category} in DOI {doi}")
                            has_invalid_code = True
                            break # Break from inner loop on first invalid code
                    if has_invalid_code:
                        break # Break from outer loop if an invalid code was found
                    # Join multiple labels with comma or return single label
                    decoded_labels[category] = ", ".join(decoded_texts) if len(decoded_texts) > 1 else (decoded_texts[0] if decoded_texts else "Insufficient info")
                else:
                    # If category is missing in the model's response, mark as insufficient
                    decoded_labels[category] = "Insufficient info"

            if has_invalid_code:
                invalid_papers[doi] = labels # Store original numeric labels for failed papers
            else:
                decoded_results[doi] = decoded_labels
        return decoded_results, invalid_papers

    def _process_batch_with_key(self, batch_data: tuple, attempt_type: str = 'initial') -> tuple[dict, dict]:
        """
        Processes a single batch of abstracts using an assigned API key.
        Handles API calls, JSON parsing, and decoding of numeric labels.
        Returns successfully decoded results and any abstracts that failed.
        `attempt_type` specifies if it's an 'initial' call or a 'retry'.
        """
        batch, api_key, study_types, poverty_contexts, mechanisms, behaviors = batch_data
        try:
            # Create DOI mapping for this batch
            doi_to_num, num_to_doi = self._create_doi_mapping(batch)
            # Generate classification response from the model
            batch_raw_response = self._generate(
                self._create_prompt(batch, study_types, poverty_contexts, mechanisms, behaviors),
                api_key,
                attempt_type=attempt_type # Pass the attempt type
            )
            logger.info(f"Successfully classified batch of {len(batch)} abstracts using API key: {api_key[:6]}... ({attempt_type} attempt)")
            # Robustly parse the JSON response with DOI mapping
            numeric_results = self._parse_json_response(batch_raw_response, api_key + '...', num_to_doi)
            # Decode numeric codes back to text labels
            decoded_results, invalid_papers = self._decode_numeric_labels(numeric_results)
            # Determine which papers from the current batch successfully processed
            failed_abstracts = {doi: batch[doi] for doi in batch if doi not in decoded_results}
            return decoded_results, failed_abstracts
        except Exception as e:
            logger.error(f"Error processing batch with API key {api_key[:6]}... ({attempt_type} attempt): {e}")
            return {}, batch # If any error, assume all papers in batch failed

    def _create_abstract_batches(self, all_abstracts: dict, categories: tuple, max_input_tokens: int, max_output_tokens: int, max_abstracts_per_batch: int = None) -> list:
        """
        Generates batches of abstracts, ensuring each batch respects both input and output token limits.
        Assigns API keys in a round-robin fashion to each batch.
        Now accounts for optimized token usage with numeric DOIs and field keys.

        Args:
            all_abstracts: Dictionary of all abstracts {doi: abstract_text}.
            categories: Tuple of (study_types, poverty_contexts, mechanisms, behaviors).
            max_input_tokens: The maximum input token count allowed per batch.
            max_output_tokens: The maximum output token count allowed per batch.
            max_abstracts_per_batch: Optional, maximum number of abstracts per batch.

        Returns:
            A list of tuples, where each tuple is (abstracts_batch_dict, api_key, *categories).
        """
        study_types, poverty_contexts, mechanisms, behaviors = categories
        batches = []
        current_batch_abstracts = {}
        current_batch_input_tokens = self.base_prompt_tokens # Start with the prompt's fixed overhead
        abstracts_items = list(all_abstracts.items())

        for doi, abstract in abstracts_items:
            # For token counting, use optimized format (numeric ID instead of full DOI)
            numeric_id = len(current_batch_abstracts) # Approximate numeric ID
            abstract_content_for_prompt = f"ID: {numeric_id}\nAbstract: {abstract}\n\n"
            abstract_tokens = self._count_tokens(abstract_content_for_prompt)

            # Estimate output tokens for current batch size + 1
            estimated_output_tokens = self._estimate_output_tokens(len(current_batch_abstracts) + 1)

            # Calculate new input tokens
            new_input_tokens = current_batch_input_tokens + abstract_tokens

            # Check if adding this abstract exceeds either input or output token limits, or abstract count limit
            if (new_input_tokens > max_input_tokens or
                estimated_output_tokens > max_output_tokens or
                (self.max_abstracts_per_batch and len(current_batch_abstracts) >= self.max_abstracts_per_batch)):
                if current_batch_abstracts: # Only add if the batch is not empty
                    api_key = self.api_keys[len(batches) % len(self.api_keys)] # Round-robin API key
                    batches.append((current_batch_abstracts, api_key, *categories))
                # Start a new batch with the current abstract
                current_batch_abstracts = {doi: abstract}
                current_batch_input_tokens = self.base_prompt_tokens + abstract_tokens
            else:
                # Add abstract to the current batch
                current_batch_abstracts[doi] = abstract
                current_batch_input_tokens = new_input_tokens

        # Add any remaining abstracts in the last batch
        if current_batch_abstracts:
            available_keys = self._get_available_api_keys()
            if not available_keys:
                available_keys = self.api_keys # Fallback to all keys if none available
            api_key = available_keys[len(batches) % len(available_keys)]
            batches.append((current_batch_abstracts, api_key, *categories))

        return batches

    def _retry_failed_papers(self, failed_abstracts: dict, categories: tuple, max_retry_attempts: int = 3) -> dict:
        """
        Retries processing abstracts that failed initial classification.
        Uses retry-specific model, token limits, and delay.
        """
        successfully_processed_retries = {}
        retries_remaining = max_retry_attempts

        while failed_abstracts and retries_remaining > 0:
            retries_remaining -= 1
            logger.info(f"Retry attempt for {len(failed_abstracts)} papers. {retries_remaining} attempts remaining.")

            # Create retry batches using retry-specific configurations
            retry_batches = self._create_abstract_batches(
                failed_abstracts, categories, self.retry_max_input_tokens, self.retry_max_output_tokens,
                max_abstracts_per_batch= self.retry_model_max_abstracts_per_batch 
            )
            logger.info(f"Created {len(retry_batches)} retry batches using retry configurations.")

            with ThreadPoolExecutor(max_workers=len(self.api_keys)) as executor:
                future_to_batch = {executor.submit(self._process_batch_with_key, batch_data, 'retry'): batch_data[0]
                                   for batch_data in retry_batches}

                for future in as_completed(future_to_batch):
                    try:
                        decoded_results, batch_failed_on_retry = future.result()
                        successfully_processed_retries.update(decoded_results)
                        # Remove successfully processed papers from the `failed_abstracts` list
                        for doi in decoded_results:
                            failed_abstracts.pop(doi, None)
                    except Exception as e:
                        logger.error(f"Error in retry batch: {e}")

            if failed_abstracts and retries_remaining > 0:
                # Exponential backoff between retry rounds
                wait_time = self.retry_delay * (max_retry_attempts - retries_remaining)
                logger.info(f"Waiting for {wait_time:.2f} seconds before next retry round.")
                time.sleep(wait_time)

        return successfully_processed_retries

    def _validate_and_fix_labels(self, labels_dict: dict, doi: str) -> dict:
        """
        Validates that all required classification fields are present for a given DOI
        and fills in "Insufficient info" for any missing fields.
        """
        required_fields = ['study_type', 'poverty_context', 'mechanism', 'behavior']
        if doi not in labels_dict:
            logger.warning(f"DOI {doi} not found in classification results, providing default 'Insufficient info' for all fields.")
            return {field: "Insufficient info" for field in required_fields}
        result = labels_dict[doi]
        # Check if all required fields are present and fix missing ones
        for field in required_fields:
            if field not in result or result[field] is None or (isinstance(result[field], str) and result[field].strip() == ""):
                logger.warning(f"Missing or empty field '{field}' for DOI {doi}, setting to 'Insufficient info'")
                result[field] = "Insufficient info"
        return result

    def classify(self, papers: list, study_types: list, poverty_contexts: list, mechanisms: list, behaviors: list, max_papers_to_classify: int = None) -> list:
        """
        Classifies a list of research abstracts using parallel processing with multiple API keys,
        batching based on token limits. Includes retry mechanism for failed classifications.
        Uses numeric optimizations for DOIs and field keys to save tokens.

        Args:
            papers: List of paper dictionaries, each containing 'doi' and 'abstract'.
            study_types: List of study type definitions.
            poverty_contexts: List of poverty context definitions.
            mechanisms: List of mechanism definitions.
            behaviors: List of behavior definitions.
            max_papers_to_classify: Optional. If provided, limits the classification to this many papers.
                                    The papers will be taken from the beginning of the `papers` list.
            max_abstracts_per_batch: Optional. If provided, limits the number of abstracts in each batch.

        Returns:
            List of paper dictionaries updated with classification results.
        """
        # Store categories for consistent access
        categories = (study_types, poverty_contexts, mechanisms, behaviors)

        # Create label mappings first, as they are needed for decoding
        self._create_label_mappings(*categories)

        # Filter valid abstracts from the input papers
        all_abstracts = {p['doi']: p['abstract'] for p in papers if 'doi' in p and 'abstract' in p and p['abstract']}
        if not all_abstracts:
            logger.warning("No valid abstracts found in input papers.")
            return papers

        # Apply the total classification limit if specified
        if max_papers_to_classify is not None and max_papers_to_classify > 0:
            original_abstracts_count = len(all_abstracts)
            # Create a new dictionary with only the limited number of abstracts
            limited_abstracts = {}
            for i, (doi, abstract) in enumerate(all_abstracts.items()):
                if i >= max_papers_to_classify:
                    break
                limited_abstracts[doi] = abstract
            all_abstracts = limited_abstracts
            if len(all_abstracts) < original_abstracts_count:
                logger.info(f"Limiting total classification to {len(all_abstracts)} papers out of {original_abstracts_count} available.")


        logger.info(f"Starting classification for {len(all_abstracts)} abstracts...")
        # Initial batch creation, now respecting max_abstracts_per_batch
        batches = self._create_abstract_batches(
            all_abstracts, categories, self.max_input_tokens, self.max_output_tokens, self.max_abstracts_per_batch # Use self.max_abstracts_per_batch for consistency if not overridden
        )
        logger.info(f"Created {len(batches)} initial batches for classification.")
        final_results = {}
        failed_abstracts_initial = {}

        # Process batches in parallel
        with ThreadPoolExecutor(max_workers=len(self.api_keys)) as executor:
            future_to_batch = {executor.submit(self._process_batch_with_key, batch_data, 'initial'): batch_data[0]
                               for batch_data in batches}
            for future in as_completed(future_to_batch):
                try:
                    decoded_results, batch_failed = future.result()
                    final_results.update(decoded_results)
                    failed_abstracts_initial.update(batch_failed) # Collect abstracts that failed this batch
                except Exception as e:
                    logger.error(f"Error in main batch processing: {e}")

        # Retry failed abstracts
        if failed_abstracts_initial:
            logger.info(f"Attempting to retry {len(failed_abstracts_initial)} failed abstracts.")
            successfully_retried = self._retry_failed_papers(failed_abstracts_initial, categories, max_retry_attempts=3)
            final_results.update(successfully_retried)

        # Prepare final output: update original papers with classification results
        classified_papers = []
        for p in papers:
            doi = p.get('doi')
            if doi and doi in all_abstracts: # Only process papers that were part of the initial classification set
                # Validate and fix labels for each paper
                validated_labels = self._validate_and_fix_labels(final_results, doi)
                p.update(validated_labels)
            else:
                # If a paper wasn't processed (e.g., no abstract or not in limited set),
                # ensure it still has the classification fields with "Insufficient info"
                for field in ['study_type', 'poverty_context', 'mechanism', 'behavior']:
                    if field not in p:
                        p[field] = "Insufficient info"
            classified_papers.append(p)

        return classified_papers

In [10]:
excels_dict.keys()

dict_keys([18, 19, 20, 21, 22, 23])

In [11]:
# dict_to_process = excels_dict[5].to_dict(orient='records')

In [12]:
import re
import pycountry
from typing import List, Dict, Set
import spacy
from collections import Counter

class FastCountryExtractor:
    """
    Fast country extraction using multiple methods for research abstracts.
    """
    
    def __init__(self):
        """Initialize the country extractor with various lookup methods."""
        self.country_patterns = self._build_country_patterns()
        self.city_to_country = self._build_city_mappings()
        
        # Try to load spaCy model (optional, fallback to regex if not available)
        try:
            self.nlp = spacy.load("en_core_web_sm")
            self.use_spacy = True
        except OSError:
            print("Warning: spaCy model not found. Using regex-only approach.")
            self.nlp = None
            self.use_spacy = False
    
    def _build_country_patterns(self) -> Dict[str, str]:
        """Build comprehensive country name patterns including variations."""
        patterns = {}
        
        for country in pycountry.countries:
            # Official name from pycountry
            patterns[country.name.lower()] = country.name
            
            # Add official_name if available
            if hasattr(country, 'official_name') and country.official_name:
                patterns[country.official_name.lower()] = country.name
            
            # Add alpha codes as text variations  
            patterns[country.alpha_2.lower()] = country.name
            patterns[country.alpha_3.lower()] = country.name
        
        # Manual variations that pycountry doesn't provide
        # These are common informal names found in research papers
        manual_variations = {
            'united states': ['usa', 'us', 'america', 'united states of america'],
            'united kingdom': ['uk', 'britain', 'great britain', 'england', 'scotland', 'wales', 'northern ireland'],
            'south korea': ['korea', 'republic of korea', 'south korea'],
            'north korea': ['dprk', 'democratic people\'s republic of korea'],
            'russia': ['russian federation', 'ussr', 'soviet union'],
            'iran': ['islamic republic of iran', 'persia'],
            'vietnam': ['viet nam'],
            'czech republic': ['czechia', 'czechoslovakia'],
            'democratic republic of the congo': ['drc', 'congo', 'zaire'],
            'republic of the congo': ['congo-brazzaville'],
            'ivory coast': ['côte d\'ivoire'],
            'bosnia and herzegovina': ['bosnia'],
            'trinidad and tobago': ['trinidad'],
            'antigua and barbuda': ['antigua'],
            'saint vincent and the grenadines': ['saint vincent'],
            'sao tome and principe': ['sao tome'],
            'myanmar': ['burma'],
            'netherlands': ['holland'],
            'switzerland': ['swiss confederation'],
            'vatican city': ['holy see'],
        }
        
        # Add manual variations to patterns
        for standard_name, variations in manual_variations.items():
            for variant in variations:
                patterns[variant.lower()] = self._get_standard_country_name(standard_name)
        
        return patterns
    
    def _get_standard_country_name(self, country_name_lower: str) -> str:
        """Get the standard pycountry name for a country."""
        # Try to find the country in pycountry
        for country in pycountry.countries:
            if country.name.lower() == country_name_lower:
                return country.name
        
        # Fallback to title case if not found
        return country_name_lower.title()
    
    def _build_city_mappings(self) -> Dict[str, str]:
        """Build mappings from major cities to countries."""
        # This is a simplified version - in practice, you'd want a comprehensive database
        city_mappings = {
            # Major cities that frequently appear in research
            'new york': 'United States',
            'los angeles': 'United States',
            'chicago': 'United States',
            'boston': 'United States',
            'washington': 'United States',
            'london': 'United Kingdom',
            'manchester': 'United Kingdom',
            'birmingham': 'United Kingdom',
            'paris': 'France',
            'lyon': 'France',
            'marseille': 'France',
            'berlin': 'Germany',
            'munich': 'Germany',
            'hamburg': 'Germany',
            'tokyo': 'Japan',
            'osaka': 'Japan',
            'kyoto': 'Japan',
            'beijing': 'China',
            'shanghai': 'China',
            'guangzhou': 'China',
            'delhi': 'India',
            'mumbai': 'India',
            'bangalore': 'India',
            'kolkata': 'India',
            'toronto': 'Canada',
            'vancouver': 'Canada',
            'montreal': 'Canada',
            'sydney': 'Australia',
            'melbourne': 'Australia',
            'brisbane': 'Australia',
            'seoul': 'South Korea',
            'moscow': 'Russia',
            'st petersburg': 'Russia',
            'cairo': 'Egypt',
            'nairobi': 'Kenya',
            'lagos': 'Nigeria',
            'cape town': 'South Africa',
            'johannesburg': 'South Africa',
            'sao paulo': 'Brazil',
            'rio de janeiro': 'Brazil',
            'mexico city': 'Mexico',
            'buenos aires': 'Argentina',
        }
        return city_mappings
    
    def extract_countries_regex(self, text: str) -> List[str]:
        """Extract countries using regex pattern matching."""
        text_lower = text.lower()
        found_countries = set()
        
        # Look for country patterns
        for pattern, country_name in self.country_patterns.items():
            if re.search(r'\b' + re.escape(pattern) + r'\b', text_lower):
                found_countries.add(country_name)
        
        # Look for city patterns
        for city, country in self.city_to_country.items():
            if re.search(r'\b' + re.escape(city) + r'\b', text_lower):
                found_countries.add(country)
        
        return list(found_countries)
    
    def extract_countries_spacy(self, text: str) -> List[str]:
        """Extract countries using spaCy NER."""
        if not self.use_spacy:
            return []
        
        doc = self.nlp(text)
        found_countries = set()
        
        for ent in doc.ents:
            if ent.label_ in ['GPE', 'LOC']:  # Geopolitical entity or location
                entity_text = ent.text.lower()
                if entity_text in self.country_patterns:
                    found_countries.add(self.country_patterns[entity_text])
                elif entity_text in self.city_to_country:
                    found_countries.add(self.city_to_country[entity_text])
        
        return list(found_countries)
    
    def extract_countries_combined(self, text: str) -> List[str]:
        """Combine regex and spaCy approaches for best results."""
        countries_regex = self.extract_countries_regex(text)
        countries_spacy = self.extract_countries_spacy(text) if self.use_spacy else []
        
        # Combine and deduplicate
        all_countries = list(set(countries_regex + countries_spacy))
        
        return all_countries
    
    def extract_from_abstract(self, abstract: str, method: str = 'combined') -> str:
        """
        Extract countries from research abstract and return as comma-separated string.
        
        Args:
            abstract: The research abstract text
            method: 'regex', 'spacy', or 'combined'
        
        Returns:
            Comma-separated country names or "Insufficient info"
        """
        if method == 'regex':
            countries = self.extract_countries_regex(abstract)
        elif method == 'spacy':
            countries = self.extract_countries_spacy(abstract)
        else:
            countries = self.extract_countries_combined(abstract)
        
        if countries:
            return ', '.join(sorted(countries))
        else:
            return "Insufficient info"


In [1]:
def create_country_extractor_function(method='combined'):
    """
    Creates a function optimized for pandas .apply() operations.
    
    Args:
        method: 'regex', 'spacy', or 'combined'
    
    Returns:
        Function that can be used with df['column'].apply()
    """
    # Initialize extractor once (important for performance!)
    extractor = FastCountryExtractor()
    
    def extract_countries_from_text(text):
        """Function optimized for pandas apply."""
        if pd.isna(text) or not isinstance(text, str) or not text.strip():
            return "Insufficient info"
        
        return extractor.extract_from_abstract(text, method=method)
    
    return extract_countries_from_text

# country_name_extractor = FastCountryExtractor()
# dict_to_process_df_testing = pd.DataFrame(dict_to_process)
# country_extractor_func = create_country_extractor_function(method='combined')
# dict_to_process_df_testing['country_of_study'] = dict_to_process_df_testing['abstract'].apply(country_extractor_func)


In [14]:
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Fetch API key from environment variables
api_key = os.getenv("GEMINI_API_KEY")

# Load Google Sheets credentials from environment variables
type= os.getenv("type")
project_id = os.getenv("project_id")
private_key_id = os.getenv("private_key_id") 
private_key = os.getenv("private_key").replace("\\n", "\n")  
client_email = os.getenv("client_email")
client_id = os.getenv("client_id")
auth_uri = os.getenv("auth_uri")
token_uri = os.getenv("token_uri")
auth_provider_x509_cert_url = os.getenv("auth_provider_x509_cert_url")
client_x509_cert_url= os.getenv("client_x509_cert_url")
universe_domain= os.getenv("universe_domain")
credenetials = {
    "type": type,
    "project_id": project_id,
    "private_key_id": private_key_id,
    "private_key": private_key,
    "client_email": client_email,
    "client_id": client_id,
    "auth_uri": auth_uri,
    "token_uri": token_uri,
    "auth_provider_x509_cert_url": auth_provider_x509_cert_url,
    "client_x509_cert_url": client_x509_cert_url,
    "universe_domain": universe_domain
}
# Load Google Sheets IDs from environment variables
papers_spreadsheet_id = os.getenv("papers_spreadsheet_id")
density = os.getenv("density")
density_X = os.getenv("X")
density_Y = os.getenv("Y")
topics_spreadsheet_id = os.getenv("topics_spreadsheet_id")
spreadsheet_id =  {
    "papers": papers_spreadsheet_id,
    "density": density,
    "density_X": density_X,
    "density_Y": density_Y,
    "topics": topics_spreadsheet_id
}

In [15]:
from data import googleSheets
import importlib

importlib.reload(googleSheets)

google_sheets = googleSheets.API(credentials_json=credenetials)

# # Upload the results DataFrame to Google Sheets
# google_sheets.replace(
#     results_df, 
#     spreadsheet_id=spreadsheet_id['papers'], 
#     sheet_name='Sheet1'
#     )

2025-06-20 00:11:57,413 - INFO - Initializing Google Sheets API
2025-06-20 00:11:57,414 - INFO - Building Google Sheets service
2025-06-20 00:11:57,473 - INFO - file_cache is only supported with oauth2client<4.0.0
2025-06-20 00:11:57,477 - INFO - Google Sheets service built successfully
2025-06-20 00:11:57,477 - INFO - API initialized with batch size: 1000


In [16]:
# https://docs.google.com/spreadsheets/d/1EMG3n3Ywt1X4Y-Z5eQCxyrZNA9Yzx00ALUe2HjsdeAQ/edit?gid=0#gid=0


In [17]:
import pandas as pd

classifier = GeminiModel(
    api_keys=[
'AIzaSyA3nUJg48V7qtKShrBmoXCZxe_kkJq0boA',
'AIzaSyDpLNLD65KEkp8-jXLpemwCidvKx5Pq7OE',
'AIzaSyD8NGIMPELnLoBw1VxdvzuM7_6M9WA6_00',
              ])


df = pd.read_csv('data/extractedPapers2024.csv')
df.dropna(subset=['abstract', 'doi'], inplace=True)
df_dict = df.groupby('year').apply(lambda x: x[['doi', 'abstract']]).to_dict(orient='index')
classified = classifier.classify(
    df_dict,
    study_types=study_types,
    poverty_contexts=poverty_context,
    mechanisms=mechanisms,
    behaviors=behaviors ) 
classified_df = pd.DataFrame(classified)
country_extractor_func = create_country_extractor_function(method='spacy')
classified_df['country_of_study'] = classified_df['abstract'].apply(country_extractor_func)
classified_df.to_csv('data/classified_results_2024.csv', index=False)



# classified_results = []

# for key, value in excels_dict.items():

#     if key <= 22:
#         continue
#     print(f"Processing ex el year: {key}")
#     value_df = value
#     # drop missing values in 'abstract' and 'doi' columns
#     value_df = value_df.dropna(subset=['abstract', 'doi'])
#     value_dict = value_df.to_dict(orient='records')
#     classified_results = classifier.classify(
#         value_dict,
#         study_types=study_types,
#         poverty_contexts=poverty_context,
#         mechanisms=mechanisms,
#         behaviors=behaviors
#                 )
#     classified_results_df = pd.DataFrame(classified_results)
#     country_extractor_func = create_country_extractor_function(method='spacy')
#     classified_results_df['country_of_study'] = classified_results_df['abstract'].apply(country_extractor_func)
#     # Upload the results DataFrame to Google Sheets
#     google_sheets.append(
#         classified_results_df, 
#         spreadsheet_id='1EMG3n3Ywt1X4Y-Z5eQCxyrZNA9Yzx00ALUe2HjsdeAQ', 
#     )

#     classified_results.append(classified_results_df)
#     classified_results_df.to_csv(f'data/classified_results_{key}.csv', index=False)

2025-06-20 00:11:58,829 - INFO - Base prompt token overhead: 538 tokens
2025-06-20 00:11:58,831 - INFO - Initialized with 3 API keys for parallel processing
2025-06-20 00:11:58,969 - INFO - Starting classification for 10131 abstracts...


Processing ex el year: 23


2025-06-20 00:12:01,448 - INFO - Created 2481 initial batches for classification.
2025-06-20 00:12:08,390 - INFO - Successfully classified batch of 4 abstracts using API key: AIzaSy... (initial attempt)
2025-06-20 00:12:08,394 - INFO - Successfully parsed JSON after extraction.
2025-06-20 00:12:08,932 - INFO - Successfully classified batch of 5 abstracts using API key: AIzaSy... (initial attempt)
2025-06-20 00:12:08,933 - INFO - Successfully parsed JSON after extraction.
2025-06-20 00:12:09,521 - INFO - Successfully classified batch of 5 abstracts using API key: AIzaSy... (initial attempt)
2025-06-20 00:12:09,532 - INFO - Successfully parsed JSON after extraction.
2025-06-20 00:12:15,424 - INFO - Successfully classified batch of 3 abstracts using API key: AIzaSy... (initial attempt)
2025-06-20 00:12:15,425 - INFO - Successfully classified batch of 3 abstracts using API key: AIzaSy... (initial attempt)
2025-06-20 00:12:15,426 - INFO - Successfully parsed JSON after extraction.
2025-06-2

In [17]:
classified_results_df = pd.concat(classified_results, ignore_index=True)

TypeError: cannot concatenate object of type '<class 'dict'>'; only Series and DataFrame objs are valid

2025-06-13 08:42:59,639 - ERROR - Error processing batch with API key AIzaSyAc...: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
}
]
2025-06-13 08:42:59,640 - ERROR - Error processing batch with API key AIzaSyB1...: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
}
]
2025-06-13 08:42:59,684 - ERROR - Error processing batch with API key AIzaSyD4...: 429 You exceeded your current quota, please check your plan and billing details

In [17]:
classified_results_df = [pd.DataFrame([d]) for d in classified_results if isinstance(d, dict)]


NameError: name 'classified_results' is not defined

In [23]:
classified_results_df = pd.concat(classified_results_df, ignore_index=True)

2025-06-13 08:48:59,378 - ERROR - Error processing batch with API key AIzaSyAc...: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
}
]
2025-06-13 08:48:59,464 - ERROR - Error processing batch with API key AIzaSyD4...: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
}
]
2025-06-13 08:48:59,483 - ERROR - Error processing batch with API key AIzaSyB1...: 429 You exceeded your current quota, please check your plan and billing details

In [29]:
google_sheets.append(
    classified_results_df, 
    spreadsheet_id=spreadsheet_id['papers'], 
    sheet_name='Sheet1'
)

2025-06-13 08:50:26,545 - INFO - Starting append operation - Spreadsheet: 1nrZC6zJ50DouMCHIWZOl-tQ4BAZfEojJymtzUh26nP0, Sheet: Sheet1
2025-06-13 08:50:26,547 - INFO - Preparing data - DataFrame shape: (12573, 23), include_headers: False
2025-06-13 08:50:26,548 - INFO - Starting DataFrame validation and cleaning - Shape: (12573, 23)
2025-06-13 08:50:26,642 - INFO - Found 12580 NaN values across 3 columns
2025-06-13 08:50:26,643 - INFO - DataFrame validation completed
2025-06-13 08:50:26,643 - INFO - Applying comprehensive data sanitization...
2025-06-13 08:50:28,007 - INFO - Performing final JSON serialization validation...
2025-06-13 08:50:28,008 - INFO - JSON serialization validation passed
2025-06-13 08:50:28,008 - INFO - Data preparation completed - Total rows: 12573
2025-06-13 08:50:28,009 - INFO - Total rows to append: 12573
2025-06-13 08:50:28,009 - INFO - Processing batch 1 - Rows 0 to 1000
2025-06-13 08:50:30,795 - ERROR - Error processing batch with API key AIzaSyAc...: 429 Yo

2025-06-13 08:50:36,874 - ERROR - Error processing batch with API key AIzaSyAc...: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 23
}
]
2025-06-13 08:50:36,926 - ERROR - Error processing batch with API key AIzaSyD4...: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 23
}
]
--- Logging error ---
Traceback (most recent call last):
  File "/var/folders/gr/mhlc8kfs39b7v4v54d1fd4bc0000gn/T/ipykernel_10173/3038

In [None]:
classified_results_df['fields'].value_counts()

2025-06-13 08:50:06,447 - ERROR - Error processing batch with API key AIzaSyD4...: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 53
}
]
2025-06-13 08:50:06,447 - ERROR - Error processing batch with API key AIzaSyB1...: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 53
}
]
--- Logging error ---
Traceback (most recent call last):
  File "/var/folders/gr/mhlc8kfs39b7v4v54d1fd4bc0000gn/T/ipykernel_10173/3038

KeyError: 'fields'

2025-06-13 08:50:12,530 - ERROR - Error processing batch with API key AIzaSyAc...: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 47
}
]
2025-06-13 08:50:12,531 - ERROR - Error processing batch with API key AIzaSyD4...: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 47
}
]
--- Logging error ---
Traceback (most recent call last):
  File "/var/folders/gr/mhlc8kfs39b7v4v54d1fd4bc0000gn/T/ipykernel_10173/3038

In [20]:
classifier = GeminiModel(
    api_keys=['AIzaSyBAP-xpwB-umiazKw895-zR9v8l_PULlgk' , 'AIzaSyBtSUqDqAkge2dOcCQ07CmmbWiB2IoOgL0', 'AIzaSyAfPuHj1ZwmPh7P4-Vzg2moLTGtRM1UfF8',
              'AIzaSyB6zWJJnRjcdbj_6KH09Hle4MnJ-9tF8fA', 'AIzaSyBsgOzMGft9bP2Gz-gdpBIoPEkgsFVid6s', 'AIzaSyDv8plW3CjrhQ8JTU15ay1B9rHNQvKU-EM',
              
              ])



# Example usage
results = classifier.classify(
                            dict_to_process,
                            study_types=study_types,
                            poverty_contexts=poverty_context,
                            mechanisms=mechanisms,
                            behaviors=behaviors
                        )




2025-06-12 18:27:11,293 - INFO - Base prompt token overhead: 653 tokens
2025-06-12 18:27:11,293 - INFO - Initialized with 3 API keys for parallel processing
2025-06-12 18:27:11,303 - INFO - Processing 6270 abstracts in parallel using 3 API keys with numeric optimizations
2025-06-12 18:27:12,048 - INFO - Created 161 batches for parallel processing based on token limits
2025-06-12 18:27:27,253 - INFO - Successfully classified batch of 39 abstracts using API key: AIzaSyAl...
2025-06-12 18:27:27,256 - INFO - Successfully parsed JSON directly with explicit JSON output.
2025-06-12 18:27:27,556 - INFO - Successfully classified batch of 39 abstracts using API key: AIzaSyAu...
2025-06-12 18:27:27,557 - INFO - Successfully parsed JSON directly with explicit JSON output.
2025-06-12 18:27:27,768 - INFO - Successfully classified batch of 39 abstracts using API key: AIzaSyC8...
2025-06-12 18:27:27,770 - INFO - Successfully parsed JSON directly with explicit JSON output.
2025-06-12 18:27:43,839 - INF

Problematic snippet: [
  {
    "0": [
      999
    ],
    "1": [
      999
    ],
    "2": [
      999
    ],
    "3": [
      999
    ],
    "4": [
      999
    ]
  },
  {
    "0": [
      999
    ],
    "1": [
      0
    ],
    "2": [
      999
    ],
    "3": [
      999
    ],
    "4": [
      125
    ]
  },
  {
    "0": [
      12
    ],
    "1": [
      0
    ],
    "2": [
      999
    ],
    "3": [
      12
    ],
    "4": [
      999
    ]
  },
  {
    "0": [
      999
    ],
    "1": [
      999
    ],



2025-06-12 18:32:29,336 - INFO - Successfully classified batch of 39 abstracts using API key: AIzaSyAl...
2025-06-12 18:32:29,342 - INFO - Successfully parsed JSON directly with explicit JSON output.
2025-06-12 18:32:29,950 - INFO - Successfully classified batch of 39 abstracts using API key: AIzaSyAu...
2025-06-12 18:32:29,951 - INFO - Successfully parsed JSON directly with explicit JSON output.
2025-06-12 18:32:41,727 - INFO - Successfully classified batch of 39 abstracts using API key: AIzaSyC8...
2025-06-12 18:32:41,730 - INFO - Successfully parsed JSON directly with explicit JSON output.
2025-06-12 18:32:44,081 - INFO - Successfully classified batch of 39 abstracts using API key: AIzaSyAu...
2025-06-12 18:32:44,083 - INFO - Successfully parsed JSON directly with explicit JSON output.
2025-06-12 18:32:45,002 - INFO - Successfully classified batch of 39 abstracts using API key: AIzaSyAl...
2025-06-12 18:32:45,003 - INFO - Successfully parsed JSON directly with explicit JSON output.


Problematic snippet: [
  {"0": [999], "1": [999], "2": [999], "3": [999], "4": [13]},
  {"0": [999], "1": [999], "2": [999], "3": [999], "4": [999]},
  {"0": [999], "1": [4], "2": [999], "3": [12], "4": [141]},
  {"0": [999], "1": [2], "2": [999], "3": [999], "4": [228]},
  {"0": [999], "1": [999], "2": [999], "3": [13], "4": [141]},
  {"0": [6], "1": [0, 4], "2": [14], "3": [9, 11], "4": [228]},
  {"0": [999], "1": [999], "2": [999], "3": [11], "4": [228]},
  {"0": [999], "1": [999], "2": [999], "3": [999], "4": [1


2025-06-12 18:38:16,580 - INFO - Successfully classified batch of 39 abstracts using API key: AIzaSyC8...
2025-06-12 18:38:16,583 - INFO - Successfully parsed JSON directly with explicit JSON output.
2025-06-12 18:38:18,320 - INFO - Successfully classified batch of 39 abstracts using API key: AIzaSyAu...
2025-06-12 18:38:18,322 - INFO - Successfully parsed JSON directly with explicit JSON output.
2025-06-12 18:38:24,260 - INFO - Successfully classified batch of 39 abstracts using API key: AIzaSyAl...
2025-06-12 18:38:24,262 - INFO - Successfully parsed JSON directly with explicit JSON output.
2025-06-12 18:38:31,840 - INFO - Successfully classified batch of 39 abstracts using API key: AIzaSyC8...
2025-06-12 18:38:31,842 - INFO - Successfully parsed JSON directly with explicit JSON output.
2025-06-12 18:38:39,210 - INFO - Successfully classified batch of 39 abstracts using API key: AIzaSyAu...
2025-06-12 18:38:39,211 - INFO - Successfully parsed JSON directly with explicit JSON output.


In [30]:
6270/39

160.76923076923077

In [38]:
results_df = pd.DataFrame(results)
results_df.head()

Unnamed: 0,doi,title,link,authors,keyword,publication,country,date,field,institution,...,embedding,UMAP1,UMAP2,predicted_label,confidence,study_type,poverty_context,mechanism,behavior,country_of_study
0,10.1177/10253823050120020104x,"Poverty, gender and mental health promotion in...",,['Vikram Patel'],Poverty and mental health,OpenAlex,[None],2005,Affective,[None],...,"[0.0949455052614212, -0.022747956216335297, -0...",12.368342,5.261272,Related,1.0,Insufficient info,"Low resource level, Social environment",Insufficient info,Insufficient info,Insufficient info
1,10.1001/jama.294.5.571,Mental Health of Cambodian Refugees 2 Decades ...,,['Grant N. Marshall'],Poverty and mental health,OpenAlex,['US'],2005,Affective,['RAND Corporation'],...,"[0.06730816513299942, 0.009284263476729393, -0...",14.557115,4.056287,Related,1.0,Cross-sectional,"Low resource level, Social environment","Anxiety, Stress",Insufficient info,Tuvalu
2,10.1353/hpu.2005.0038,Barriers to Health and Social Services for Str...,,"['Steven P. Kurtz', 'Hilary L. Surratt', 'Mari...",Poverty and mental health,OpenAlex,"['US', None, None, None]",2005,Affective,"['University of Delaware', None, None, None]",...,"[-0.007396651431918144, -0.06940397620201111, ...",14.353561,3.524021,Related,1.0,"Interviews, Focus groups","Low resource level, Social environment",Insufficient info,Insufficient info,Tuvalu
3,10.1136/jech.2005.039180,Maternal mental health and child nutritional s...,,['Trudy Harpham'],Poverty and mental health,OpenAlex,['GB'],2005,Affective,['London South Bank University'],...,"[0.05841507390141487, 0.030578309670090675, -0...",15.548842,4.862946,Related,1.0,Cross-sectional,Low resource level,"Depression, Insufficient info",Food diet,"Côte d'Ivoire, Guadeloupe, Uzbekistan, Pitcairn"
4,10.1111/j.0038-4941.2005.00321.x,Race/Ethnic Differences in Nonspecific Psychol...,,"['Jenifer L. Bratter', 'Karl Eschbach']",Poverty and mental health,OpenAlex,"['US', 'US']",2005,Affective,"['University of Houston', 'The University of T...",...,"[0.07612796872854233, 0.028869790956377983, -0...",14.888777,4.692533,Related,1.0,Cross-sectional,"Low resource level, Social environment",Insufficient info,Insufficient info,Tuvalu


In [43]:
results_df.drop(columns=['country_of_study'], inplace=True)

In [49]:
country_extractor_func = create_country_extractor_function(method='spacy')
results_df['country_of_study'] = results_df['abstract'].apply(country_extractor_func)


In [48]:
list(results_df[results_df['doi']== '10.1505/ifor.2005.7.4.294']['abstract'])

['Weighted goal programming (WGP) is employed to reconcile the goals of food security, improved incomes and woodland conservation in households from selected sites in Malawi, Mozambique and Zimbabwe.The three goals are attainable simultaneously, albeit with trade-offs useful in guiding development of rural development policies.The WGP model provides a framework for evaluating impact, on the household goals and woodland condition, of some macroeconomic and sectoral policies and demographic changes.']

In [21]:
len(results)

6270

2025-06-12 20:11:43,067 - INFO - Initializing Google Sheets API
2025-06-12 20:11:43,068 - INFO - Building Google Sheets service
2025-06-12 20:11:43,115 - INFO - file_cache is only supported with oauth2client<4.0.0
2025-06-12 20:11:43,123 - INFO - Google Sheets service built successfully
2025-06-12 20:11:43,124 - INFO - API initialized with batch size: 1000
2025-06-12 20:11:43,124 - INFO - Starting replace operation - Spreadsheet: 1nrZC6zJ50DouMCHIWZOl-tQ4BAZfEojJymtzUh26nP0, Sheet: Sheet1
2025-06-12 20:11:43,125 - INFO - Checking if sheet exists
2025-06-12 20:11:44,303 - INFO - Sheet 'Sheet1' exists
2025-06-12 20:11:44,305 - INFO - Clearing existing content from sheet
2025-06-12 20:11:44,913 - INFO - Sheet cleared
2025-06-12 20:11:44,916 - INFO - Preparing data - DataFrame shape: (6270, 24), include_headers: True
2025-06-12 20:11:44,916 - INFO - Starting DataFrame validation and cleaning - Shape: (6270, 24)
2025-06-12 20:11:44,995 - INFO - Found 6287 NaN values across 2 columns
2025-06

[{'spreadsheetId': '1nrZC6zJ50DouMCHIWZOl-tQ4BAZfEojJymtzUh26nP0',
  'updatedRange': 'Sheet1!A1:X1000',
  'updatedRows': 1000,
  'updatedColumns': 24,
  'updatedCells': 24000},
 {'spreadsheetId': '1nrZC6zJ50DouMCHIWZOl-tQ4BAZfEojJymtzUh26nP0',
  'updatedRange': 'Sheet1!A1001:X2000',
  'updatedRows': 1000,
  'updatedColumns': 24,
  'updatedCells': 24000},
 {'spreadsheetId': '1nrZC6zJ50DouMCHIWZOl-tQ4BAZfEojJymtzUh26nP0',
  'updatedRange': 'Sheet1!A2001:X3000',
  'updatedRows': 1000,
  'updatedColumns': 24,
  'updatedCells': 24000},
 {'spreadsheetId': '1nrZC6zJ50DouMCHIWZOl-tQ4BAZfEojJymtzUh26nP0',
  'updatedRange': 'Sheet1!A3001:X4000',
  'updatedRows': 1000,
  'updatedColumns': 24,
  'updatedCells': 24000},
 {'spreadsheetId': '1nrZC6zJ50DouMCHIWZOl-tQ4BAZfEojJymtzUh26nP0',
  'updatedRange': 'Sheet1!A4001:X5000',
  'updatedRows': 1000,
  'updatedColumns': 24,
  'updatedCells': 24000},
 {'spreadsheetId': '1nrZC6zJ50DouMCHIWZOl-tQ4BAZfEojJymtzUh26nP0',
  'updatedRange': 'Sheet1!A5001:X600

In [28]:
from featureEngineering import topicClusters

all_papers = pd.concat(excels, ignore_index=True)


all_papers['UMAP1'] = all_papers['UMAP1'].astype(float)
all_papers['UMAP2'] = all_papers['UMAP2'].astype(float)

papers_list = all_papers.to_dict(orient='records')

clusters = topicClusters.run(
    papers = papers_list,
    n_clusters= 12
)

In [32]:
results_df = pd.DataFrame(clusters[1])
results_df.head()

Unnamed: 0,doi,title,link,authors,keyword,publication,country,date,field,institution,abstract,cited_by_count,citing_works,referenced_works,embedding,UMAP1,UMAP2,predicted_label,confidence,cluster
0,10.1177/10253823050120020104x,"Poverty, gender and mental health promotion in...",,['Vikram Patel'],Poverty and mental health,OpenAlex,[None],2005,Affective,[None],which is computed on the basis of a number of ...,29,[],"['W137796145', 'W1563422447', 'W1595706829', '...","[0.0949455052614212, -0.022747956216335297, -0...",12.368342,5.261272,Related,1.0,7
1,10.1001/jama.294.5.571,Mental Health of Cambodian Refugees 2 Decades ...,,['Grant N. Marshall'],Poverty and mental health,OpenAlex,['US'],2005,Affective,['RAND Corporation'],Little is known about the long-term mental hea...,647,[],"['W1489641999', 'W1498057507', 'W1663918944', ...","[0.06730816513299942, 0.009284263476729393, -0...",14.557115,4.056287,Related,1.0,6
2,10.1353/hpu.2005.0038,Barriers to Health and Social Services for Str...,,"['Steven P. Kurtz', 'Hilary L. Surratt', 'Mari...",Poverty and mental health,OpenAlex,"['US', None, None, None]",2005,Affective,"['University of Delaware', None, None, None]","Homelessness, poverty, drug abuse and violent ...",202,[],"['W117577764', 'W1488716830', 'W1510369582', '...","[-0.007396651431918144, -0.06940397620201111, ...",14.353561,3.524021,Related,1.0,6
3,10.1136/jech.2005.039180,Maternal mental health and child nutritional s...,,['Trudy Harpham'],Poverty and mental health,OpenAlex,['GB'],2005,Affective,['London South Bank University'],<b>Objective:</b> To test the hypothesis that ...,257,[],"['W1481892733', 'W1508821355', 'W1603277768', ...","[0.05841507390141487, 0.030578309670090675, -0...",15.548842,4.862946,Related,1.0,6
4,10.1111/j.0038-4941.2005.00321.x,Race/Ethnic Differences in Nonspecific Psychol...,,"['Jenifer L. Bratter', 'Karl Eschbach']",Poverty and mental health,OpenAlex,"['US', 'US']",2005,Affective,"['University of Houston', 'The University of T...",Objective. This article documents the patterns...,177,[],"['W150846618', 'W1531151302', 'W1928999099', '...","[0.07612796872854233, 0.028869790956377983, -0...",14.888777,4.692533,Related,1.0,6


In [None]:
from data import googleSheets
import importlib

importlib.reload(googleSheets)

google_sheets = googleSheets.API(credentials_json=credenetials)

# Upload the results DataFrame to Google Sheets
google_sheets.replace(
    results_df, 
    spreadsheet_id=spreadsheet_id['papers'], 
    sheet_name='Sheet1'
    )

In [49]:
results_df = generate(topics_df, 
                      n_terms=5, 
                      min_df=10, 
                      max_df=0.99, 
                      term_column='title', 
                      cluster_column='cluster')

In [50]:
results_df

Unnamed: 0,cluster,title,abstract,umap_1_mean,umap_2_mean,label
0,0,"Mental health promotion in public health, What...",La sante mentale est un etat de bien-etre dans...,8.962639,-0.210486,church food systems
1,1,"SUMMARIES/ZUSAMMENFASSUNGEN RESUMIDAS/PRÉCIS, ...",Restricted accessAbstractFirst published June ...,-3.972352,8.249377,tobacco tobacco control
2,2,"Human Rights and Capabilities, Human security ...",The two concepts — human rights and capabiliti...,9.923976,5.80949,press statement
3,3,Estimating water demand for domestic use in ru...,The paper applies the travel cost method (TCM)...,0.856667,-1.193828,scp scp immigration
4,4,Social Vulnerabilities and Hurricane Katrina: ...,Social science research on natural disasters d...,6.291086,6.697604,medicaid care act
5,5,Behavioral Economics and Marketing in Aid of D...,This article considers several aspects of the ...,5.135398,3.230848,statement discussion
6,6,Mental Health of Cambodian Refugees 2 Decades ...,Little is known about the long-term mental hea...,14.672923,2.968043,antiretroviral american women
7,7,"Poverty, gender and mental health promotion in...",which is computed on the basis of a number of ...,13.202727,5.875512,care act child health
8,8,The Staircase to Terrorism: A Psychological Ex...,To foster a more in-depth understanding of the...,8.620198,3.316085,civil society press
9,9,"Abstracts, Key Lectures, Abstracts, Agricultur...",Restricted accessAbstractFirst published April...,3.493768,16.751293,earth prep


In [None]:
import pandas as pd 

gemini_api = GeminiModel('AIzaSyDAR99qLXKX_phG9gyUfR5SBb93vNqadZ4', batch_size = 50, delay = 2, model_name = 'gemini-2.0-flash-lite' ) 

study_types = [item for subcat in file['study_types'].values() for method_list in subcat.values() for item in method_list]
poverty_context = list(file['poverty_contexts'].values())
mechanisms = [item for subcat in file['mechanisms'].values() for item in subcat]
behaviors = [item for subcat in file['Behaviors'].values() for item in subcat]

results =  []


df = pd.read_csv(f'data/extractedPapers20{12}.csv')
results = gemini_api.classify(papers = df.to_dict(orient='records'), 
                            study_types =study_types, 
                            poverty_contexts = poverty_context, 
                            mechanisms = mechanisms, 
                            behaviors = behaviors)

# results.extend(classified)  

# results

2025-06-11 16:59:25,678 - INFO - Initialized with 1 API keys for parallel processing
2025-06-11 16:59:26,984 - INFO - Processing 11588 abstracts in parallel using 1 API keys
2025-06-11 16:59:26,987 - INFO - Created 232 batches for parallel processing
2025-06-11 16:59:27,868 - INFO - Successfully classified batch of 50 abstracts using API key: AIzaSyDA...
--- Logging error ---
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.11/logging/__init__.py", line 1113, in emit
    stream.write(msg + self.terminator)
  File "/opt/anaconda3/lib/python3.11/site-packages/ipykernel/iostream.py", line 662, in write
    self._schedule_flush()
  File "/opt/anaconda3/lib/python3.11/site-packages/ipykernel/iostream.py", line 559, in _schedule_flush
    self.pub_thread.schedule(_schedule_in_thread)
  File "/opt/anaconda3/lib/python3.11/site-packages/ipykernel/iostream.py", line 266, in schedule
    self._event_pipe.send(b"")
  File "/opt/anaconda3/lib/python3.11/site-packages/zmq/sugar

KeyboardInterrupt: 

2025-06-11 16:59:45,192 - ERROR - Error generating classification with API key AIzaSyCI...: 400 API key expired. Please renew the API key. [reason: "API_KEY_INVALID"
domain: "googleapis.com"
metadata {
  key: "service"
  value: "generativelanguage.googleapis.com"
}
, locale: "en-US"
message: "API key expired. Please renew the API key."
]
2025-06-11 16:59:45,192 - ERROR - Error generating classification with API key AIzaSyB9...: 400 API key expired. Please renew the API key. [reason: "API_KEY_INVALID"
domain: "googleapis.com"
metadata {
  key: "service"
  value: "generativelanguage.googleapis.com"
}
, locale: "en-US"
message: "API key expired. Please renew the API key."
]
--- Logging error ---
Traceback (most recent call last):
  File "/var/folders/gr/mhlc8kfs39b7v4v54d1fd4bc0000gn/T/ipykernel_1744/1966920006.py", line 175, in classify_batch
    response = self._generate(prompt, api_key)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/var/folders/gr/mhlc8kfs39b7v4v54d1fd4bc0000g

In [12]:
results

2025-06-11 16:59:06,316 - ERROR - Error generating classification with API key AIzaSyB9...: 400 API key expired. Please renew the API key. [reason: "API_KEY_INVALID"
domain: "googleapis.com"
metadata {
  key: "service"
  value: "generativelanguage.googleapis.com"
}
, locale: "en-US"
message: "API key expired. Please renew the API key."
]
--- Logging error ---
Traceback (most recent call last):
  File "/var/folders/gr/mhlc8kfs39b7v4v54d1fd4bc0000gn/T/ipykernel_1744/1966920006.py", line 175, in classify_batch
    response = self._generate(prompt, api_key)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/var/folders/gr/mhlc8kfs39b7v4v54d1fd4bc0000gn/T/ipykernel_1744/1966920006.py", line 92, in _generate
    output = self.models[api_key].generate_content(prompt).text
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/site-packages/google/generativeai/generative_models.py", line 331, in generate_content
    response = self._client.generat

[]

2025-06-11 16:59:14,333 - INFO - Successfully classified batch of 50 abstracts using API key: AIzaSyCF...
--- Logging error ---
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.11/logging/__init__.py", line 1113, in emit
    stream.write(msg + self.terminator)
  File "/opt/anaconda3/lib/python3.11/site-packages/ipykernel/iostream.py", line 662, in write
    self._schedule_flush()
  File "/opt/anaconda3/lib/python3.11/site-packages/ipykernel/iostream.py", line 559, in _schedule_flush
    self.pub_thread.schedule(_schedule_in_thread)
  File "/opt/anaconda3/lib/python3.11/site-packages/ipykernel/iostream.py", line 266, in schedule
    self._event_pipe.send(b"")
  File "/opt/anaconda3/lib/python3.11/site-packages/zmq/sugar/socket.py", line 696, in send
    return super().send(data, flags=flags, copy=copy, track=track)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "zmq/backend/cython/socket.pyx", line 742, in zmq.backend.cython.socket.Socket.

In [42]:
import pandas as pd

results_df = pd.DataFrame(results)
results_df['poverty_context'].value_counts() 

poverty_context
Insufficient info                                                                                          15757
Low resource level                                                                                          9396
Social environment                                                                                          5477
Low resource level, Social environment                                                                      4292
Low resource level, Human capital inputs                                                                    1253
Low resource level, Physical environment                                                                    1032
Human capital inputs                                                                                         795
Physical environment                                                                                         450
Low resource level, Resource volatility                                         

In [4]:
results_df.head()

NameError: name 'results_df' is not defined

In [46]:
results_df['country_of_study'].value_counts()

country_of_study
Insufficient info                                                                                                                                         9880
Turks and Caicos Islands                                                                                                                                  2157
United States Minor Outlying Islands                                                                                                                       992
Tuvalu                                                                                                                                                     961
Tunisia                                                                                                                                                    937
                                                                                                                                                          ... 
Syrian Arab Republic, Faroe I

In [2]:
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Fetch API key from environment variables
api_key = os.getenv("GEMINI_API_KEY")

# Load Google Sheets credentials from environment variables
type= os.getenv("type")
project_id = os.getenv("project_id")
private_key_id = os.getenv("private_key_id") 
private_key = os.getenv("private_key").replace("\\n", "\n")  
client_email = os.getenv("client_email")
client_id = os.getenv("client_id")
auth_uri = os.getenv("auth_uri")
token_uri = os.getenv("token_uri")
auth_provider_x509_cert_url = os.getenv("auth_provider_x509_cert_url")
client_x509_cert_url= os.getenv("client_x509_cert_url")
universe_domain= os.getenv("universe_domain")
credenetials = {
    "type": type,
    "project_id": project_id,
    "private_key_id": private_key_id,
    "private_key": private_key,
    "client_email": client_email,
    "client_id": client_id,
    "auth_uri": auth_uri,
    "token_uri": token_uri,
    "auth_provider_x509_cert_url": auth_provider_x509_cert_url,
    "client_x509_cert_url": client_x509_cert_url,
    "universe_domain": universe_domain
}
# Load Google Sheets IDs from environment variables
papers_spreadsheet_id = os.getenv("papers_spreadsheet_id")
density = os.getenv("density")
density_X = os.getenv("X")
density_Y = os.getenv("Y")
topics_spreadsheet_id = os.getenv("topics_spreadsheet_id")
spreadsheet_id =  {
    "papers": papers_spreadsheet_id,
    "density": density,
    "density_X": density_X,
    "density_Y": density_Y,
    "topics": topics_spreadsheet_id
}

In [29]:
# country_of_paper = results_df['country']
# results_df.drop(columns=['country'], inplace=True)
# # results_df.drop(columns=['country_of_paper'], inplace=True)
# results_df['country'] = df['country']
# # results_df['country_of_paper'] = country_of_paper


In [3]:
from data import googleSheets
import importlib

importlib.reload(googleSheets)

google_sheets = googleSheets.API(credentials_json=credenetials)

# Upload the results DataFrame to Google Sheets
google_sheets.replace(
    results_df, 
    spreadsheet_id=spreadsheet_id['papers'], 
    sheet_name='Sheet1'
    )


2025-06-11 08:23:08,852 - API - INFO - Initializing Google Sheets API
2025-06-11 08:23:08,853 - API - INFO - Building Google Sheets service
2025-06-11 08:23:08,905 - googleapiclient.discovery_cache - INFO - file_cache is only supported with oauth2client<4.0.0
2025-06-11 08:23:08,907 - API - INFO - Google Sheets service built successfully
2025-06-11 08:23:08,907 - API - INFO - API initialized with batch size: 1000


NameError: name 'results_df' is not defined