In [16]:
import pandas as pd
import boto3
import json
from dotenv import load_dotenv
import time
import random
import os
from tenacity import retry, wait_exponential, stop_after_attempt, retry_if_exception_type
import botocore.exceptions
import re

In [2]:
def load(path='../data/credit_data_with_descriptions.csv'):
    """Read the credit dataset from a CSV file and print a summary of it.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file. Defaults to the project-relative path the
        notebook has always used, so existing ``load()`` calls are unaffected.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV file.
    """
    data = pd.read_csv(path)

    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() only appended a stray "None" line.
    data.info()

    return data

In [3]:
# Batch size setting.
# NOTE(review): presumably the number of applicant profiles sent per model
# request (compare the chunk size handed to main()) — TODO confirm.
BATCH_SIZE = 50
# Bedrock model identifier, passed as `modelId` when invoking the model.
MODEL_ID = "anthropic.claude-3-sonnet-20240229-v1:0"
# AWS region name used when the bedrock-runtime client is created.
REGION = 'sa-east-1'

In [4]:
def construir_prompt_en(batch):
    """Build the English labelling prompt for a batch of applicant profiles.

    The prompt starts with the instructions (including how many targets are
    expected) and is followed by one numbered entry per row, numbered from 1
    in row order, each carrying that row's ``Description`` value.

    Parameters
    ----------
    batch : pandas.DataFrame
        Rows to include; must provide a ``Description`` column.

    Returns
    -------
    str
        The complete prompt text.
    """
    header = (
        "Below are profiles summaries of credit applicants. For each profile, generate a risk target (“bad risk” or “good risk”). "
        f"Return the targets numbered from 1 to {len(batch)}.\n\n"
    )
    # Numbering follows the position inside the batch, not the DataFrame index.
    entries = [
        f"{position}.\nSummary: {record['Description']}\n\n"
        for position, (_, record) in enumerate(batch.iterrows(), start=1)
    ]
    return header + "".join(entries)

In [None]:
# Create the Bedrock runtime client in the region given by REGION.
# NOTE(review): this runs as soon as the cell executes, whereas load_dotenv()
# is only called inside main(); if AWS credentials are expected to come from
# a .env file, confirm they are already available at this point.
bedrock = boto3.client("bedrock-runtime", region_name=REGION)

# Automatic retry wrapper, intended to cope with throttling
@retry(
    retry=retry_if_exception_type(botocore.exceptions.ClientError),
    wait=wait_exponential(multiplier=1, min=2, max=10),
    stop=stop_after_attempt(6),
)
def obtener_respuesta(prompt):
    """Send ``prompt`` to the Bedrock model as one user message and return the reply text.

    The call is retried whenever it raises ``botocore.exceptions.ClientError``,
    with an exponential wait bounded between 2 and 10, for at most 6 attempts.

    Parameters
    ----------
    prompt : str
        Text placed in the ``content`` field of the single user message.

    Returns
    -------
    str
        The ``text`` field of the first item in the response's ``content`` list.
    """
    # Request payload; key order is kept so the serialized body is unchanged.
    payload = {
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 800,
        "temperature": 0.7,
        "anthropic_version": "bedrock-2023-05-31",
    }
    raw_response = bedrock.invoke_model(
        modelId=MODEL_ID,
        contentType="application/json",
        accept="application/json",
        body=json.dumps(payload),
    )
    # The response body is a stream holding a JSON document.
    parsed = json.loads(raw_response['body'].read())
    return parsed['content'][0]['text']

In [18]:
def _parse_targets(respuesta, expected):
    """Extract the labels from a numbered model reply, validating its numbering.

    Only lines of the form ``"<number>. <label>"`` are considered; any other
    line (preamble, blank lines, closing remarks) is ignored.

    Parameters
    ----------
    respuesta : str
        Raw text returned by the model.
    expected : int
        Number of labels the reply must contain.

    Returns
    -------
    list of str
        The labels in reply order, with the leading ``"<number>. "`` removed.

    Raises
    ------
    ValueError
        If the count of numbered lines differs from ``expected``, if the
        numbers are not exactly 1..expected in order, or if a label is empty.
    """
    numbered = []
    for line in respuesta.strip().split("\n"):
        match = re.match(r"^(\d+)\.\s*(.*)$", line.strip())
        if match:  # keep only lines that start with "<number>."
            numbered.append((int(match.group(1)), match.group(2).strip()))

    if len(numbered) != expected:
        raise ValueError(f"Se esperaban {expected} descripciones pero se obtuvieron {len(numbered)}")

    # A matching count alone does not guarantee that label k belongs to row k:
    # the reply could skip, repeat or reorder numbers.
    numbers = [number for number, _ in numbered]
    if numbers != list(range(1, expected + 1)):
        raise ValueError(f"La numeración de la respuesta no coincide con 1..{expected}: {numbers}")

    labels = [label for _, label in numbered]
    if not all(labels):
        raise ValueError("La respuesta contiene líneas numeradas sin etiqueta")
    return labels


def main(chunk_size=5):
    """Label every row of the dataset with a model-generated risk target and export it.

    The dataset is processed in consecutive batches of ``chunk_size`` rows.
    For each batch a prompt is built, sent to the model, and the numbered
    reply is parsed into one label per row. A batch whose request or parsing
    fails is not retried here: its rows receive the placeholder ``"ERROR"``
    and processing continues. The result is written to
    ``credit_data_with_targets.csv`` with a new ``Risk target`` column.

    Parameters
    ----------
    chunk_size : int, optional
        Number of rows per model request. Must be positive.

    Raises
    ------
    ValueError
        If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size debe ser un entero positivo, se recibió {chunk_size}")

    load_dotenv()
    df = load()
    targets = []
    failed_batches = 0

    for i in range(0, len(df), chunk_size):
        batch = df.iloc[i:i+chunk_size]
        # 1-based, inclusive row range actually covered by this batch.
        first_row, last_row = i + 1, i + len(batch)
        prompt = construir_prompt_en(batch)
        print(f"Procesando filas {first_row} a {last_row}...")

        try:
            respuesta = obtener_respuesta(prompt)
            print(respuesta)
            targets.extend(_parse_targets(respuesta, len(batch)))

        except Exception as e:
            # Deliberate best effort: mark the batch and keep going so that
            # `targets` always stays aligned with the rows of `df`.
            failed_batches += 1
            print(f"Error en filas {first_row} a {last_row}: {e}")
            targets.extend(["ERROR"] * len(batch))

        if last_row < len(df):
            time.sleep(2)  # pause between batches to avoid throttling

    df["Risk target"] = targets
    df.to_csv("credit_data_with_targets.csv", index=False)
    if failed_batches:
        print(f"Archivo exportado con {failed_batches} lote(s) marcados como ERROR")
    else:
        print("Archivo exportado TOTALMENTE")


In [19]:
# Run the labelling pipeline with the configured batch size rather than
# repeating its value as a literal.
main(BATCH_SIZE)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Age               1000 non-null   int64 
 1   Sex               1000 non-null   object
 2   Job               1000 non-null   int64 
 3   Housing           1000 non-null   object
 4   Saving accounts   1000 non-null   object
 5   Checking account  1000 non-null   object
 6   Credit amount     1000 non-null   int64 
 7   Duration          1000 non-null   int64 
 8   Purpose           1000 non-null   object
 9   Description       1000 non-null   object
dtypes: int64(4), object(6)
memory usage: 78.3+ KB
None
Procesando filas 1 a 50...
1. Good risk
2. Bad risk
3. Good risk
4. Bad risk
5. Bad risk
6. Bad risk
7. Good risk
8. Good risk
9. Good risk
10. Good risk
11. Good risk
12. Bad risk
13. Good risk
14. Bad risk
15. Bad risk
16. Good risk
17. Good risk
18. Bad risk
19. Bad risk
20. Good ri