<h2>Proyecto 1: Modelado</h2>

In [1]:
## Instalación de librerías necesarias
!pip install --quiet boto3


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
## Importación de librerías
import pandas as pd
import numpy as np
import pickle
import boto3
import yaml
from datetime import date
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [3]:
## Load credentials
## The parsed YAML is kept in `config`; later cells read config['s3'][...]
## for the AWS keys and config['iexe']['matricula'] for the bucket suffix.
with open("credentials.yaml", mode="r") as credentials_file:
    config = yaml.safe_load(credentials_file)

In [4]:
## Helper to load pickled data from S3
def cargar_datos_s3(bucket, bucket_path):
    """Download an S3 object and return its unpickled content.

    Args:
        bucket: Name of the S3 bucket that holds the object.
        bucket_path: Key of the object inside the bucket.

    Returns:
        The Python object that ``pickle.loads`` rebuilds from the object's bytes.

    Note:
        Unpickling can execute arbitrary code, so only point this at objects
        written by a trusted source.
    """
    # Credentials are read from the module-level `config` mapping.
    s3_config = config['s3']
    resource = boto3.Session(
        aws_access_key_id=s3_config['aws_access_key_id'],
        aws_secret_access_key=s3_config['aws_secret_access_key'],
        aws_session_token=s3_config['aws_session_token'],
    ).resource('s3')

    response = resource.Object(bucket, bucket_path).get()
    raw_bytes = response['Body'].read()
    return pickle.loads(raw_bytes)

## Helper to store pickled objects in S3
def save_model(bucket, bucket_path, model):
    """Pickle ``model`` and upload the bytes to S3.

    Args:
        bucket: Name of the destination S3 bucket.
        bucket_path: Key under which the pickled bytes are stored.
        model: Any picklable Python object (the notebook also uses this
            helper for the train/test datasets, not only for estimators).

    Returns:
        None. The only effect is the uploaded object.
    """
    # Credentials are read from the module-level `config` mapping.
    s3_config = config['s3']
    resource = boto3.Session(
        aws_access_key_id=s3_config['aws_access_key_id'],
        aws_secret_access_key=s3_config['aws_secret_access_key'],
        aws_session_token=s3_config['aws_session_token'],
    ).resource('s3')

    payload = pickle.dumps(model)
    resource.Object(bucket, bucket_path).put(Body=payload)

In [5]:
## S3 session setup
## Build the boto3 session from the three credential entries stored under
## config['s3'], then open the low-level S3 client used to list objects.
session = boto3.Session(
    **{
        credential_name: config['s3'][credential_name]
        for credential_name in (
            'aws_access_key_id',
            'aws_secret_access_key',
            'aws_session_token',
        )
    }
)

s3 = session.client('s3')

In [6]:
## Load the design (feature) matrix from S3
source_bucket = "aplicaciones-cd-1-" + config['iexe']['matricula']
key = "feature-matrix/"

# list_objects_v2 omits 'Contents' when nothing matches the prefix, and the
# listing may include the zero-byte "folder" placeholder (a key ending in "/").
# Neither case yields a loadable pickle, so filter them out and fail with a
# clear message instead of a bare KeyError / unpickling error.
listing = s3.list_objects_v2(Bucket=source_bucket, Prefix=key)
candidate_keys = [
    entry['Key']
    for entry in listing.get('Contents', [])
    if not entry['Key'].endswith('/')
]
if not candidate_keys:
    raise FileNotFoundError(
        f"No feature matrix found under s3://{source_bucket}/{key}"
    )

# S3 returns keys in ascending key order, so the last entry is the
# lexicographically greatest key under the prefix (same choice as before).
# NOTE(review): a single list_objects_v2 call returns at most 1000 keys;
# confirm the prefix stays below that or paginate.
bucket_path = candidate_keys[-1]
feature_matrix = cargar_datos_s3(source_bucket, bucket_path)

## Seed NumPy's global random generator
np.random.seed(20201122)

In [7]:
## Split into training and test data
## Rows are ordered by index first (assumed to hold dates — TODO confirm), so
## the first 70% of rows form the training set and the remainder the test set.
feature_matrix = feature_matrix.sort_index()
train_size = int(len(feature_matrix) * 0.7)
train_data, test_data = (
    feature_matrix.iloc[:train_size],
    feature_matrix.iloc[train_size:],
)

In [14]:
# Create the S3 session and resource
session = boto3.Session(
    aws_access_key_id = config['s3']['aws_access_key_id'],
    aws_secret_access_key = config['s3']['aws_secret_access_key'],
    aws_session_token = config['s3']['aws_session_token'],
    region_name='us-east-1'  # Pin the region explicitly
)

s3_resource = session.resource('s3')
s3_client = s3_resource.meta.client
target_bucket = "aplicaciones-cd-2-" + config['iexe']['matricula']

# Check whether the bucket exists.
# Only a 404 response means "bucket is missing". Any other failure
# (permissions, networking, interrupted kernel) is propagated instead of
# being silently interpreted as "does not exist", which a bare `except:` did.
try:
    s3_client.head_bucket(Bucket=target_bucket)
    bucket_exists = True
except s3_client.exceptions.ClientError as e:
    if e.response.get('Error', {}).get('Code') not in ('404', 'NoSuchBucket'):
        raise
    bucket_exists = False

# Create the bucket when it does not exist yet
if not bucket_exists:
    try:
        bucket = s3_resource.create_bucket(Bucket=target_bucket)
        bucket.wait_until_exists()  # Block until the bucket is available
        print(f"Bucket {target_bucket} creado exitosamente")
    except Exception as e:
        print(f"Error creando el bucket: {str(e)}")
        raise  # Re-raise so execution stops here
else:
    print(f"El bucket {target_bucket} ya existe")

# Make sure the bucket is reachable before creating the folders
try:
    s3_client.head_bucket(Bucket=target_bucket)

    # Create the required "folders" (zero-byte objects whose key ends in "/")
    folders = ["dataset/train/", "dataset/test/", "modelos/"]
    for folder in folders:
        try:
            s3_resource.Object(target_bucket, folder).put(Body='')
            print(f"Carpeta {folder} creada exitosamente")
        except Exception as e:
            # Best effort: report the failure and continue with the next folder
            print(f"Error creando la carpeta {folder}: {str(e)}")
except Exception as e:
    print(f"No se pudo verificar el bucket: {str(e)}")

Bucket aplicaciones-cd-2-mcda24a004 creado exitosamente
Carpeta dataset/train/ creada exitosamente
Carpeta dataset/test/ creada exitosamente
Carpeta modelos/ creada exitosamente


In [15]:
# Upload the pickled training split to the target bucket
save_model(
    bucket=target_bucket,
    bucket_path="dataset/train/train_dataset.pkl",
    model=train_data,
)

In [16]:
# Upload the pickled test split to the target bucket
save_model(
    bucket=target_bucket,
    bucket_path="dataset/test/test_dataset.pkl",
    model=test_data,
)

In [17]:
## Grid search for the decision tree
## Hyperparameter values explored for each decision-tree setting.
dt_param_grid = dict(
    max_depth=[5, 7, 9, 11, 15],
    min_samples_leaf=[5, 7, 9, 11, 13],
    criterion=['gini', 'entropy'],
)

In [18]:
## Grid search over the decision-tree hyperparameters using ordered CV folds
ts_split = TimeSeriesSplit(n_splits=5)
# Fix the estimator's random_state: scikit-learn randomly permutes features at
# each split, so an unseeded tree is not guaranteed to produce the same model
# on every run. The value matches the seed passed to np.random.seed.
dt_model = DecisionTreeClassifier(random_state=20201122)
dt_grid_search = GridSearchCV(
    estimator=dt_model,
    param_grid=dt_param_grid,
    cv=ts_split,
    scoring='accuracy',
    n_jobs=-1,
)
# NOTE(review): assumes the last column is the target and every other column
# is a feature — confirm against the feature-matrix schema.
dt_grid_search.fit(train_data.iloc[:, :-1], train_data.iloc[:, -1])

In [19]:
## Save the best model
best_dt_model = dt_grid_search.best_estimator_
# Store it under "modelos/", the prefix created for models in the target
# bucket; the previous "models/" prefix did not match that folder.
best_model_path = "modelos/decision_tree_best_model.pkl"
save_model(target_bucket, best_model_path, best_dt_model)

print("Mejor modelo guardado en S3.")

Mejor modelo guardado en S3.
