## Celda 1: Importaciones necesarias

In [10]:
import pandas as pd
import numpy as np
import boto3
import pickle
from datetime import date
import sklearn
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

## 1. Función para cargar datos desde S3

In [2]:
# Load the notebook configuration from a local YAML file. Later cells read
# config['s3'][...] for the AWS credentials and config['iexe']['matricula']
# for the bucket name. safe_load is used so the file can only produce plain
# Python data (dicts, lists, scalars).
import yaml
with open("credentials.yaml", "r") as f:
    config = yaml.safe_load(f)

# 1. Load the data from S3
def carga_datos_s3(bucket, key_prefix):
    """Load the most recently modified pickle under an S3 prefix as a DataFrame.

    Credentials are read from the module-level ``config`` dict
    (``config['s3']``).

    Args:
        bucket: Name of the S3 bucket to read from.
        key_prefix: Key prefix to search; every object under it is a candidate.

    Returns:
        pandas.DataFrame built from the unpickled object.

    Raises:
        FileNotFoundError: If there is no data object under ``key_prefix``.
    """
    session = boto3.Session(
        aws_access_key_id=config['s3']['aws_access_key_id'],
        aws_secret_access_key=config['s3']['aws_secret_access_key'],
        aws_session_token=config['s3']['aws_session_token']
    )
    s3 = session.client('s3')

    # list_objects_v2 returns at most 1000 keys per call, so walk every page;
    # otherwise the newest file could be missed once the prefix grows.
    paginator = s3.get_paginator('list_objects_v2')
    candidates = [
        entry
        for page in paginator.paginate(Bucket=bucket, Prefix=key_prefix)
        # 'Contents' is absent from the response when nothing matches.
        for entry in page.get('Contents', [])
        # Keys ending in '/' are folder placeholders, not data files.
        if not entry['Key'].endswith('/')
    ]
    if not candidates:
        raise FileNotFoundError(
            f"No objects found in s3://{bucket}/{key_prefix}"
        )

    # Pick the key of the most recently modified object
    latest_file = max(candidates, key=lambda entry: entry['LastModified'])['Key']

    # Download the object
    obj = s3.get_object(Bucket=bucket, Key=latest_file)

    # SECURITY: pickle.loads can execute arbitrary code embedded in the
    # payload. Only point this function at buckets whose writers are trusted.
    dataset = pickle.loads(obj['Body'].read())

    return pd.DataFrame(dataset)

# Bucket name is a fixed prefix plus the `matricula` value from the config
# file; the cleaned data is read from under the "limpieza/" prefix.
bucket = "aplicaciones-cd-1-" + config['iexe']['matricula']
key_prefix = "limpieza/"

# Load the data and report how many rows were read
df = carga_datos_s3(bucket, key_prefix)
print(f"Se cargaron {len(df)} registros de S3.")

Se cargaron 279463 registros de S3.


## 2. Eliminar observaciones con resultados inválidos

In [3]:
def delete_inspections(df):
    """Return the rows of ``df`` whose ``results`` value is not an excluded label.

    Rows with ``results`` equal to 'Business Not Located', 'No Entry' or
    'Out of Business' are removed; all other rows (and the original index
    labels) are kept. The input frame is not modified.
    """
    excluded_labels = ['Business Not Located', 'No Entry', 'Out of Business']
    is_excluded = df.results.isin(excluded_labels)
    return df.loc[~is_excluded]

# Rebind df to the filtered frame and report how many rows remain
df = delete_inspections(df)
print(f"Después de eliminar observaciones inválidas, quedan {len(df)} registros.")


Después de eliminar observaciones inválidas, quedan 244358 registros.


## 3. Transformar la variable results

In [4]:
def transform_result(df):
    """Collapse the ``results`` column into the binary labels 'pass' / 'fail'.

    'Pass' and 'Pass w/ Conditions' become 'pass'; every other value,
    including missing values, becomes 'fail'. A value that is already 'pass'
    stays 'pass', so applying the function twice gives the same result.

    Args:
        df: DataFrame with a ``results`` column.

    Returns:
        A new DataFrame with ``results`` recoded. ``df`` itself is left
        untouched, which avoids writing into a filtered slice of another
        frame (the pattern that triggers pandas' SettingWithCopyWarning).
    """
    # 'pass' is included so re-running the cell does not flip earlier output.
    passing_labels = ['Pass', 'Pass w/ Conditions', 'pass']

    out = df.copy()
    passed = out['results'].isin(passing_labels)
    out['results'] = passed.map({True: 'pass', False: 'fail'})
    return out

# Rebind df to the recoded frame, then report the class balance.
# Series.sum() counts the True values in one vectorized pass instead of
# iterating the rows one by one with the builtin sum().
df = transform_result(df)
print(f"Registros con 'pass': {(df['results'] == 'pass').sum()}")
print(f"Registros con 'fail': {(df['results'] == 'fail').sum()}")

Registros con 'pass': 186204
Registros con 'fail': 58154


## 4. Transformar la variable risk

In [5]:
def transform_risk(df):
    """Recode the ``risk`` column to the short labels high/medium/low/all.

    Values that are neither a known raw label nor an already-recoded label
    become NaN (``Series.map`` semantics).

    Args:
        df: DataFrame with a ``risk`` column.

    Returns:
        A new DataFrame with ``risk`` recoded; ``df`` is not modified.
    """
    risk_mapping = {
        'Risk 1 (High)': 'high',
        'Risk 2 (Medium)': 'medium',
        'Risk 3 (Low)': 'low',
        'All': 'all'
    }
    # Map each short label to itself so that re-running the cell on already
    # recoded data keeps the values instead of turning the column into NaN.
    risk_mapping.update({label: label for label in list(risk_mapping.values())})

    out = df.copy()
    out['risk'] = out['risk'].map(risk_mapping)
    return out

# Rebind df to the recoded frame and preview the first rows
df = transform_risk(df)
df.head(3)

Unnamed: 0,inspection_id,dba_name,aka_name,license_,facility_type,risk,address,city,state,zip,inspection_date,inspection_type,results,latitude,longitude,location,violations
0,67732,WOLCOTT'S,TROQUET,1992039,Restaurant,high,1834 W MONTROSE AVE,CHICAGO,IL,60613,2010-01-04,License Re-Inspection,pass,41.961606,-87.675967,"{'latitude': '41.961605669949854', 'longitude'...",
2,67733,WOLCOTT'S,TROQUET,1992040,Restaurant,high,1834 W MONTROSE AVE,CHICAGO,IL,60613,2010-01-04,License Re-Inspection,pass,41.961606,-87.675967,"{'latitude': '41.961605669949854', 'longitude'...",
3,104236,TEMPO CAFE,TEMPO CAFE,80916,Restaurant,high,6 E CHESTNUT ST,CHICAGO,IL,60611,2010-01-04,Canvass,fail,41.898431,-87.628009,"{'latitude': '41.89843137207629', 'longitude':...",18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...


## 5. Transformar la variable facility_type

In [6]:
def transform_facility(df):
    """Collapse ``facility_type`` variants into three canonical labels.

    Any value containing 'Daycare', 'Restaurant' or 'Mobile Food'
    (case-insensitive) is replaced by 'daycare', 'restaurant' or
    'mobile food' respectively. Rules are applied in that order, so a value
    matching several patterns takes the label of the first one. Missing
    values and non-matching values are left as they are.

    Args:
        df: DataFrame with a string ``facility_type`` column.

    Returns:
        A new DataFrame with ``facility_type`` normalized; ``df`` is not
        modified.
    """
    rules = (
        ('Daycare', 'daycare'),
        ('Restaurant', 'restaurant'),
        ('Mobile Food', 'mobile food'),
    )

    out = df.copy()
    for pattern, label in rules:
        # Re-evaluate on the current column so earlier replacements win.
        matches = out['facility_type'].str.contains(pattern, case=False, na=False)
        out.loc[matches, 'facility_type'] = label
    return out

def transform_facility_other(df, top_n=20):
    """Keep the ``top_n`` most frequent facility types and label the rest 'other'.

    Missing values are also labelled 'other' because they never appear among
    the counted types. Note that the result depends on the current contents
    of the column: applying the function to its own output counts 'other' as
    a type of its own.

    Args:
        df: DataFrame with a ``facility_type`` column.
        top_n: Number of most frequent types to keep (default 20).

    Returns:
        A new DataFrame with rare types replaced; ``df`` is not modified.
    """
    out = df.copy()
    frequent_types = out['facility_type'].value_counts().nlargest(top_n).index
    is_rare = ~out['facility_type'].isin(frequent_types)
    out.loc[is_rare, 'facility_type'] = 'other'
    return out

# Normalize the facility_type variants first, then bucket the less frequent
# types into 'other' (the frequency ranking is computed on the normalized
# labels), and preview the first rows.
df = transform_facility(df)
df = transform_facility_other(df)
df.head(3)

Unnamed: 0,inspection_id,dba_name,aka_name,license_,facility_type,risk,address,city,state,zip,inspection_date,inspection_type,results,latitude,longitude,location,violations
0,67732,WOLCOTT'S,TROQUET,1992039,restaurant,high,1834 W MONTROSE AVE,CHICAGO,IL,60613,2010-01-04,License Re-Inspection,pass,41.961606,-87.675967,"{'latitude': '41.961605669949854', 'longitude'...",
2,67733,WOLCOTT'S,TROQUET,1992040,restaurant,high,1834 W MONTROSE AVE,CHICAGO,IL,60613,2010-01-04,License Re-Inspection,pass,41.961606,-87.675967,"{'latitude': '41.961605669949854', 'longitude'...",
3,104236,TEMPO CAFE,TEMPO CAFE,80916,restaurant,high,6 E CHESTNUT ST,CHICAGO,IL,60611,2010-01-04,Canvass,fail,41.898431,-87.628009,"{'latitude': '41.89843137207629', 'longitude':...",18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...


## 6. Generar nuevas características

In [7]:
def generate_features(df):
    """Derive calendar features from the ``inspection_date`` column.

    Adds ``month``, ``year``, ``day_of_month``, ``week_of_year`` (ISO week),
    ``day_of_week`` (Monday=0 ... Sunday=6) and the 0/1 flags ``week_day``
    (Monday-Friday) and ``weekend`` (Saturday-Sunday).

    Args:
        df: DataFrame whose ``inspection_date`` column is datetime-like
            (the ``.dt`` accessor must be available).

    Returns:
        A new DataFrame with the extra columns; ``df`` is not modified.
    """
    out = df.copy()
    dates = out['inspection_date']

    out['month'] = dates.dt.month
    out['year'] = dates.dt.year
    out['day_of_month'] = dates.dt.day

    # isocalendar() yields the nullable UInt32 extension dtype. Mixed with
    # plain NumPy columns it makes DataFrame -> ndarray conversion fall back
    # to dtype=object, so store it as a regular NumPy dtype instead
    # (float64 only when missing dates force a NaN).
    iso_week = dates.dt.isocalendar().week
    week_dtype = 'float64' if iso_week.isna().any() else 'int64'
    out['week_of_year'] = iso_week.astype(week_dtype)

    out['day_of_week'] = dates.dt.dayofweek
    out['week_day'] = out['day_of_week'].isin([0, 1, 2, 3, 4]).astype(int)
    out['weekend'] = out['day_of_week'].isin([5, 6]).astype(int)
    return out

# Rebind df to the frame with the date-derived columns and preview it
df = generate_features(df)
df.head(3)

Unnamed: 0,inspection_id,dba_name,aka_name,license_,facility_type,risk,address,city,state,zip,...,longitude,location,violations,month,year,day_of_month,week_of_year,day_of_week,week_day,weekend
0,67732,WOLCOTT'S,TROQUET,1992039,restaurant,high,1834 W MONTROSE AVE,CHICAGO,IL,60613,...,-87.675967,"{'latitude': '41.961605669949854', 'longitude'...",,1,2010,4,1,0,1,0
2,67733,WOLCOTT'S,TROQUET,1992040,restaurant,high,1834 W MONTROSE AVE,CHICAGO,IL,60613,...,-87.675967,"{'latitude': '41.961605669949854', 'longitude'...",,1,2010,4,1,0,1,0
3,104236,TEMPO CAFE,TEMPO CAFE,80916,restaurant,high,6 E CHESTNUT ST,CHICAGO,IL,60611,...,-87.628009,"{'latitude': '41.89843137207629', 'longitude':...",18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,1,2010,4,1,0,1,0


## 7. Crear la matriz de diseño

In [12]:
# Record the scikit-learn version used for this run. feature_matrix passes
# `sparse_output` to OneHotEncoder, which requires scikit-learn >= 1.2.
print(sklearn.__version__)

1.5.2


In [13]:
def feature_matrix(df):
    """Build the design matrix: one-hot categoricals plus numeric features.

    ``facility_type`` and ``risk`` are one-hot encoded; the coordinate and
    calendar columns are passed through unchanged in value.

    Args:
        df: DataFrame containing the categorical and numeric feature columns
            listed below.

    Returns:
        DataFrame with the one-hot columns first (facility types, then risk
        levels) followed by the numeric columns. The index is a fresh
        0..n-1 range: rows keep the order of ``df`` but not its index labels.
    """
    categorical = ['facility_type', 'risk']
    numeric = ['latitude', 'longitude', 'month', 'year',
               'day_of_month', 'week_of_year', 'day_of_week', 'week_day', 'weekend']

    # Cast the pass-through columns to float64 up front. Nullable extension
    # dtypes (e.g. the UInt32 produced by .dt.isocalendar()) or mixed dtypes
    # would otherwise make the stacked array dtype=object.
    X = df[categorical + numeric].astype({col: 'float64' for col in numeric})

    ct = ColumnTransformer([
        ('onehot_facility', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), ['facility_type']),
        ('onehot_risk', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), ['risk'])
    ], remainder='passthrough')

    X_encoded = ct.fit_transform(X)

    # Names of the generated one-hot columns, in transformer order
    onehot_cols = (ct.named_transformers_['onehot_facility'].get_feature_names_out(['facility_type']).tolist() +
                   ct.named_transformers_['onehot_risk'].get_feature_names_out(['risk']).tolist())

    # ColumnTransformer emits the remainder columns after the transformed
    # ones, in their original order, so the numeric names follow directly.
    feature_names = onehot_cols + numeric

    return pd.DataFrame(X_encoded, columns=feature_names)

# Build the design matrix from the transformed frame, then show a preview
# and its dimensions as a sanity check.
feature_matrix_df = feature_matrix(df)
print(feature_matrix_df.head())
print(f"Shape of feature matrix: {feature_matrix_df.shape}")

  facility_type_BANQUET HALL facility_type_Bakery facility_type_Catering  \
0                        0.0                  0.0                    0.0   
1                        0.0                  0.0                    0.0   
2                        0.0                  0.0                    0.0   
3                        0.0                  0.0                    0.0   
4                        0.0                  0.0                    0.0   

  facility_type_Children's Services Facility facility_type_GAS STATION  \
0                                        0.0                       0.0   
1                                        0.0                       0.0   
2                                        0.0                       0.0   
3                                        0.0                       0.0   
4                                        0.0                       0.0   

  facility_type_Golden Diner facility_type_Grocery Store  \
0                        0.0          

## 8. Guardar la matriz de características en S3

In [14]:
def save_feature_matrix(bucket, matrix_df):
    """Pickle ``matrix_df`` and upload it to S3 under a date-stamped key.

    The object is written to
    ``feature-matrix/feature-matrix.<YYYY-MM-DD>.pkl`` using today's local
    date, and the destination is printed once the upload call returns.
    Credentials come from the module-level ``config`` dict.

    Args:
        bucket: Name of the destination S3 bucket.
        matrix_df: Object to serialize with pickle (the feature matrix).
    """
    s3_credentials = config['s3']
    client = boto3.Session(
        aws_access_key_id=s3_credentials['aws_access_key_id'],
        aws_secret_access_key=s3_credentials['aws_secret_access_key'],
        aws_session_token=s3_credentials['aws_session_token'],
    ).client('s3')

    # One object per calendar day: a second run on the same day overwrites it.
    today = date.today().strftime("%Y-%m-%d")
    key = f"feature-matrix/feature-matrix.{today}.pkl"

    client.put_object(Bucket=bucket, Key=key, Body=pickle.dumps(matrix_df))
    print(f"Matriz de características guardada en {bucket}/{key}")

# Upload the feature matrix to the same bucket the input data was read from
save_feature_matrix(bucket, feature_matrix_df)

print("Proceso de feature engineering completado.")

Matriz de características guardada en aplicaciones-cd-1-mcda24a004/feature-matrix/feature-matrix.2024-10-20.pkl
Proceso de feature engineering completado.
