In [1]:
%%capture

# libraries for data manipulation
import pandas as pd
import numpy as np

# libraries for data visualization
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

# autoreload modules
%load_ext autoreload
%autoreload 2


# import train_test_split from sklearn
from sklearn.model_selection import train_test_split

# Libraries for Experiment Tracking
import mlflow
import IPython.display


# Libraries for Model Training and Evaluation
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
from sklearn.utils.validation import check_is_fitted

# libraries for file management
import os
import shutil
import joblib
# 
# import data_utils



In [52]:
json_data = {
  "AGE": 32,
  "APPLICATION_SUBMISSION_TYPE": "Web",
  "CITY_OF_BIRTH": "RIO DE JANEIRO",
  "COMPANY": "Y",
  "FLAG_EMAIL": 0,
  "FLAG_MASTERCARD": 1,
  "FLAG_PROFESSIONAL_PHONE": "N",
  "FLAG_RESIDENCIAL_PHONE": "Y",
  "FLAG_VISA": 0,
  "MARITAL_STATUS": 3,
  "NACIONALITY": 1,
  "OCCUPATION_TYPE": 5,
  "OTHER_INCOMES": 3,
  "PAYMENT_DAY": 40,
  "PERSONAL_ASSETS_VALUE": "50000",
  "PERSONAL_MONTHLY_INCOME": "60000",
  "MONTHS_IN_RESIDENCE": 2,
  "PRODUCT": 2,
  "PROFESSIONAL_ZIP_3": 18,
  "PROFESSION_CODE": 5,
  "QUANT_BANKING_ACCOUNTS": 1,
  "QUANT_CARS": 0,
  "QUANT_DEPENDANTS": 0,
  "QUANT_SPECIAL_BANKING_ACCOUNTS": 1,
  "RESIDENCE_TYPE": 1,
  "RESIDENCIAL_BOROUGH": 1,
  "RESIDENCIAL_CITY": "MENDES",
  "RESIDENCIAL_PHONE_AREA_CODE": 2,
  "RESIDENCIAL_STATE": "RS",
  "RESIDENCIAL_ZIP_3": 4,
  "SEX": "F",
  "STATE_OF_BIRTH": "RJ"
}

In [53]:
path = os.path.join(os.getcwd(), "jobs/encoding_features.pkl")

# Abrimos el archivo en modo lectura binaria ('rb')
with open(path, 'rb') as f:
    # Cargamos los datos del archivo pickle
    encoding_features = pickle.load(f)

encoding_features

['MARITAL_STATUS',
 'NACIONALITY',
 'RESIDENCE_TYPE',
 'FLAG_EMAIL',
 'FLAG_VISA',
 'FLAG_MASTERCARD',
 'QUANT_BANKING_ACCOUNTS',
 'QUANT_SPECIAL_BANKING_ACCOUNTS',
 'QUANT_CARS',
 'OCCUPATION_TYPE',
 'PRODUCT',
 'PROFESSION_CODE',
 'APPLICATION_SUBMISSION_TYPE',
 'SEX',
 'STATE_OF_BIRTH',
 'CITY_OF_BIRTH',
 'RESIDENCIAL_STATE',
 'RESIDENCIAL_CITY',
 'RESIDENCIAL_BOROUGH',
 'FLAG_RESIDENCIAL_PHONE',
 'RESIDENCIAL_PHONE_AREA_CODE',
 'COMPANY',
 'FLAG_PROFESSIONAL_PHONE',
 'RESIDENCIAL_ZIP_3',
 'PROFESSIONAL_ZIP_3']

In [54]:
def convert_payment_day(df):
    # Creamos las nuevas columnas
    for i in range(5, 30, 5):
        df[f'PAYMENT_DAY_{i}'] = df['PAYMENT_DAY'].apply(lambda x: 1 if i-5 < x <= i else 0)
    
    # Agregamos los valores más altos a la columna PAYMENT_DAY_25
    df['PAYMENT_DAY_25'] = df['PAYMENT_DAY'].apply(lambda x: 1 if x >= 25 else 0)

    # Eliminamos la columna original
    df.drop(columns=['PAYMENT_DAY'], inplace=True)
    
    return df


def convert_age(df):
    # Define los rangos de edades que quieres usar
    age_ranges = [(0, 18), (19, 30), (31, 50), (51, 70), (71, 100)]
    
    for i, age_range in enumerate(age_ranges):
        min_age, max_age = age_range
        # Crea una nueva columna para cada rango de edad
        df[f'AGE_RANGE_{min_age}_{max_age}'] = df['AGE'].apply(lambda x: 1 if min_age <= x <= max_age else 0)
    
    # Elimina la columna original de 'AGE'
    df.drop(columns=['AGE'], inplace=True)
    
    return df



In [55]:
# json to dataframe
df = pd.DataFrame.from_dict(json_data, orient='index', columns=['Value'])

In [56]:
df = df.T

In [57]:
df

Unnamed: 0,AGE,APPLICATION_SUBMISSION_TYPE,CITY_OF_BIRTH,COMPANY,FLAG_EMAIL,FLAG_MASTERCARD,FLAG_PROFESSIONAL_PHONE,FLAG_RESIDENCIAL_PHONE,FLAG_VISA,MARITAL_STATUS,...,QUANT_DEPENDANTS,QUANT_SPECIAL_BANKING_ACCOUNTS,RESIDENCE_TYPE,RESIDENCIAL_BOROUGH,RESIDENCIAL_CITY,RESIDENCIAL_PHONE_AREA_CODE,RESIDENCIAL_STATE,RESIDENCIAL_ZIP_3,SEX,STATE_OF_BIRTH
Value,32,Web,RIO DE JANEIRO,Y,0,1,N,Y,0,3,...,0,1,1,1,MENDES,2,RS,4,F,RJ


In [58]:
path = os.path.join(os.getcwd(), "jobs/start_columns.pkl")

# Abrimos el archivo en modo lectura binaria ('rb')
with open(path, 'rb') as f:
    # Cargamos los datos del archivo pickle
    start_columns = pickle.load(f)

start_columns

['PAYMENT_DAY',
 'APPLICATION_SUBMISSION_TYPE',
 'SEX',
 'MARITAL_STATUS',
 'QUANT_DEPENDANTS',
 'STATE_OF_BIRTH',
 'CITY_OF_BIRTH',
 'NACIONALITY',
 'RESIDENCIAL_STATE',
 'RESIDENCIAL_CITY',
 'RESIDENCIAL_BOROUGH',
 'FLAG_RESIDENCIAL_PHONE',
 'RESIDENCIAL_PHONE_AREA_CODE',
 'RESIDENCE_TYPE',
 'MONTHS_IN_RESIDENCE',
 'FLAG_EMAIL',
 'PERSONAL_MONTHLY_INCOME',
 'OTHER_INCOMES',
 'FLAG_VISA',
 'FLAG_MASTERCARD',
 'QUANT_BANKING_ACCOUNTS',
 'QUANT_SPECIAL_BANKING_ACCOUNTS',
 'PERSONAL_ASSETS_VALUE',
 'QUANT_CARS',
 'COMPANY',
 'FLAG_PROFESSIONAL_PHONE',
 'PROFESSION_CODE',
 'OCCUPATION_TYPE',
 'PRODUCT',
 'AGE',
 'RESIDENCIAL_ZIP_3',
 'PROFESSIONAL_ZIP_3']

In [59]:
# ordena las columnas de df en funcion de start_columns
df = df[start_columns]

In [60]:
df

Unnamed: 0,PAYMENT_DAY,APPLICATION_SUBMISSION_TYPE,SEX,MARITAL_STATUS,QUANT_DEPENDANTS,STATE_OF_BIRTH,CITY_OF_BIRTH,NACIONALITY,RESIDENCIAL_STATE,RESIDENCIAL_CITY,...,PERSONAL_ASSETS_VALUE,QUANT_CARS,COMPANY,FLAG_PROFESSIONAL_PHONE,PROFESSION_CODE,OCCUPATION_TYPE,PRODUCT,AGE,RESIDENCIAL_ZIP_3,PROFESSIONAL_ZIP_3
Value,40,Web,F,3,0,RJ,RIO DE JANEIRO,1,RS,MENDES,...,50000,0,Y,N,5,5,2,32,4,18


In [52]:
df

Unnamed: 0,PAYMENT_DAY,APPLICATION_SUBMISSION_TYPE,SEX,MARITAL_STATUS,QUANT_DEPENDANTS,STATE_OF_BIRTH,CITY_OF_BIRTH,NACIONALITY,RESIDENCIAL_STATE,RESIDENCIAL_CITY,...,PERSONAL_ASSETS_VALUE,QUANT_CARS,COMPANY,FLAG_PROFESSIONAL_PHONE,PROFESSION_CODE,OCCUPATION_TYPE,PRODUCT,AGE,RESIDENCIAL_ZIP_3,PROFESSIONAL_ZIP_3
Value,40,Web,F,3,0,RJ,RIO DE JANEIRO,1,RS,MENDES,...,50000,0,Y,N,5,5,2,32,4,18


In [61]:
df = convert_payment_day(df)
df.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f'PAYMENT_DAY_{i}'] = df['PAYMENT_DAY'].apply(lambda x: 1 if i-5 < x <= i else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f'PAYMENT_DAY_{i}'] = df['PAYMENT_DAY'].apply(lambda x: 1 if i-5 < x <= i else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f'PAYMENT_DAY_{i}'] = df['PAYMEN

(1, 36)

In [15]:
df

Unnamed: 0,APPLICATION_SUBMISSION_TYPE,SEX,MARITAL_STATUS,QUANT_DEPENDANTS,STATE_OF_BIRTH,CITY_OF_BIRTH,NACIONALITY,RESIDENCIAL_STATE,RESIDENCIAL_CITY,RESIDENCIAL_BOROUGH,...,OCCUPATION_TYPE,PRODUCT,AGE,RESIDENCIAL_ZIP_3,PROFESSIONAL_ZIP_3,PAYMENT_DAY_5,PAYMENT_DAY_10,PAYMENT_DAY_15,PAYMENT_DAY_20,PAYMENT_DAY_25
Value,Web,F,3,0,RJ,RIO DE JANEIRO,1,RS,MENDES,1,...,5,2,32,4,18,0,0,0,0,1


In [62]:
import copy

new_data = copy.deepcopy(df)

In [63]:
path = os.path.join(os.getcwd(), "jobs/encoding_features.pkl")

# Abrimos el archivo en modo lectura binaria ('rb')
with open(path, 'rb') as f:
    # Cargamos los datos del archivo pickle
    encoding_features = pickle.load(f)

encoding_features

['MARITAL_STATUS',
 'NACIONALITY',
 'RESIDENCE_TYPE',
 'FLAG_EMAIL',
 'FLAG_VISA',
 'FLAG_MASTERCARD',
 'QUANT_BANKING_ACCOUNTS',
 'QUANT_SPECIAL_BANKING_ACCOUNTS',
 'QUANT_CARS',
 'OCCUPATION_TYPE',
 'PRODUCT',
 'PROFESSION_CODE',
 'APPLICATION_SUBMISSION_TYPE',
 'SEX',
 'STATE_OF_BIRTH',
 'CITY_OF_BIRTH',
 'RESIDENCIAL_STATE',
 'RESIDENCIAL_CITY',
 'RESIDENCIAL_BOROUGH',
 'FLAG_RESIDENCIAL_PHONE',
 'RESIDENCIAL_PHONE_AREA_CODE',
 'COMPANY',
 'FLAG_PROFESSIONAL_PHONE',
 'RESIDENCIAL_ZIP_3',
 'PROFESSIONAL_ZIP_3']

In [64]:
# remove features with lot of classes to perform ohe
# encoding_features.remove("CITY_OF_BIRTH")
# encoding_features.remove("RESIDENCIAL_CITY")
# encoding_features.remove("RESIDENCIAL_BOROUGH")

list_dorp = ["CITY_OF_BIRTH", "RESIDENCIAL_CITY", "RESIDENCIAL_BOROUGH"]

new_data.drop(columns=list_dorp, inplace=True)

In [65]:
path = os.path.join(os.getcwd(), "jobs/ohe_fitted.pkl")

# Abrimos el archivo en modo lectura binaria ('rb')
with open(path, 'rb') as f:
    # Cargamos los datos del archivo pickle
    ohe_fitted = pickle.load(f)

ohe_fitted

In [66]:
features_in_ohe = ohe_fitted.feature_names_in_
features_in_ohe

array(['MARITAL_STATUS', 'NACIONALITY', 'RESIDENCE_TYPE', 'FLAG_EMAIL',
       'FLAG_VISA', 'FLAG_MASTERCARD', 'QUANT_BANKING_ACCOUNTS',
       'QUANT_SPECIAL_BANKING_ACCOUNTS', 'QUANT_CARS', 'OCCUPATION_TYPE',
       'PRODUCT', 'PROFESSION_CODE', 'APPLICATION_SUBMISSION_TYPE', 'SEX',
       'STATE_OF_BIRTH', 'RESIDENCIAL_STATE', 'FLAG_RESIDENCIAL_PHONE',
       'RESIDENCIAL_PHONE_AREA_CODE', 'COMPANY',
       'FLAG_PROFESSIONAL_PHONE', 'RESIDENCIAL_ZIP_3',
       'PROFESSIONAL_ZIP_3'], dtype=object)

In [67]:
len(features_in_ohe)

22

In [68]:
list_dorp = ["PAYMENT_DAY_5", "PAYMENT_DAY_10", "PAYMENT_DAY_15", "PAYMENT_DAY_20", "PAYMENT_DAY_25"]

new_data.drop(columns=list_dorp, inplace=True)

In [69]:
features_in_new_data = new_data.columns.tolist()
print(features_in_new_data)

['APPLICATION_SUBMISSION_TYPE', 'SEX', 'MARITAL_STATUS', 'QUANT_DEPENDANTS', 'STATE_OF_BIRTH', 'NACIONALITY', 'RESIDENCIAL_STATE', 'FLAG_RESIDENCIAL_PHONE', 'RESIDENCIAL_PHONE_AREA_CODE', 'RESIDENCE_TYPE', 'MONTHS_IN_RESIDENCE', 'FLAG_EMAIL', 'PERSONAL_MONTHLY_INCOME', 'OTHER_INCOMES', 'FLAG_VISA', 'FLAG_MASTERCARD', 'QUANT_BANKING_ACCOUNTS', 'QUANT_SPECIAL_BANKING_ACCOUNTS', 'PERSONAL_ASSETS_VALUE', 'QUANT_CARS', 'COMPANY', 'FLAG_PROFESSIONAL_PHONE', 'PROFESSION_CODE', 'OCCUPATION_TYPE', 'PRODUCT', 'AGE', 'RESIDENCIAL_ZIP_3', 'PROFESSIONAL_ZIP_3']


In [70]:
len(features_in_new_data)

28

In [71]:
a =set(features_in_new_data) - set(features_in_ohe)

new_data= new_data.drop(columns=a, axis=1)

In [73]:
features_in_ohe = ohe_fitted.feature_names_in_
features_in_new_data = new_data.columns.tolist()

In [74]:
set(features_in_new_data) - set(features_in_ohe)

set()

In [75]:
assert set(features_in_new_data) == set(features_in_ohe)

In [None]:
new_data.co

In [77]:
ew_encoded = ohe_fitted.transform(new_data[features_in_ohe])



In [79]:
ew_encoded.shape

(1, 1762)

In [80]:
path = os.path.join(os.getcwd(), "jobs/scaler.pkl")

# Abrimos el archivo en modo lectura binaria ('rb')
with open(path, 'rb') as f:
    # Cargamos los datos del archivo pickle
    scaler = pickle.load(f)

scaler

In [81]:
ew_encoded.columns

AttributeError: 'numpy.ndarray' object has no attribute 'columns'

In [None]:
new_data.shape

(1, 35)

In [None]:
# importar encoding

# remoder --> CITY_OF_BIRTH RESIDENCIAL_CITY RESIDENCIAL_BOROUGH

