# Context

In this notebook we will clean and preprocess the data in order to fix some of its issues and prepare it for further analysis and modelling. Moreover, we will create some new variables to improve our future analysis.

# Load packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import random
import os
from unidecode import unidecode  # Library to remove accents


pd.options.display.max_columns = 100
pd.options.display.max_rows = 100


# Load data

using relative paths

In [None]:
# Name of this notebook; it only anchors the relative paths computed below
filename = "data_cleaning.ipynb" # Current file name
print(f'Current file name: {filename}\n')
print(f'Current absolute path: {os.getcwd()}')

# Resolve the project folders relative to the notebook's location
# (os.path.abspath resolves the name against the current working directory)
NOTEBOOKS_DIR = os.path.split(os.path.abspath(filename))[0]
BASE_DIR = os.path.split(NOTEBOOKS_DIR)[0]
DATA_DIR = os.path.join(BASE_DIR, "Data")
OUTPUT_DIR = os.path.join(DATA_DIR, "output_data")

# Show the resolved folders so a wrong working directory is easy to spot
for dir_label, dir_path in (("BASE_DIR", BASE_DIR), ("DATA_DIR", DATA_DIR), ("OUTPUT_DIR", OUTPUT_DIR)):
    print(f'{dir_label}: {dir_path}')

Current file name: data_preprocessing.ipynb

Current absolute path: c:\Users\jhona\Dropbox\ASPECTOS MAESTRIA\Retos_maestria\Reto_bancow\Analitica_1\notebooks
BASE_DIR: c:\Users\jhona\Dropbox\ASPECTOS MAESTRIA\Retos_maestria\Reto_bancow\Analitica_1
DATA_DIR: c:\Users\jhona\Dropbox\ASPECTOS MAESTRIA\Retos_maestria\Reto_bancow\Analitica_1\Data
OUTPUT_DIR: c:\Users\jhona\Dropbox\ASPECTOS MAESTRIA\Retos_maestria\Reto_bancow\Analitica_1\Data\output_data


In [3]:
# Load the raw audit-findings workbook from the project's Data folder
df_base = pd.read_excel(os.path.join(DATA_DIR, "historico_hallazgos.xlsx"))
# Preview the first two rows
df_base.head(2)

Unnamed: 0,Num,ID_modif,Cliente,Analista,Nombre analista,Tipo crédito,Fecha desem,Visita_analista_crédito,Visita_auditor,Actividad,Monto,Cuota,Plazo,Categoria,Hallazgo,Tipo hallazgo,Riesgo,Calificación cartera,Relaciones Laborales,Oficina,zona,Regional,Validación unico,Tipo analisis,Clasificac analisis,Estado,Año,Tipo
0,1,7252440,Hector Julio Pabon Castano,AIZ,Juan Alejandro Trujillo Garcia,Renovacion,2021-06-11 00:00:00,,2022-09-20 00:00:00,don patacon postobon,15069.268,792.642,36,FRAUDE_Y_PRÁCTICAS_INDEBIDAS,Crédito otorgado a mas de un titular con el mi...,Acto Irregular,1 - Alto,Si,Si,Dosquebradas,12,4,,,,Cancelada,2022,Afecta Estabilidad
1,2,1143936676,Casas Marisol ...,JQQ,Alexander Joaqui Quintero,Renovacion,2021-06-15 00:00:00,2021-06-08 00:00:00,,expendio de comidas preparadas en cafeterias ...,402.0,209.228,626,FRAUDE_Y_PRÁCTICAS_INDEBIDAS,Presunto negocio inexistente,Acto Irregular,1 - Alto,No,Si,Poblado,1,1,,,,Castigo,2022,Afecta Estabilidad


# Overall check

Reviewing the columns, we can gather valuable information about the data and the context, in order to know how to develop an EDA and ML project.  
We can also clean the columns data, in order to have valuable information.

In [4]:
df_base.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3919 entries, 0 to 3918
Data columns (total 28 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Num                      3919 non-null   int64  
 1   ID_modif                 3919 non-null   int64  
 2   Cliente                  3918 non-null   object 
 3   Analista                 3919 non-null   object 
 4   Nombre analista          3919 non-null   object 
 5   Tipo crédito             3717 non-null   object 
 6   Fecha desem              3916 non-null   object 
 7   Visita_analista_crédito  3119 non-null   object 
 8   Visita_auditor           3800 non-null   object 
 9   Actividad                3867 non-null   object 
 10  Monto                    3919 non-null   float64
 11  Cuota                    3919 non-null   object 
 12  Plazo                    3919 non-null   int64  
 13  Categoria                3919 non-null   object 
 14  Hallazgo                

--> In this context, it isn't necessary to keep columns like "Num" & "ID_modif" because they are not relevant for the analysis. So, we **will eliminate them**.


In [5]:
# Work on a copy of the raw frame without the two identifier columns
df = df_base.drop(["Num", "ID_modif"], axis=1)

## Cliente

In [6]:
num_dif = df.Cliente.nunique()
print(f'Number of different clients: {num_dif}')

temp = df.Cliente.value_counts()
temp = temp[temp > 1]
print(f'Number of clients with more than one record: {len(temp)}')
print(f'\nClients with more than one record:')
temp

Number of different clients: 3459
Number of clients with more than one record: 229

Clients with more than one record:


Cliente
Jose Manuel Arroyo Benavides               7
Hector Angel Ochoa Caballero               6
Enith  Garcia Vanegas                      6
Maria Eugenia Velasco Causaya              6
Bairon Alejandro Toro Osuna                6
                                          ..
Carolina  Quintana Valencia                2
Daniela  Serrano Aguirre                   2
Carlos Julio Yaguara Guevara               2
Alvaro  Mafla Ruiz                         2
Marleny Del Socorro Jaramillo Casta?Eda    2
Name: count, Length: 229, dtype: int64

In [7]:
print("Example of one client with more than one record:\n")
df[df["Cliente"]== "Nelly  Munoz Vivas"]

Example of one client with more than one record:



Unnamed: 0,Cliente,Analista,Nombre analista,Tipo crédito,Fecha desem,Visita_analista_crédito,Visita_auditor,Actividad,Monto,Cuota,Plazo,Categoria,Hallazgo,Tipo hallazgo,Riesgo,Calificación cartera,Relaciones Laborales,Oficina,zona,Regional,Validación unico,Tipo analisis,Clasificac analisis,Estado,Año,Tipo
3471,Nelly Munoz Vivas,ROO,Robinson Ocoro Centeno,Preferenci,2024-04-15 00:00:00,2024-04-15 00:00:00,,arrendamientos,5254.384,335.892,36,Otras_Políticas,Formato de vinculación con campos en blanco y/...,Incumple\n P & P,2 - Medio,No,No,Palmira,3,1,,Análisis de carácter,Políticas,,2024,
3472,Nelly Munoz Vivas,ROO,Robinson Ocoro Centeno,Preferenci,2024-04-15 00:00:00,2024-04-15 00:00:00,,arrendamientos,5254.384,335.892,36,Otras_Políticas,Formato de autorización de consulta parcialmen...,Incumple\n P & P,2 - Medio,No,No,Palmira,3,1,,Análisis de carácter,Políticas,,2024,
3624,Nelly Munoz Vivas,ROO,Robinson Ocoro Centeno,Preferenci,2024-04-15 00:00:00,2024-04-15 00:00:00,2024-05-17 10:09:54.230000,arrendamientos,5254.384,335.892,36,Controles_Administrativos_de_Agencia,Deficiencias en el análisis de comité de crédito,Incumple\n P & P,2 - Medio,No,No,Palmira,3,1,,Análisis de carácter,Control,,2024,


--> With the above, we can see that each observation is a different audit analysis or **audit finding**, even for the same client/account.

In [8]:
df.Cliente.sample(10)

2031             Herlinda  Agudelo Cardenas
1104                   Elizabeth  Martinez 
2459           Idilvia Marina Gil Arregoces
2477              Francisco  Liberato Perez
2357                  Alfonso  Arias Torres
3273            Ana Isabel Hernandez Rivera
571     Laura  Alejandra  Ramirez  Mendoza 
437            Juan Sebastian Virguez Guana
3704           Tomas Albeiro Zabaleta Serpa
2566        Elizabeth  Anrea Florez Zuleta 
Name: Cliente, dtype: object

In [9]:
# df[df["Cliente"].str.contains("Simon Jose", na=False)]

--> After some exploration, we will assume that the client column doesn't have typos or grammatical errors that affect the data in any way.



**CONCLUSIONS** 


Taking into consideration the possible uses or goals of this project, we make the following decisions:
1. **This column isn't useful for prediction/inference in an anomaly detection or classification model, but we will keep it for reference and EDA only**.
2. **Each client-credit in the dataset should be unique, so we will eliminate the duplicated observations for the same client-credit combination**. We take into account the possibility that the same client has more than one credit at different moments.

In [10]:
# Keep a single observation per client-credit combination. A credit is identified
# by client, disbursement date, amount, instalment and term, so a client with
# several distinct credits keeps one row per credit. drop_duplicates keeps the
# first occurrence of each combination (pandas default keep="first").
credit_key = ["Cliente", "Fecha desem", "Monto", "Cuota", "Plazo"]
df = df.drop_duplicates(subset=credit_key)

# rename Cliente column to avoid capital letters
# (reassignment instead of inplace=True, matching the other renames in this notebook)
df = df.rename(columns={"Cliente": "cliente"})

In [11]:
df.shape

(3461, 26)

## Analista & Nombre analista

In [12]:
num_dif = df.Analista.nunique()
print(f'Number of different analyst: {num_dif}')

temp = df.Analista.value_counts()
temp = temp[temp > 1]
print(f'Number of analyst with more than one record: {len(temp)}')
print(f'\nAnalysts with more than one record:')
temp

Number of different analyst: 399
Number of analyst with more than one record: 382

Analysts with more than one record:


Analista
GEE    44
IOC    43
FPN    34
IMJ    31
CNZ    29
       ..
CRA     2
IJS     2
GDW     2
MTM     2
KXP     2
Name: count, Length: 382, dtype: int64

In [13]:
df.Analista.sample(5)

2728    IZA
1711    YPN
2635    JOT
343     EQJ
931     HKB
Name: Analista, dtype: object

In [14]:
df.Analista.apply(lambda x: len(x)).value_counts()

Analista
3    3461
Name: count, dtype: int64

--> All the analyst codes have 3 letters.

In [15]:
num_dif = df["Nombre analista"].nunique()
print(f'Number of different analyst"s name: {num_dif}')

temp = df["Nombre analista"].value_counts()
temp = temp[temp > 1]
print(f'Number of analyst"s name with more than one record: {len(temp)}')
print(f"\nAnalysts with more than one record:")
temp

Number of different analyst"s name: 399
Number of analyst"s name with more than one record: 379

Analysts with more than one record:


Nombre analista
Aleatorio                          94
Gilberto Duque Duque               44
Nicolas  Suaza Pulgarin            43
Lindy Yureidy Areiza Osorio        34
Esmeralda  Guayara Medina          31
                                   ..
Edinson Orlando Omana Clavijo       2
Miller Leidy Valencia Giraldo       2
Arleidy Xiomara Guerrero Suarez     2
Cheryjan Stid Velez Gaona           2
Daniel Bustamante                   2
Name: count, Length: 379, dtype: int64

**CONCLUSIONS**

1. **Neither column is valuable for our analysis or modelling phase, but we will keep the analyst code for reference. For that reason, we will eliminate the analyst name.**
2. **We will rename the column Analista**.

In [16]:
# Keep only the analyst code: remove the analyst name and lowercase the code's column name
df = (
    df
    .drop(columns=["Nombre analista"])
    .rename(columns={"Analista": "analista"})
)

## Tipo crédito

In [17]:
num_dif = df["Tipo crédito"].nunique()
print(f'Number of different credit types: {num_dif}')
print(f'Number of nan values: {df["Tipo crédito"].isna().sum()}')

temp = df["Tipo crédito"].value_counts()
print(f"\nCredit types with more than one record:")
temp

Number of different credit types: 5
Number of nan values: 202

Credit types with more than one record:


Tipo crédito
Nuevo           1385
Renovacion       816
Preferenci       469
Preferencial     321
Renovación       268
Name: count, dtype: int64

--> There are typos in the words "Preferencial" and "Renovacion".

**CONCLUSIONS**

1. **We will rename the "Preferenci" values as "Preferencial".**
2. **We will rename the "Renovación" values as "Renovacion"**
3. **We will rename the nan values with "No_especificado"**.
4. **We will rename the column name to "tipo_credito"**
5. We consider this column an important one for further analysis

In [18]:
df = df.rename(columns={"Tipo crédito": "tipo_credito"})

In [19]:
# Label missing credit types explicitly and merge the spelling variants
# ("Renovación" -> "Renovacion", truncated "Preferenci" -> "Preferencial")
credit_type_fixes = {"Renovación": "Renovacion", "Preferenci": "Preferencial"}
df["tipo_credito"] = (
    df["tipo_credito"]
    .fillna("No_especificado")
    .replace(credit_type_fixes)
)
df["tipo_credito"].value_counts()

tipo_credito
Nuevo              1385
Renovacion         1084
Preferencial        790
No_especificado     202
Name: count, dtype: int64

## Fechas: de desembolso, visita analista y visita auditor

In [20]:
df[["Fecha desem","Visita_analista_crédito","Visita_auditor"]].dtypes

Fecha desem                object
Visita_analista_crédito    object
Visita_auditor             object
dtype: object

In [21]:
df[["Fecha desem","Visita_analista_crédito","Visita_auditor"]].sample(10, random_state=1)

Unnamed: 0,Fecha desem,Visita_analista_crédito,Visita_auditor
2431,2023-01-06 00:00:00,2023-01-02 00:00:00,2023-02-16 00:00:00
3342,2024-01-31 00:00:00,2024-01-30 00:00:00,2024-03-08 00:00:00
2107,2022-05-16 00:00:00,N/D,2023-08-16 00:00:00
1608,2022-06-13 00:00:00,2022-06-13 00:00:00,2023-01-13 00:00:00
3488,2024-03-27 00:00:00,,2024-05-07 16:30:00
3905,2024-07-10 00:00:00,2024-07-05 00:00:00,2024-08-16 11:37:49.210000
968,31/03/2022,31/03/2022,00:00:00
2308,2023-07-22 00:00:00,2023-07-18 00:00:00,2023-07-28 08:50:00
1881,2023-03-30 00:00:00,2023-03-29 00:00:00,2023-05-09 00:00:00
3364,2024-02-29 00:00:00,2024-02-22 00:00:00,2024-03-13 00:00:00


--> Above we can see at least 6 different formats in these columns. This is a problem that we need to fix.

In [22]:
df[["Fecha desem","Visita_analista_crédito","Visita_auditor"]].isna().sum()

Fecha desem                  3
Visita_analista_crédito    753
Visita_auditor             111
dtype: int64

In [23]:
df[(df["Visita_auditor"]=="00:00:00") | (df["Visita_auditor"]==datetime.time(0, 0))].shape

(126, 25)

--> We can note that there are some observations with dates like "00:00:00"

**CONCLUSIONS**  
  
For further analysis or feature engineering, we can keep the columns "Fecha desem" and "Visita_analista_crédito".

1. **We will rename the columns to avoid capital letters and blank spaces.**
2. **We will standardize the columns to a date format, handling errors and missing values**.
3. **We will drop the observations with dates like "00:00:00".**
4. **We will eliminate the auditor visit column**, because in a hypothetical modelling phase, we can't know the auditor visit in advance.

In [24]:
# rename columns
df = df.rename(columns={"Visita_analista_crédito": "visita_analista_credito","Fecha desem": "fecha_desembolso"})

# drop observations whose auditor visit is only a midnight time ("00:00:00" as text
# or as a datetime.time). Rows with a missing auditor visit are kept.
# .copy() makes the result an independent frame, so the column assignments later in
# this cell do not raise SettingWithCopyWarning.
midnight_visit = (df["Visita_auditor"] == "00:00:00") | (df["Visita_auditor"] == datetime.time(0, 0))
df = df[~midnight_visit].copy()

# Define a function to handle different dates formats
def convert_to_datetime(value):
    """Parse one raw date cell into a pandas Timestamp.

    Parameters
    ----------
    value : object
        A single cell value: a datetime-like object, a date string such as
        "31/03/2022" (day first), a missing value, or a placeholder such as "N/D".

    Returns
    -------
    pandas.Timestamp or pandas.NaT
        The parsed date, or ``pd.NaT`` when the value is missing, is a known
        placeholder, or cannot be interpreted as a date.
    """
    if pd.isna(value):  # Handle missing values
        return pd.NaT
    # Known "no data" placeholders, matched regardless of case or surrounding spaces
    if isinstance(value, str) and value.strip().upper() in {"N/D", "NAN"}:
        return pd.NaT
    try:
        # dayfirst=True so that dd/mm/yyyy strings are read correctly;
        # errors='coerce' turns unparseable strings into NaT
        return pd.to_datetime(value, dayfirst=True, errors='coerce')
    except Exception:
        # Unsupported value types can still raise despite errors='coerce'. Stay
        # best-effort, but do not swallow KeyboardInterrupt/SystemExit as a bare
        # `except:` would.
        return pd.NaT

# Parse both date columns cell by cell
for date_col in ("fecha_desembolso", "visita_analista_credito"):
    df[date_col] = df[date_col].apply(convert_to_datetime)

# Remove the auditor visit column
df = df.drop(columns=["Visita_auditor"])

In [25]:
df[["fecha_desembolso","visita_analista_credito"]].isna().sum()

fecha_desembolso             4
visita_analista_credito    873
dtype: int64

In [26]:
df[["fecha_desembolso","visita_analista_credito"]].dtypes

fecha_desembolso           datetime64[ns]
visita_analista_credito    datetime64[ns]
dtype: object

In [27]:
df[["fecha_desembolso","visita_analista_credito"]].sample(10, random_state=1)

Unnamed: 0,fecha_desembolso,visita_analista_credito
1930,2023-04-28,NaT
3362,2024-02-16,2024-02-12
1047,2022-05-13,2022-05-10
2129,2023-04-14,NaT
1473,2022-08-20,2022-08-17
657,2022-10-20,2022-10-15
222,2022-09-22,2022-09-22
3328,2024-01-31,2024-01-20
2168,2022-11-21,2022-11-21
204,2022-08-20,2022-08-19


## Actividad

In [28]:
num_dif = df["Actividad"].nunique()
print(f'Number of different activities values: {num_dif}')
print(f'Number of nan values: {df["Actividad"].isna().sum()}')

temp = df["Actividad"].value_counts()
print(f"\nActivities with more than one record:")
temp[0:15]

Number of different activities values: 2273
Number of nan values: 52

Activities with more than one record:


Actividad
venta de mercancia    57
tienda                52
arriendos             48
venta de comidas      29
maestro de obra       24
venta de ropa         21
arrendamientos        18
confeccion            17
Venta De Mercancia    16
construccion          14
venta de comida       14
confecciones          14
venta de leche        14
Tienda                12
mercancia             12
Name: count, dtype: int64

In [29]:
temp.sample(10, random_state=10)

Actividad
producci?n mantecados y panaderia                                          1
ajonjoli                                                                   1
oficial construccion                                                       2
cuidado en casa                                                            1
venta de comidas y empanas - elaboracion de comidas y platos preparados    1
cria y comercio de cerdos                                                  1
fundicion de metal                                                         1
mandarin                                                                   1
restaurante el balcon                                                      1
estetic glam                                                               1
Name: count, dtype: int64

--> We can see a lot of different activities, but we can also see that some values refer to the same thing, for example "Tienda" and "tienda".

--> Several activities have typos and errors. This could complicate the analysis or the next phases.

**CONCLUSIONS**  

The activity variable could be very important for any EDA or modelling, but it has a lot of values, errors and should be very difficult to fix.  
Despite that, we will try to fix the Activity column in order to gather some information in next steps.  

1. **We will rename the variable column.**
2. **We will run some fixes and considerations to the Activity column**.
3. **and we will create a new activity_v2 column to not drop the original one.**

In [30]:
# Function to clean Activity values
def clean_occupation(value):
    """Normalise one raw activity description into a snake_case label.

    Parameters
    ----------
    value : object
        A single cell of the activity column. Non-string values are converted
        with ``str`` before cleaning.

    Returns
    -------
    str
        ``"no_especificado"`` for missing values; otherwise the text without
        accents, lowercased, trimmed, and with every run of whitespace replaced
        by a single underscore (so "Venta  de Ropa" and "venta de ropa" map to
        the same label).
    """
    if pd.isna(value):  # Handle missing values
        return "no_especificado"
    text = unidecode(str(value)).lower()  # Remove accents, then convert to lowercase
    # split() without arguments trims both ends and treats any run of whitespace
    # as one separator, so repeated spaces do not produce "__"
    return "_".join(text.split())

# Lowercase the column name first, then clean its values in place
df = df.rename(columns={"Actividad": "actividad"})
df["actividad"] = df["actividad"].apply(clean_occupation)

## Monto, cuota & plazo

In [31]:
df[["Monto","Cuota","Plazo"]].dtypes

Monto    float64
Cuota     object
Plazo      int64
dtype: object

In [32]:
df[["Monto","Cuota","Plazo"]].isna().sum()

Monto    0
Cuota    0
Plazo    0
dtype: int64

In [33]:
df[(df["Cuota"]==0) | (df["Plazo"]==0) | (df["Monto"]==0)].shape

(12, 24)

--> We note that the "Cuota" variable is an object; also, there aren't any NaN values.  

--> We also note that there are some observations with zero values.

In [34]:
df[["Monto","Cuota","Plazo"]].iloc[0:5]

Unnamed: 0,Monto,Cuota,Plazo
0,15069.268,792.642,36
1,402.0,209.228,626
3,19213.296,991.582,42
4,1055.378,205.648,6
7,2481.136,170.529,24


--> We note that the values of Monto & Cuota are expressed in thousands (i.e. divided by 1000).

**CONCLUSIONS**

1. We will modify the column names to avoid capital letters.
2. **We will cast the Cuota column to float.** To do this, it is necessary to "clean" this column before cast it.
3. **We will multiply the Monto & Cuota by 1000.**
4. **We will drop the observations with a zero value in any of the three columns.**

In [35]:
# rename columns
df = df.rename(columns={"Monto": "monto", "Cuota": "cuota", "Plazo": "plazo"})

# Cuota arrives as text: keep only digits and the decimal point
cuota_text = df["cuota"].astype(str).str.replace(r'[^0-9.]', '', regex=True)

# convert to float. Anything that still cannot be parsed (empty strings, but also
# leftovers such as "1.2.3" or ".") becomes 0, so it is removed by the zero-value
# filter below instead of surviving silently as NaN.
df["cuota"] = pd.to_numeric(cuota_text, errors='coerce').fillna(0)

# Monto and Cuota are stored in thousands: multiply by 1000 to express them in units
df["cuota"] = (df["cuota"] * 1000).round()
df["monto"] = (df["monto"] * 1000).round()

# drop the observations with a zero value in any of the three columns.
df = df[(df["cuota"] != 0) & (df["plazo"] != 0) & (df["monto"] != 0)]

In [36]:
print(df[["monto","cuota","plazo"]].iloc[0:2])

df[["monto","cuota","plazo"]].dtypes


        monto     cuota  plazo
0  15069268.0  792642.0     36
1    402000.0  209228.0    626


monto    float64
cuota    float64
plazo      int64
dtype: object

## Categoria, hallazgo, tipo hallazgo, Riesgo

In [37]:
df[["Categoria","Hallazgo","Tipo hallazgo", "Riesgo"]].dtypes

Categoria        object
Hallazgo         object
Tipo hallazgo    object
Riesgo           object
dtype: object

In [38]:
df[["Categoria","Hallazgo","Tipo hallazgo", "Riesgo"]].isna().sum()

Categoria        0
Hallazgo         1
Tipo hallazgo    1
Riesgo           4
dtype: int64

In [39]:
df[["Categoria","Hallazgo","Tipo hallazgo", "Riesgo"]].sample(5,random_state=2)

Unnamed: 0,Categoria,Hallazgo,Tipo hallazgo,Riesgo
898,ESTABILIDAD,Negocio en sociedad,Incumple P & P,2 - Medio
2137,Flujo_de_Caja,Otros ingresos inexistentes,Acto Irregular,2 - Medio
1158,OTRAS_POLITICAS,Formato de vinculación con campos en blanco y/...,Incumple P & P,2 - Medio
2495,Otras_Políticas,Formato de vinculación con campos en blanco y/...,Incumple\n P & P,2 - Medio
1009,FRAUDE_Y_PRÁCTICAS_INDEBIDAS,Alerta de Irregularidades,Acto Irregular,1 - Alto


--> We can see that each column has different string formats, for example capital letters, blank spaces, patterns like "\n", etc.

Now, we will review each column further

### Categoria

In [40]:
num_dif = df["Categoria"].nunique()
print(f'Number of different Category values: {num_dif}')
print(f'Number of nan values: {df["Categoria"].isna().sum()}')

temp = df["Categoria"].value_counts()
print(f"\nCategory top:")
temp[0:]

Number of different Category values: 30
Number of nan values: 0

Category top:


Categoria
Sin_Hallazgo                            996
Otras_Políticas                         522
OTRAS_POLITICAS                         339
Limitación_en_el_Alcance                208
LIMITACIÓN_EN_EL_ALCANCE                165
ESTABILIDAD                             148
Políticas_Críticas                      121
Otros                                   107
CALIDAD_DE_DATOS                        106
Existencia_y_Continuidad_del_Negocio     87
FRAUDE_Y_PRÁCTICAS_INDEBIDAS             85
Estabilidad                              72
Reputación                               67
Calidad_de_Datos                         57
FLUJO_DE_CAJA                            45
INCUMPLIMIENTO_AL_SAC                    33
Flujo_de_Caja                            30
POLITICAS_CRITICAS                       27
Controles_Gestión_Comercial              23
EXISTENCIA_Y_CONTINUIDAD_DEL_NEGOCIO     22
REPUTACIÓN                               15
Habilidad_Empresarial                    13
Incumplimiento_Al_SAC 

--> We find some values that refer to the same category but are written differently.

### Hallazgo

In [41]:
num_dif = df["Hallazgo"].nunique()
print(f'Number of different Findings values: {num_dif}')
print(f'Number of nan values: {df["Hallazgo"].isna().sum()}')

temp = df["Hallazgo"].value_counts()
print(f"\nFindings top:")
temp[0:]

Number of different Findings values: 88
Number of nan values: 1

Findings top:


Hallazgo
Sin hallazgo                                                                                                                  996
Formato de vinculación con campos en blanco y/o errados                                                                       390
Formato de autorización de consulta parcialmente diligenciado                                                                 273
Diferencias en el arraigo del cliente                                                                                          98
No se logró evidenciar la existencia del cliente y/o negocio                                                                   97
Se evidencio existencia del cliente pero no se pudo confirmar el negocio                                                       94
Se evidenció existencia del cliente pero no se pudo confirmar el negocio                                                       84
Negocio en sociedad                                                              

--> This column is a description of the findings; it works like a comments column. So, it is not necessary to modify it much. We could even ignore this column.

### Tipo hallazgo

In [42]:
num_dif = df["Tipo hallazgo"].nunique()
print(f'Number of different finding types values: {num_dif}')
print(f'Number of nan values: {df["Tipo hallazgo"].isna().sum()}')

temp = df["Tipo hallazgo"].value_counts()
print(f"\nfinding types top:")
temp[0:]

Number of different finding types values: 10
Number of nan values: 1

finding types top:


Tipo hallazgo
Sin Hallazgo                993
Incumple\n P & P            629
Fallas en la Metodología    555
Incumple P & P              434
Limitación                  373
Acto Irregular              215
Otros                       108
Incumple P&P                  9
Sin_Hallazgo                  3
Sin hallazgo                  1
Name: count, dtype: int64

--> "Limitación" means that the audit couldn't be completed, for several reasons. These observations don't add value to the next steps.

### Riesgo

In [43]:
num_dif = df["Riesgo"].nunique()
print(f'Number of different Risk values: {num_dif}')
print(f'Number of nan values: {df["Riesgo"].isna().sum()}')

temp = df["Riesgo"].value_counts()
print(f"\nRisk top:")
temp[0:]

Number of different Risk values: 5
Number of nan values: 4

Risk top:


Riesgo
4 - Otro     1339
2 - Medio    1097
3 - Bajo      436
1 - Alto      327
No aplica     118
Name: count, dtype: int64

--> The "Riesgo" column is the cleanest one. It doesn't need cleaning steps.

Now lets check the "No aplica" level

In [44]:
df[df["Riesgo"]=="No aplica"]["Categoria"].value_counts()

Categoria
LIMITACIÓN_EN_EL_ALCANCE    65
Sin_Hallazgo                53
Name: count, dtype: int64

In [45]:
df[df["Riesgo"]=="No aplica"]["Hallazgo"].value_counts()

Hallazgo
Sin hallazgo                                                                               53
Se evidencio existencia del cliente pero no se pudo confirmar el negocio                   32
No se logro evidenciar la existencia del cliente y/o negocio                               23
Se evidencio existencia del negocio pero no se pudo confirmar la información financiera    10
Name: count, dtype: int64

In [46]:
df[df["Categoria"]=="Sin_Hallazgo"]["Riesgo"].value_counts()

Riesgo
4 - Otro     939
No aplica     53
Name: count, dtype: int64

--> The "No aplica" level contains observations without findings or with limitations that prevented completing the audit.

----

### Conclusion

To clean these columns:

1. We will drop the few NA values in each column
2. **We will avoid capital letters and drop some patterns.**
3. **We will rename some values of the 'Categoria' & 'Tipo hallazgo' columns in order to unify levels.**
4. We will rename the columns to avoid capital letters.
5. **We will drop the observations with "Tipo hallazgo" equal to "Limitación"**.
6. Due to the previous step (#5), we will rename the level "No aplica" of the "Riesgo" column to "4 - Otro"

In [47]:
# Discard rows that are missing any of the four finding-related fields
finding_cols = ["Categoria", "Hallazgo", "Tipo hallazgo", "Riesgo"]
df = df.dropna(subset=finding_cols)

# Function to clean the column values
def clean_string_column(value):
    """Return a trimmed, accent-free, lowercase version of a text cell.

    Missing values are returned as the label "no_especificado".
    """
    if not pd.isna(value):
        # Trim the ends, strip accents, then lowercase
        return unidecode(value.strip()).lower()
    return "no_especificado"  # Missing value

# Apply function to the 3 columns
for text_col in ("Categoria", "Hallazgo", "Tipo hallazgo"):
    df[text_col] = df[text_col].apply(clean_string_column)

# Unify finding-type spellings: underscore variants such as "sin_hallazgo" become
# "sin hallazgo", and every "incumple ... p & p" variant (including the one that
# contains a line break) becomes "incumple_p&p"
df["Tipo hallazgo"] = df["Tipo hallazgo"].str.replace("_", " ", regex=False)
df["Tipo hallazgo"] = df["Tipo hallazgo"].apply(lambda x: "incumple_p&p" if "incumple" in x else x)

# after review, there isnt more typos in "Categoria" column

# rename columns
df = df.rename(
    columns={
        "Tipo hallazgo": "tipo_hallazgo",
        "Riesgo": "riesgo",
        "Categoria": "categoria",
        "Hallazgo": "hallazgo",
    }
)

# drop observations with limitations (audit incomplete); .copy() gives an
# independent frame so the assignment below does not raise SettingWithCopyWarning
df = df[df["tipo_hallazgo"] != "limitacion"].copy()

# Rename values "No aplica" of column Riesgo (literal match, not a regex)
df["riesgo"] = df["riesgo"].str.replace("No aplica", "4 - Otro", regex=False)

In [48]:
df[["categoria","hallazgo","tipo_hallazgo", "riesgo"]].sample(5,random_state=2)

Unnamed: 0,categoria,hallazgo,tipo_hallazgo,riesgo
2018,sin_hallazgo,sin hallazgo,sin hallazgo,4 - Otro
3348,otras_politicas,formato de vinculacion con campos en blanco y/...,incumple_p&p,2 - Medio
640,sin_hallazgo,sin hallazgo,sin hallazgo,4 - Otro
783,calidad_de_datos,informacion errada en datos basicos,fallas en la metodologia,3 - Bajo
3286,existencia_y_continuidad_del_negocio,credito otorgado a mas de un titular con el mi...,acto irregular,1 - Alto


In [49]:
df[["categoria","hallazgo","tipo_hallazgo", "riesgo"]].isna().sum()

categoria        0
hallazgo         0
tipo_hallazgo    0
riesgo           0
dtype: int64

In [50]:
df.tipo_hallazgo.value_counts()

tipo_hallazgo
incumple_p&p                1072
sin hallazgo                 994
fallas en la metodologia     555
acto irregular               215
otros                        108
Name: count, dtype: int64

In [51]:
df.riesgo.value_counts()

riesgo
2 - Medio    1097
4 - Otro     1084
3 - Bajo      436
1 - Alto      327
Name: count, dtype: int64

In [52]:
df.shape

(2944, 24)

## Calificacion cartera y Relaciones laborales

In [53]:
df["Calificación cartera"].value_counts()  

Calificación cartera
No    2725
Si     216
si       3
Name: count, dtype: int64

In [54]:
df["Relaciones Laborales"].value_counts()  

Relaciones Laborales
No    2287
Si     324
Name: count, dtype: int64

In [55]:
df[["Calificación cartera","Relaciones Laborales"]].isna().sum()  

Calificación cartera      0
Relaciones Laborales    333
dtype: int64

--> These columns have a few typos and NaN values

**CONCLUSION**

1. Rename columns to avoid capital letters
2. We will change NA values to "no_especificado"
3. Fix typos in Calificación cartera

In [56]:
# rename columns (reassignment instead of inplace=True, matching the other renames)
df = df.rename(columns={"Calificación cartera": "calificacion_cartera", "Relaciones Laborales": "relaciones_laborales"})

# NA values to "no_especificado"
for flag_col in ("calificacion_cartera", "relaciones_laborales"):
    df[flag_col] = df[flag_col].fillna("no_especificado")

# fix typos: map case/whitespace variants of "si" to "Si" by exact match.
# A substring test ('"si" in x') would also rewrite any other value that merely
# contains the letters "si".
df["calificacion_cartera"] = df["calificacion_cartera"].apply(
    lambda x: "Si" if x.strip().lower() == "si" else x
)


## Oficina

In [57]:
num_dif = df["Oficina "].nunique()
print(f'Number of different "Oficina" values: {num_dif}')
print(f'Number of nan values: {df["Oficina "].isna().sum()}')

temp = df["Oficina "].value_counts()
print(f"\n'Oficina' top:")
temp

Number of different "Oficina" values: 62
Number of nan values: 0

'Oficina' top:


Oficina 
Dosquebradas              177
Ibague                    127
Fonsecca                  125
Tulua                      91
Santa Rosa                 77
Facatativa                 70
Laureano Gómez             60
Independencia              57
Magangue                   55
Pasto                      55
Tulua Calle 25             54
Engativa                   52
Poblado                    51
Sincelejo                  49
Palmira                    49
Sur                        49
Maicao                     49
Pasto Las Lunas            48
Pereira Lagos              48
Chaparral                  48
Cartagena del Chaira       47
Santa Marta                46
El Espinal                 46
San Juan del CesAR         45
Armenia Sur                45
Soledad                    45
Santander de Quilichao     45
La Plata                   45
Jamundí                    45
Corozal                    44
Calima                     44
La Unión                   44
Ipiales                    43
C

--> This column seems to have no major problems, apart from some inconsistent capitalization (e.g. "San Juan del CesAR").

**CONCLUSION**

1. Change column name.
2. avoid capital letters and blank spaces.

In [58]:
# Rename column Oficina (the raw header carries a trailing space)
df = df.rename(columns={"Oficina ": "oficina"})

# Trim, lowercase and replace spaces with underscores in every office name
df["oficina"] = [name.strip().lower().replace(" ", "_") for name in df["oficina"]]

## Zona, regional

In [59]:
num_dif = df["zona "].nunique()
print(f'Number of different "zona" values: {num_dif}')
print(f'Number of nan values: {df["zona "].isna().sum()}')

temp = df["zona "].value_counts()
print(f"\n'zona' top:")
temp

Number of different "zona" values: 17
Number of nan values: 0

'zona' top:


zona 
12    390
11    305
13    304
7     220
3     219
4     216
1     201
15    193
8     148
6     137
5     131
14    121
2      94
10     92
16     81
17     55
9      37
Name: count, dtype: int64

In [60]:
# Cardinality, missing values and frequency table of the Regional column
# (labels previously said "zona", copied from the preceding cell)
num_dif = df["Regional"].nunique()
print(f'Number of different "Regional" values: {num_dif}')
print(f'Number of nan values: {df["Regional"].isna().sum()}')

temp = df["Regional"].value_counts()
print(f"\n'Regional' top:")
temp

Number of different "zona" values: 5
Number of nan values: 0

'zona' top:


Regional
3    823
4    722
1    620
5    498
2    281
Name: count, dtype: int64

--> Both columns also seem to have no problems

**CONCLUSION**

1. We will only change the column names.

In [61]:
# change column names
df = df.rename(columns={"zona ": "zona", "Regional": "regional"})

## Validacion unico

In [62]:
num_dif = df["Validación unico"].nunique()
print(f'Number of different "validacion unico" values: {num_dif}')
print(f'Number of nan values: {df["Validación unico"].isna().sum()}')

temp = df["Validación unico"].value_counts()
print(f"\n'validacion unico' top:")
temp

Number of different "validacion unico" values: 2
Number of nan values: 2732

'validacion unico' top:


Validación unico
ok          184
repetido     28
Name: count, dtype: int64

--> This column looks like a validation column used to flag some "repeated" observations, based on a criterion that we currently don't know.

**CONCLUSION** 
1. For now, we will drop the observations with "repetido" and also drop the column.

In [63]:
# Keep every row not flagged as "repetido" (rows with a missing flag are kept too,
# because NaN != "repetido" evaluates to True), then discard the helper column
not_repeated = df["Validación unico"] != "repetido"
df = df.loc[not_repeated].drop(columns=["Validación unico"])

## Tipo analisis

In [64]:
num_dif = df["Tipo analisis "].nunique()
print(f'Number of different "Tipo analisis" values: {num_dif}')
print(f'Number of nan values: {df["Tipo analisis "].isna().sum()}')

temp = df["Tipo analisis "].value_counts()
print(f"\n'Tipo analisis' top:")
temp

Number of different "Tipo analisis" values: 5
Number of nan values: 1285

'Tipo analisis' top:


Tipo analisis 
Análisis de carácter     970
Sin hallazgo             553
Otros                     56
Analisis Cuantitativo     48
Analisis de carácter       4
Name: count, dtype: int64

--> This column has a lot of NA values and it is not clear how this information could add value to our next steps.

**CONCLUSIONS**
1. We will drop this column due to the lack of valuable information

In [65]:
# Remove the "Tipo analisis " column (note the trailing space in its name).
df = df.drop("Tipo analisis ", axis="columns")

## Clasificacion analisis

In [66]:
# Profile "Clasificac analisis": cardinality, missing values and value frequencies.
clasificac_analisis = df["Clasificac analisis"]
print(f'Number of different "Clasificacion analisis" values: {clasificac_analisis.nunique()}')
print(f'Number of nan values: {clasificac_analisis.isna().sum()}')

print("\n'Clasificacion analisis' top:")
clasificac_analisis.value_counts()

Number of different "Clasificacion analisis" values: 12
Number of nan values: 1285

'Clasificacion analisis' top:


Clasificac analisis
Políticas                      614
Sin hallazgo                   553
Estabilidad                    103
Alerta de fraude                98
Reputación                      74
Calidad de datos                65
Flujo de caja                   48
Otros                           42
Quejas                          13
Habilidad empresarial           12
Control                          8
Seguridad de la información      1
Name: count, dtype: int64

--> This column has a lot of NA values, and we think its data is very similar to the column "Categoria".

**CONCLUSIONS**
1. We will drop this column because its data is similar to another column that has more information.

In [67]:
# Remove the "Clasificac analisis" column.
df = df.drop("Clasificac analisis", axis="columns")

## Estado

In [68]:
# Profile "Estado": cardinality, missing values and value frequencies.
estado_raw = df["Estado"]
print(f'Number of different status values: {estado_raw.nunique()}')
print(f'Number of nan values: {estado_raw.isna().sum()}')

print("\nStatus top:")
estado_raw.value_counts()

Number of different status values: 6
Number of nan values: 608

Status top:


Estado
Normal                            1060
Cancelada                          944
Castigo                            195
Modificado                          91
Cobro Judicial                      13
Reestructurado                       5
Name: count, dtype: int64

--> This column seems to have valuable information about the credit status. It doesn't have typos, but it has several NA values.

**CONCLUSIONS**
1. We will rename the column to avoid capital letters.
2. We will fill NA values with "no_especificado".
3. We will clean the column a little more to remove capital letters and blank spaces at the beginning or end of the string.

In [69]:
# Rename the column to a lowercase name.
df = df.rename(columns={"Estado": "estado"})

# Label missing statuses explicitly, then normalise the text with the vectorised
# .str accessor (strip surrounding whitespace, lowercase) instead of a per-row lambda.
# Unlike the lambda, .str yields NaN rather than raising if a non-string value slips in.
df["estado"] = df["estado"].fillna("no_especificado").str.strip().str.lower()

## Año

In [70]:
# Profile "Año": cardinality, missing values and value frequencies.
year_raw = df["Año"]
print(f'Number of different year values: {year_raw.nunique()}')
print(f'Number of nan values: {year_raw.isna().sum()}')

print("\n Year top:")
year_raw.value_counts()

Number of different year values: 3
Number of nan values: 0

 Year top:


Año
2022    1285
2023    1023
2024     608
Name: count, dtype: int64

--> Seems without problems

**CONCLUSIONS**
1. We will only rename the column


In [71]:
# Give the year column an ASCII, lowercase name.
df = df.rename({"Año": "year"}, axis="columns")

## Tipo

In [72]:
# Profile "Tipo": cardinality, missing values and value frequencies.
tipo_raw = df["Tipo"]
print(f'Number of different type values: {tipo_raw.nunique()}')
print(f'Number of nan values: {tipo_raw.isna().sum()}')

print("\n type top:")
tipo_raw.value_counts()

Number of different type values: 3
Number of nan values: 608

 type top:


Tipo
Otros Hallazgos       1431
Sin Hallazgo           735
Afecta Estabilidad     142
Name: count, dtype: int64

--> This column seems to be free of typos but has several NA values. Right now, it is not clear what information this column holds, but we will keep it for now.

**CONCLUSIONS**
1. We will rename the column.
2. We will fill the NA values with "no_especificado"
3. We will avoid capital letters and blank spaces

In [73]:
# Rename the column to a lowercase name.
df = df.rename(columns={"Tipo": "tipo"})

# Label missing values explicitly, then normalise the text with the vectorised
# .str accessor (strip surrounding whitespace, lowercase) instead of a per-row lambda.
# Unlike the lambda, .str yields NaN rather than raising if a non-string value slips in.
df["tipo"] = df["tipo"].fillna("no_especificado").str.strip().str.lower()

# List of data cleaning steps

# Create new variables

## New objective variables

In [74]:
# Encode the "riesgo" labels numerically.
# Series.map with an explicit dictionary is used instead of Series.replace:
# replacing strings by integers relies on an implicit object -> int downcast,
# which recent pandas releases flag with a FutureWarning.

# Label encoding: each risk label becomes the integer it starts with.
RIESGO_TO_INT = {
    "1 - Alto": 1,
    "2 - Medio": 2,
    "3 - Bajo": 3,
    "4 - Otro": 4,
}

# Binary encoding: 1 for Alto/Medio/Bajo, 0 for "4 - Otro".
RIESGO_TO_BIN = {
    "1 - Alto": 1,
    "2 - Medio": 1,
    "3 - Bajo": 1,
    "4 - Otro": 0,
}

df["riesgo_int"] = df["riesgo"].map(RIESGO_TO_INT)
df["riesgo_bin"] = df["riesgo"].map(RIESGO_TO_BIN)

# map() turns any label missing from the dictionaries into NaN; fail loudly here
# rather than letting an unexpected label leak into the encoded columns.
unmapped_labels = df.loc[df["riesgo_int"].isna(), "riesgo"].unique()
assert len(unmapped_labels) == 0, f'Unexpected "riesgo" labels: {unmapped_labels}'

print(f'Label encoding "riesgo":\n{df["riesgo_int"].value_counts()}')
print("--")
print(f'Binomial encodign "riesgo":\n{df["riesgo_bin"].value_counts()}')

Label encoding "riesgo":
riesgo_int
4    1084
2    1084
3     423
1     325
Name: count, dtype: int64
--
Binomial encodign "riesgo":
riesgo_bin
1    1832
0    1084
Name: count, dtype: int64


  df["riesgo_int"] = df["riesgo"].replace({
  df["riesgo_bin"] = df["riesgo"].replace({


## New occupation variable

In [75]:
# Ordered keyword rules used to group raw activities into macro-activities.
# Order matters: the first rule with a matching fragment wins
# (e.g. "ambulant" is checked before the generic "venta" fragments).
# Fragments containing "?" match activity names whose accented character
# shows up as a literal "?" in the data (e.g. "peinados_y_u?as").
_OCCUPATION_RULES = (
    ("no_especificado", ("no_especificado",)),
    ("comercio_ambulante", ("ambulant",)),
    (
        "agricultura_ganaderia_y_afines",
        (
            "agrico", "agricultura", "agro", "agricultor", "culti", "ganad", "cerdo",
            "pollo", "vaca", "animal", "cr?", "cria", "platano", "banano",
        ),
    ),
    (
        "sector_alimenticio",
        (
            "comida", "restau", "alimen", "condimento", "cafe", "caf?", "frit",
            "plato", "frut", "fruv", "pan", "helad", "arepa", "empana", "piz",
            "leche", "almuerz", "carnic", "salsa", "lacte",
        ),
    ),
    ("arriendos_alquiler_e_inmobiliarios", ("arriendo", "arren", "inmobi", "alquil")),
    ("construccion_obras_y_afines", ("obra", "const", "interior", "acabado")),
    ("confeccion_y_afines", ("confec", "modist", "sastr", "costur", "prenda")),
    (
        "comercios_varios_y_detallistas",
        (
            "miscel", "papel", "boutique", "floriste", "flores", "detall",
            "sorpre", "perfum", "artesani", "variedades",
        ),
    ),
    ("comercio_y_ventas_general", ("venta", "vta", "comercio", "mercanc", "comerci")),
    (
        "tiendas_almacenes_y_ferreterias",
        ("tienda", "almac", "supermer", "abarro", "mercado_", "ferret"),
    ),
    (
        "oficios_tecnicos_y_manuales",
        (
            "mecan", "autom", "metal", "tecnico", "t?cni", "fabricac", "electri",
            "soldad", "pint", "laton", "taller", "manteni", "reparaci", "arregl",
            "ebanist", "herr", "plome", "artesan", "mueble", "manualida",
            "fundici", "guada", "llant", "vulcaniz",
        ),
    ),
    ("lavaderos_parqueaderos_y_afines", ("parquead", "parki", "lavade", "carwash")),
    (
        "transporte_vehiculos_y_afines",
        ("transp", "trasp", "taxi", "bus", "vehicu", "moto", "carga"),
    ),
    (
        "belleza_y_estetica",
        (
            "manicur", "peluq", "bell", "unas", "u?as", "nail", "spa", "pein",
            "estili", "maquill", "barber", "masaj", "depil", "esteti", "cosmet",
        ),
    ),
    (
        "salud_y_afines",
        ("enferm", "medic", "salud", "drog", "farm", "hospi", "dental", "odonto"),
    ),
    (
        "servicios_educativos",
        ("educ", "escol", "refuer", "univ", "coleg", "clase", "docen"),
    ),
    ("servicios_de_limpieza", ("aseo", "limpieza", "aseador")),
)


def combine_occupation(value):
    """Map a raw activity string to its macro-activity label.

    The rules in ``_OCCUPATION_RULES`` are scanned in order and the label of
    the first rule with a fragment contained in ``value`` is returned.

    Parameters
    ----------
    value : str
        Activity description to classify.

    Returns
    -------
    str
        The macro-activity label, or "otros_servicios_y_negocios" when no
        fragment matches.
    """
    for macro_activity, fragments in _OCCUPATION_RULES:
        for fragment in fragments:
            if fragment in value:
                return macro_activity
    return "otros_servicios_y_negocios"


# Derive the macro-activity column from the cleaned activity names.
df["actividad_v2"] = df["actividad"].map(combine_occupation)

# Debug aid: list the raw activities containing any of the general-commerce keywords
# pd.Series([x for x in df["actividad"] if any(char in x for char in ["venta", "vta", "comercio","mercancia","comerci"])]).unique()

In [76]:
# Profile the derived "actividad_v2" column: cardinality, missing values and frequencies.
actividad_v2 = df["actividad_v2"]
print(f'Number of different activities (Version2) values: {actividad_v2.nunique()}')
print(f'Number of nan values: {actividad_v2.isna().sum()}')

print("\nActivities (Version2) with more than one record:")
actividad_v2.value_counts()

Number of different activities (Version2) values: 18
Number of nan values: 0

Activities (Version2) with more than one record:


actividad_v2
comercio_y_ventas_general             593
otros_servicios_y_negocios            489
sector_alimenticio                    409
tiendas_almacenes_y_ferreterias       202
oficios_tecnicos_y_manuales           188
agricultura_ganaderia_y_afines        185
belleza_y_estetica                    158
confeccion_y_afines                   140
arriendos_alquiler_e_inmobiliarios    126
comercios_varios_y_detallistas        117
construccion_obras_y_afines            99
transporte_vehiculos_y_afines          65
no_especificado                        51
salud_y_afines                         42
servicios_educativos                   17
comercio_ambulante                     15
servicios_de_limpieza                  11
lavaderos_parqueaderos_y_afines         9
Name: count, dtype: int64

In [77]:
# Display the frame to inspect the new riesgo_int, riesgo_bin and actividad_v2 columns.
df

Unnamed: 0,cliente,analista,tipo_credito,fecha_desembolso,visita_analista_credito,actividad,monto,cuota,plazo,categoria,hallazgo,tipo_hallazgo,riesgo,calificacion_cartera,relaciones_laborales,oficina,zona,regional,estado,year,tipo,riesgo_int,riesgo_bin,actividad_v2
0,Hector Julio Pabon Castano,AIZ,Renovacion,2021-06-11,NaT,don_patacon_postobon,15069268.0,792642.0,36,fraude_y_practicas_indebidas,credito otorgado a mas de un titular con el mi...,acto irregular,1 - Alto,Si,Si,dosquebradas,12,4,cancelada,2022,afecta estabilidad,1,1,otros_servicios_y_negocios
1,Casas Marisol ...,JQQ,Renovacion,2021-06-15,2021-06-08,expendio_de_comidas_preparadas_en_cafeterias,402000.0,209228.0,626,fraude_y_practicas_indebidas,presunto negocio inexistente,acto irregular,1 - Alto,No,Si,poblado,1,1,castigo,2022,afecta estabilidad,1,1,sector_alimenticio
3,Jennifer Lopez Perez,IOC,Nuevo,2022-05-03,NaT,drogueria,19213296.0,991582.0,42,fraude_y_practicas_indebidas,negocio no es del solicitante,acto irregular,1 - Alto,Si,Si,dosquebradas,12,4,modificado,2022,afecta estabilidad,1,1,salud_y_afines
4,Laura Marcela Valdes Zapata,CZL,Renovacion,2022-03-11,2022-03-05,manualidades_artes_plasticas,1055378.0,205648.0,6,fraude_y_practicas_indebidas,credito otorgado a mas de un titular con el mi...,acto irregular,1 - Alto,Si,no_especificado,tulua,3,1,cancelada,2022,afecta estabilidad,1,1,oficios_tecnicos_y_manuales
7,Martha Cecilia Veru,AQT,Preferencial,2022-07-23,2022-07-18,venta_de_mercancia,2481136.0,170529.0,24,fraude_y_practicas_indebidas,presunto negocio inexistente,acto irregular,1 - Alto,Si,Si,independencia,1,1,normal,2022,afecta estabilidad,1,1,comercio_y_ventas_general
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3909,Jhon Alexander Valencia Rodriguez,MFP,Nuevo,2024-07-13,2024-07-11,venta_de_banano_verde,400000.0,87229.0,8,otras_politicas,formato de autorizacion de consulta parcialmen...,incumple_p&p,2 - Medio,No,No,buenaventura,4,1,no_especificado,2024,no_especificado,2,1,agricultura_ganaderia_y_afines
3912,Uriel Alejandro Sepulveda Morales,JPR,Nuevo,2024-07-12,2024-07-12,venta_de_jugos,500000.0,108938.0,8,otras_politicas,formato de vinculacion con campos en blanco y/...,incumple_p&p,2 - Medio,No,No,buenaventura,4,1,no_especificado,2024,no_especificado,2,1,comercio_y_ventas_general
3913,Yasneli Rodallega Mondragon,ASB,Nuevo,2024-07-15,2024-07-11,peinados_y_u?as,2796256.0,236302.0,24,sin_hallazgo,sin hallazgo,sin hallazgo,4 - Otro,No,No,buenaventura,4,1,no_especificado,2024,no_especificado,4,0,belleza_y_estetica
3914,Marcela Alexandra Torres Montano,OFE,Renovacion,2024-07-18,2024-07-08,venta_de_productos_por_catalogo,2861720.0,207570.0,30,otras_politicas,formato de vinculacion con campos en blanco y/...,incumple_p&p,2 - Medio,No,No,buenaventura,4,1,no_especificado,2024,no_especificado,2,1,comercio_y_ventas_general


# Final result

In [78]:
# Summarise the resulting columns: non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2916 entries, 0 to 3918
Data columns (total 24 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   cliente                  2915 non-null   object        
 1   analista                 2916 non-null   object        
 2   tipo_credito             2916 non-null   object        
 3   fecha_desembolso         2913 non-null   datetime64[ns]
 4   visita_analista_credito  2127 non-null   datetime64[ns]
 5   actividad                2916 non-null   object        
 6   monto                    2916 non-null   float64       
 7   cuota                    2916 non-null   float64       
 8   plazo                    2916 non-null   int64         
 9   categoria                2916 non-null   object        
 10  hallazgo                 2916 non-null   object        
 11  tipo_hallazgo            2916 non-null   object        
 12  riesgo                   2916 non-null 

In [79]:
# re order the columns of the df
df = df[["cliente", "analista", "tipo_credito", "actividad", "actividad_v2", "monto", "cuota", "plazo", "oficina", "zona", "regional", "fecha_desembolso", "visita_analista_credito",
         "calificacion_cartera", "relaciones_laborales", "estado", "year", "tipo",
       "categoria", "hallazgo", "tipo_hallazgo", "riesgo","riesgo_int","riesgo_bin"]]

In [80]:
# Show a random sample of rows; random_state is fixed so that re-running the
# notebook displays the same rows.
df.sample(5, random_state=42)

Unnamed: 0,cliente,analista,tipo_credito,actividad,actividad_v2,monto,cuota,plazo,oficina,zona,regional,fecha_desembolso,visita_analista_credito,calificacion_cartera,relaciones_laborales,estado,year,tipo,categoria,hallazgo,tipo_hallazgo,riesgo,riesgo_int,riesgo_bin
958,Dora Caicedo Perlaza,BHU,Nuevo,flor_del_mar,otros_servicios_y_negocios,1187350.0,227158.0,6,tulua,3,1,2022-05-05,2022-05-05,Si,no_especificado,castigo,2022,otros hallazgos,existencia_y_continuidad_del_negocio,negocio inexistente hechos sobrevinientes,fallas en la metodologia,3 - Bajo,3,1
3077,Nancy Trochez Trochez,ISY,Preferencial,distribucion_y_cultivo_de_cafe,agricultura_ganaderia_y_afines,30994080.0,1204510.0,48,jamundí,13,3,2023-07-07,NaT,Si,Si,no_especificado,2024,no_especificado,existencia_y_continuidad_del_negocio,credito otorgado a mas de un titular con el mi...,acto irregular,1 - Alto,1,1
1899,Daniel Alejandro Barrientos Correa,EJV,Nuevo,conercializadora_de_productos_alimenticios,sector_alimenticio,3478880.0,536980.0,8,armenia_sur,12,4,2023-04-12,2023-04-10,No,No,cancelada,2023,sin hallazgo,sin_hallazgo,sin hallazgo,sin hallazgo,4 - Otro,4,0
1367,Martha Elvia Perez Wilchez,IRI,Renovacion,venta_de_fritos,sector_alimenticio,4968375.0,638685.0,15,santa_marta,13,5,2022-07-28,2022-07-22,No,No,castigo,2022,otros hallazgos,otras_politicas,no cumple con los requisitos propios del credito,incumple_p&p,3 - Bajo,3,1
2396,Gloria Amparo Diaz Osorio,BGZ,Renovacion,venta_de_lociones_y_ropa,comercio_y_ventas_general,2245900.0,238326.0,18,la_unión,16,1,2022-12-06,2022-12-05,No,No,normal,2023,otros hallazgos,otras_politicas,formato de autorizacion de consulta parcialmen...,incumple_p&p,2 - Medio,2,1


# Export data

In [None]:
# Export the cleaned frame to OUTPUT_DIR.
# Disabled by default so that "Run All" never overwrites an existing file;
# set EXPORT_CLEAN_DATA = True only when a fresh export is needed.
EXPORT_CLEAN_DATA = False

if EXPORT_CLEAN_DATA:
    OUTPUT_FILE = os.path.join(OUTPUT_DIR, "hallazgos_clean.xlsx")
    df.to_excel(OUTPUT_FILE)