# Load packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import random
import os
from unidecode import unidecode  # Library to remove accents


pd.options.display.max_columns = 100
pd.options.display.max_rows = 100


# Load data

using relative paths

In [2]:
filename = "cleaning_short.ipynb" # Current file name
print(f'Current file name: {filename}\n')
print(f'Current absolute path: {os.getcwd()}')

# Specify the paths, relative to the current file
NOTEBOOKS_DIR = os.path.dirname(os.path.abspath(filename))
BASE_DIR = os.path.dirname(NOTEBOOKS_DIR)
DATA_DIR = os.path.join(BASE_DIR, "Data")
OUTPUT_DIR = os.path.join(DATA_DIR, "output_data")

print(f'BASE_DIR: {BASE_DIR}')
print(f'DATA_DIR: {DATA_DIR}')
print(f'OUTPUT_DIR: {OUTPUT_DIR}')

Current file name: cleaning_short.ipynb

Current absolute path: c:\Users\jhona\Dropbox\ASPECTOS MAESTRIA\Retos_maestria\Reto_bancow\Analitica_1\otros_archivos
BASE_DIR: c:\Users\jhona\Dropbox\ASPECTOS MAESTRIA\Retos_maestria\Reto_bancow\Analitica_1
DATA_DIR: c:\Users\jhona\Dropbox\ASPECTOS MAESTRIA\Retos_maestria\Reto_bancow\Analitica_1\Data
OUTPUT_DIR: c:\Users\jhona\Dropbox\ASPECTOS MAESTRIA\Retos_maestria\Reto_bancow\Analitica_1\Data\output_data


In [3]:
df_base = pd.read_excel(os.path.join(DATA_DIR, "historico_hallazgos.xlsx"))
df_base.head(2)

Unnamed: 0,Num,ID_modif,Cliente,Analista,Nombre analista,Tipo crédito,Fecha desem,Visita_analista_crédito,Visita_auditor,Actividad,Monto,Cuota,Plazo,Categoria,Hallazgo,Tipo hallazgo,Riesgo,Calificación cartera,Relaciones Laborales,Oficina,zona,Regional,Validación unico,Tipo analisis,Clasificac analisis,Estado,Año,Tipo
0,1,7252440,Hector Julio Pabon Castano,AIZ,Juan Alejandro Trujillo Garcia,Renovacion,2021-06-11 00:00:00,,2022-09-20 00:00:00,don patacon postobon,15069.268,792.642,36,FRAUDE_Y_PRÁCTICAS_INDEBIDAS,Crédito otorgado a mas de un titular con el mi...,Acto Irregular,1 - Alto,Si,Si,Dosquebradas,12,4,,,,Cancelada,2022,Afecta Estabilidad
1,2,1143936676,Casas Marisol ...,JQQ,Alexander Joaqui Quintero,Renovacion,2021-06-15 00:00:00,2021-06-08 00:00:00,,expendio de comidas preparadas en cafeterias ...,402.0,209.228,626,FRAUDE_Y_PRÁCTICAS_INDEBIDAS,Presunto negocio inexistente,Acto Irregular,1 - Alto,No,Si,Poblado,1,1,,,,Castigo,2022,Afecta Estabilidad


# Overall check

Reviewing the columns, we can gather valuable information about the data and the context, in order to know how to develop an EDA and ML project.  
We can also clean the columns data, in order to have valuable information.

--> First, we will do an overral check of the data. We will check the data type of each column, the number of missing values, the number of unique values, and some random data.
This review will give us some advices or guide of which columns to modify/clean.

--> Further check column per column, is done in the notebook `notebooks/data_cleaning.ipynb`

In [4]:
pd.DataFrame({
    'objetc_type': df_base.dtypes.apply(lambda x: x.name).tolist(),
    'num_unique_values': df_base.nunique().tolist(),
    'num_nan_values': df_base.isna().sum().tolist(),
    'random_data': [df_base[col].sample(5).tolist() for col in df_base.columns]
}, index=df_base.columns)

Unnamed: 0,objetc_type,num_unique_values,num_nan_values,random_data
Num,int64,3919,0,"[1140, 3867, 3621, 2543, 2881]"
ID_modif,int64,3424,0,"[1151447243, 1098710400, 18417513, 79591442, 6..."
Cliente,object,3459,1,"[Wendys Johana Pacheco De La Rosa, Restrepo Mo..."
Analista,object,406,0,"[KVN, DTG, KAN, JKR, KBZ]"
Nombre analista,object,403,0,"[Rigoberto Figueroa Molina, Jorge Eduardo Her..."
Tipo crédito,object,5,202,"[Preferenci, Nuevo, Nuevo, Nuevo, Preferenci]"
Fecha desem,object,804,3,"[2024-03-30 00:00:00, 2023-06-05 00:00:00, 16/..."
Visita_analista_crédito,object,690,800,"[2022-07-21 00:00:00, 2023-07-05 00:00:00, 202..."
Visita_auditor,object,881,119,"[2023-12-12 00:00:00, 2024-01-18 00:00:00, 202..."
Actividad,object,2364,52,"[venta de leche, confecciones , chatarreria, e..."


# Data cleaning steps

--> Next, **we will make a list of actions to pre-process and clean up the columns and observations**, in order to get more out of the following analysis and project phases.

--> In addition, **we will be leaving some comments on relevant information and knowledge of the data** from this cleaning and basic review stage.

In [5]:
# 1. columns like "Num" & "ID_modif" are not relevant for the analysis. so, 
# We will eliminate it.

df = df_base.drop(columns=["Num", "ID_modif"])

In [6]:
print(f'Number of clients with more than one record: {(df.Cliente.value_counts()>1).sum()}')

Number of clients with more than one record: 229


--> Each observation is a different audit analysis or **audit finding**, even for the same client/account.

**The clients-credit in the dataset should be unique, so we will eliminate the duplicated observations for the same Client-credit combination**. We take in consideration the possibility of a same client to have more than one credit in different moments.

In [7]:
# 2. drop duplicates to have only one observation per client-credit
df = df.drop_duplicates(subset=["Cliente","Fecha desem", "Monto", "Cuota", "Plazo"]) 

# 3. rename Cliente column to avoid capital letters
df.rename(columns={"Cliente":"cliente"}, inplace=True)

In [8]:
print(df.Analista.sample(5))
print()
print(df.Analista.apply(lambda x: len(x)).value_counts())

3148    MVQ
1156    EDN
1999    HBM
2485    KNX
1674    KTC
Name: Analista, dtype: object

Analista
3    3461
Name: count, dtype: int64


--> All the analyst codes have 3 letters.

In [9]:
# 4. Drop the column Nombre analista. This column is not necessary for the analysis.
df = df.drop(columns=["Nombre analista"])

# 5. rename analista column to avoid capital letters
df = df.rename(columns={"Analista": "analista"})

-- > For the column "Tipo de credito" we found some typos, i.e. with the world "Preferencial" and "Renovacion". 
- **We will rename the column name to "tipo_credito"**
- **We will rename the "Preferenci" values as "Preferencial".**
- **We will rename the "Renovación" values as "Renovacion"**
- **We will rename the nan values with "No_especificado"**.


In [10]:
# 6. rename 'tipo credito' column to avoid capital letters
df = df.rename(columns={"Tipo crédito": "tipo_credito"})

# 7. fill NaN values of tipo_credito column
df["tipo_credito"] = df["tipo_credito"].fillna("No_especificado")

# 8. Correct typos and unify levels
df["tipo_credito"] = np.where(df["tipo_credito"] == "Renovación", "Renovacion", df["tipo_credito"])
df["tipo_credito"] = np.where(df["tipo_credito"] == "Preferenci", "Preferencial", df["tipo_credito"])


---

In [11]:
df[["Fecha desem","Visita_analista_crédito","Visita_auditor"]].sample(10, random_state=1)

Unnamed: 0,Fecha desem,Visita_analista_crédito,Visita_auditor
2431,2023-01-06 00:00:00,2023-01-02 00:00:00,2023-02-16 00:00:00
3342,2024-01-31 00:00:00,2024-01-30 00:00:00,2024-03-08 00:00:00
2107,2022-05-16 00:00:00,N/D,2023-08-16 00:00:00
1608,2022-06-13 00:00:00,2022-06-13 00:00:00,2023-01-13 00:00:00
3488,2024-03-27 00:00:00,,2024-05-07 16:30:00
3905,2024-07-10 00:00:00,2024-07-05 00:00:00,2024-08-16 11:37:49.210000
968,31/03/2022,31/03/2022,00:00:00
2308,2023-07-22 00:00:00,2023-07-18 00:00:00,2023-07-28 08:50:00
1881,2023-03-30 00:00:00,2023-03-29 00:00:00,2023-05-09 00:00:00
3364,2024-02-29 00:00:00,2024-02-22 00:00:00,2024-03-13 00:00:00


--> Above we can see at least 6 different formats for the "dates" type columns. This is an unwanted problem.

For further analisys or feature engineering, we will keep the columns "Fecha desem" and "Visita_analista_credito". but in addition, we will do the next steps:

- **We will rename the columns to avoid capital letters and blank spaces.**
- **We will standarize the columns to a date format, avoiding errors and missing values**.
- **We will drop the observations with dates like "00:00:00".**
- **We will eliminate the auditor visit column**, because in an hipotetic modelling phase, we cant know the auditor visit in advance.

In [12]:
# 9. rename dates columns
df = df.rename(columns={"Visita_analista_crédito": "visita_analista_credito","Fecha desem": "fecha_desembolso"})

# 10. drop observations with auditor visit dates similar to 00:00:00
df = df[(df["Visita_auditor"]!="00:00:00") & (df["Visita_auditor"]!=datetime.time(0, 0))]

# Define a function to handle different dates formats
def convert_to_datetime(value):
    if pd.isna(value) or value in ["N/D", "NAN"]:  # Handle missing or invalid values
        return pd.NaT
    try:
        return pd.to_datetime(value, dayfirst=True, errors='coerce')  # Convert to datetime and handle errors
    except:
        return pd.NaT  # Return NaT if it fails

# 11. Apply function to handle different dates formats to both columns
df["fecha_desembolso"] = df["fecha_desembolso"].apply(convert_to_datetime)
df["visita_analista_credito"] = df["visita_analista_credito"].apply(convert_to_datetime)

# 12. Drop column auditor visit
df = df.drop(columns=["Visita_auditor"])

---

Now, about the "Actividad" column:

--> We can see a lot of different activities, but also we can see that there are some values that are referring to the same, for example "Tienda" and "tienda".

--> Several activities have typos and errors. This could difficult the analysis or next phases.

The activity variable could be very important for any EDA or modelling, but it has a lot of values, errors and should be very difficult to fix.  
Despite that, we will try to fix the Activity column in order to gather some information in next steps:  

- **We will rename the variable column.**
- **We will run some fixes and considerations to the Activity column**.
- **and we will create a new activity_v2 column to not drop the original one.**

In [13]:
# Function to clean Activity values
def clean_occupation(value):
    if pd.isna(value):  # Handle missing values
        return "no_especificado"
    value = value.strip()  # Remove spaces at the beginning and end
    value = unidecode(value)  # Remove accents
    value = value.lower()  # Convert to lowercase
    value = value.replace(" ", "_")  # Replace spaces with underscores
    return value

# 13. Clean activity values
df["Actividad"] = df["Actividad"].apply(clean_occupation)

# 14. rename column Actividad to avoid capital letters
df = df.rename(columns={"Actividad": "actividad"})

In [14]:
### 15. Function to combine some Activity values to create macro-activities

def combine_occupation(value):
    if any(char in value for char in ["no_especificado"]):
        return "no_especificado"
    if any(char in value for char in ["ambulant"]):
        return "comercio_ambulante"
    elif any(
        char in value
        for char in [
            "agrico",
            "agricultura",
            "agro",
            "agricultor",
            "culti",
            "ganad",
            "cerdo",
            "pollo",
            "vaca",
            "animal",
            "cr?",
            "cria",
            "platano",
            "banano",
        ]
    ):
        return "agricultura_ganaderia_y_afines"
    elif any(
        char in value
        for char in [
            "comida",
            "restau",
            "alimen",
            "condimento",
            "cafe",
            "caf?",
            "frit",
            "plato",
            "frut",
            "fruv",
            "pan",
            "helad",
            "arepa",
            "empana",
            "piz",
            "leche",
            "almuerz",
            "carnic",
            "salsa",
            "lacte",
        ]
    ):
        return "sector_alimenticio"
    elif any(char in value for char in ["arriendo", "arren", "inmobi", "alquil"]):
        return "arriendos_alquiler_e_inmobiliarios"
    elif any(char in value for char in ["obra", "const", "interior","acabado"]):
        return "construccion_obras_y_afines"
    elif any(
        char in value for char in ["confec", "modist", "sastr", "costur", "prenda"]
    ):
        return "confeccion_y_afines"
    elif any(char in value for char in ["miscel","papel","boutique","floriste","flores","detall","sorpre","perfum","artesani","variedades"]):
        return "comercios_varios_y_detallistas"
    elif any(
        char in value for char in ["venta", "vta", "comercio", "mercanc", "comerci"]
    ):
        return "comercio_y_ventas_general"
    elif any(char in value for char in ["tienda", "almac", "supermer", "abarro","mercado_","ferret"]):
        return "tiendas_almacenes_y_ferreterias"
    elif any(
        char in value
        for char in [
            "mecan",
            "autom",
            "metal",
            "tecnico",
            "t?cni",
            "fabricac",
            "electri",
            "soldad",
            "pint",
            "laton",
            "taller",
            "manteni",
            "reparaci",
            "arregl",
            "ebanist",
            "herr",
            "plome",
            "artesan",
            "mueble",
            "manualida",
            "fundici",
            "guada",
            "llant",
            "vulcaniz",
        ]
    ):
        return "oficios_tecnicos_y_manuales"
    elif any(
        char in value for char in ["parquead", "parki", "lavade", "carwash"]
    ):
        return "lavaderos_parqueaderos_y_afines"
    elif any(
        char in value for char in ["transp", "trasp", "taxi", "bus", "vehicu", "moto","carga"]
    ):
        return "transporte_vehiculos_y_afines"
    elif any(
        char in value
        for char in [
            "manicur",
            "peluq",
            "bell",
            "unas",
            "u?as",
            "nail",
            "spa",
            "pein",
            "estili",
            "maquill",
            "barber",
            "masaj",
            "depil",
            "esteti",
            "cosmet",
        ]
    ):
        return "belleza_y_estetica"
    elif any(
        char in value for char in ["enferm", "medic", "salud", "drog", "farm", "hospi","dental","odonto"]
    ):
        return "salud_y_afines"
    elif any(
        char in value
        for char in ["educ", "escol", "refuer", "univ", "coleg", "clase", "docen"]
    ):
        return "servicios_educativos"
    elif any(char in value for char in ["aseo", "limpieza", "aseador"]):
        return "servicios_de_limpieza"
    else:
        return "otros_servicios_y_negocios"


df["actividad_v2"] = df["actividad"].apply(combine_occupation)

---

In [15]:
print(df[(df["Cuota"]==0) | (df["Plazo"]==0) | (df["Monto"]==0)].shape)
print()
print(df[["Monto","Cuota","Plazo"]].iloc[0:3])

(12, 25)

       Monto    Cuota  Plazo
0  15069.268  792.642     36
1    402.000  209.228    626
3  19213.296  991.582     42


Following, about the columns Monto, Cuota & Plazo:

--> We note that the "Cuota" variable is an object, also there isnt nan values. We also note that there are some observations with zero.  
--> We note that the values of Monto & Cuota are divided by 1000

The cleaning steps will be:  

- We will modify the column names to avoid capital letters.
- **We will cast the Cuota column to float.** To do this, it is necessary to "clean" this column before cast it.
- **We will multiply the Monto & Cuota by 1000.**
- **We will drop the observations with a zero value in any of the three columns.**

In [16]:
# 16. rename Monto, Cuota & Plazo columns
df = df.rename(columns={"Monto": "monto", "Cuota": "cuota", "Plazo": "plazo"})

# 17. eliminate characters that are not numbers in Cuota
df["cuota"] = df["cuota"].astype(str).str.replace(r'[^0-9.]', '', regex=True)

# 18. turn empty strings into 0 in cuota column
df["cuota"] = np.where(df["cuota"] == "", "0", df["cuota"])

# 19. convert to float the column cuota
df["cuota"] = pd.to_numeric(df["cuota"], errors='coerce')

# 20. multiply by 1000 the cuota & monto columns to convert to thousands
df["cuota"] = round(df["cuota"] * 1000)
df["monto"] = round(df["monto"] * 1000)

# 21. drop the observations with a zero value in any of the three columns.
df = df[(df["cuota"] != 0) & (df["plazo"] != 0) & (df["monto"] != 0)]

---

In [17]:
df[["Categoria","Hallazgo","Tipo hallazgo", "Riesgo"]].sample(5,random_state=2)

Unnamed: 0,Categoria,Hallazgo,Tipo hallazgo,Riesgo
898,ESTABILIDAD,Negocio en sociedad,Incumple P & P,2 - Medio
2137,Flujo_de_Caja,Otros ingresos inexistentes,Acto Irregular,2 - Medio
1158,OTRAS_POLITICAS,Formato de vinculación con campos en blanco y/...,Incumple P & P,2 - Medio
2495,Otras_Políticas,Formato de vinculación con campos en blanco y/...,Incumple\n P & P,2 - Medio
1009,FRAUDE_Y_PRÁCTICAS_INDEBIDAS,Alerta de Irregularidades,Acto Irregular,1 - Alto


--> We can see that each column have different strings formats, for example, capital letters, blank spaces, patterns like "\n", etc.  
We find some values that are referred to the same but are wroten different.

--> About the "Hallazgo" column, this is a description of the findings, it is like a comment column. Therefore, it is not necessary to modify it. We could even ignore this column.


In [18]:
df[df["Riesgo"]=="No aplica"]["Categoria"].value_counts()

Categoria
LIMITACIÓN_EN_EL_ALCANCE    65
Sin_Hallazgo                53
Name: count, dtype: int64

In [19]:
df[df["Categoria"]=="Sin_Hallazgo"]["Riesgo"].value_counts()

Riesgo
4 - Otro     939
No aplica     53
Name: count, dtype: int64

--> The "Riesgo" column is the cleanest one. It doesnt need cleaning steps.

Checking the "No aplica" level of the column "Riesgo", this level have values without findings or with limitations to complete the audit.

To clean this columns:

- We will drop the few NA values for each column
- **We will avoid capital letters and drop some patterns.**
- **We will rename some values of 'Categoria' & 'Tipo hallazgo' columns in order to unify levels.**
- We will rename the columns to avoid capital letters.
- **We will drop the observations with "Tipo hallazgo" equal to "Limitacion"**.
- Due to the previous step (#5), we will rename the level "No aplica" of the "Riesgo" column to "4 - otro"

In [20]:
# 22. Drop nan values of the 4 columns (Categoria","Hallazgo","Tipo hallazgo", "Riesgo)
df = df.dropna(subset=["Categoria","Hallazgo","Tipo hallazgo", "Riesgo"])

# Function to clean the column values
def clean_string_column(value):
    if pd.isna(value):  # Handle missing values
        return "no_especificado"
    value = value.strip()  # Remove spaces at the beginning and end
    value = unidecode(value)  # Remove accents
    value = value.lower()  # Convert to lowercase
    return value

# 23. Apply function to clean the column values
df["Categoria"] = df["Categoria"].apply(clean_string_column)
df["Hallazgo"] = df["Hallazgo"].apply(clean_string_column)
df["Tipo hallazgo"] = df["Tipo hallazgo"].apply(clean_string_column)

# 24. Combine some finding types values due to typos (incumple p&p)
df["Tipo hallazgo"] = df["Tipo hallazgo"].apply(lambda x: "incumple_p&p" if "incumple" in x else x)

# 25. rename columns 
df = df.rename(
    columns={
        "Tipo hallazgo": "tipo_hallazgo",
        "Riesgo": "riesgo",
        "Categoria": "categoria",
        "Hallazgo": "hallazgo",
    }
)

# 26. drop observations with limitations (audit incomplete)
df=df[df["tipo_hallazgo"]!="limitacion"]

# 27- Rename values "No aplica" of column Riesgo
df["riesgo"] = df["riesgo"].str.replace("No aplica","4 - Otro")

---

--> The columns "Calificacion cartera" & "Relaciones laborales" have few typos and Nan values

To clean this columns: 

- Rename columns to avoid ccapital letters
- We will change NA values to "no_especificado"
- fix typos in Calificación cartera

In [21]:
# 28. rename columns " Calificacion cartera" & "Relaciones laborales"
df.rename(columns={"Calificación cartera": "calificacion_cartera", "Relaciones Laborales": "relaciones_laborales"}, inplace=True)

# 29. Turn NaN values of both columns to "no_especificado"
df["calificacion_cartera"] = df["calificacion_cartera"].fillna("no_especificado")
df["relaciones_laborales"] = df["relaciones_laborales"].fillna("no_especificado")

# 30. fix typos
df["calificacion_cartera"] = df["calificacion_cartera"].apply(lambda x: "Si" if "si" in x else x)

---

In [22]:
# 31. Rename column Oficina
df = df.rename(columns={"Oficina ": "oficina"})

# 32. avoid capital letters and blank spaces of column Oficina
df["oficina"] = df["oficina"].apply(lambda x: x.strip().lower().replace(" ","_"))

# 33. change column names for zona & regional
df = df.rename(columns={"zona ": "zona", "Regional": "regional"})



---

In [23]:
temp = df["Validación unico"].value_counts()
print(f"\n'validacion unico' top: {temp}")


'validacion unico' top: Validación unico
ok          184
repetido     28
Name: count, dtype: int64


In [24]:
# 34. drop observations with "repetido" value on column "validacion unico" and drop column
df = df[df["Validación unico"] != "repetido"]
df = df.drop(columns=["Validación unico"])

---

--> The column "Tipo analisis" have a lot of NA values and is no clear how this information could add value to our next steps. 

- We will drop this column due to the lack of valuable information

In [25]:
# 35. drop column Tipo analisis
df = df.drop(columns=["Tipo analisis "])

--> The column "Clasificac analisis" have a lot of NA values and is no clear how this information could add value to our next steps. 

- We will drop this column due to the lack of valuable information

In [26]:
# 36. drop column Tipo analisis
df = df.drop(columns=["Clasificac analisis"])

--> The column "Estado" seems to have valuable information about the credit status. Doesnt have typos but several NA values.

In order to clean this column:

- We will rename the column to avoid capital letters.
- We will fill NA values with "no_especificado".
- We will clean a little more the column to avoid capital letters nor blank spaces at the beginning or the last of the string.

In [27]:
# 37. rename column Estado
df = df.rename(columns={"Estado": "estado"})

# 38. fill NaN values in column estado with no_especificado
df["estado"] = df["estado"].fillna("no_especificado")

# 39. From column estado, eliminate capital letters and blank spaces
df["estado"] = df["estado"].apply(lambda x: x.strip().lower())

In [28]:
# 40. rename the column year
df = df.rename(columns={"Año": "year"})

# 41. drop the column tipo
df = df.drop(columns=["Tipo"])

# Create new variables

**The variables that we will create below are intended to improve the exploratory analyses and provide more information about the understanding of the case.**

## New Credit risk variables 

**objective variable**

In [29]:
# 42. Create new column "riesgo", labeling the values to numbers (integers)

df["riesgo_int"] = df["riesgo"].replace({
    "1 - Alto": 1,
    "2 - Medio": 2,
    "3 - Bajo": 3,
    "4 - Otro": 4
})

# 43. Create new binomial "riesgo"
df["riesgo_bin"] = df["riesgo"].replace({
    "1 - Alto": 1,
    "2 - Medio": 1,
    "3 - Bajo": 1,
    "4 - Otro": 0
})

print(f'Label encoding "riesgo":\n{df["riesgo_int"].value_counts()}')
print("--")
print(f'Binomial encodign "riesgo":\n{df["riesgo_bin"].value_counts()}')

Label encoding "riesgo":
riesgo_int
4    1084
2    1084
3     423
1     325
Name: count, dtype: int64
--
Binomial encodign "riesgo":
riesgo_bin
1    1832
0    1084
Name: count, dtype: int64


  df["riesgo_int"] = df["riesgo"].replace({
  df["riesgo_bin"] = df["riesgo"].replace({


## New occupation variable

In [30]:
# 44. combine some Activity values to create macro-activities

def combine_occupation(value):
    if any(char in value for char in ["no_especificado"]):
        return "no_especificado"
    if any(char in value for char in ["ambulant"]):
        return "comercio_ambulante"
    elif any(
        char in value
        for char in [
            "agrico",
            "agricultura",
            "agro",
            "agricultor",
            "culti",
            "ganad",
            "cerdo",
            "pollo",
            "vaca",
            "animal",
            "cr?",
            "cria",
            "platano",
            "banano",
        ]
    ):
        return "agricultura_ganaderia_y_afines"
    elif any(
        char in value
        for char in [
            "comida",
            "restau",
            "alimen",
            "condimento",
            "cafe",
            "caf?",
            "frit",
            "plato",
            "frut",
            "fruv",
            "pan",
            "helad",
            "arepa",
            "empana",
            "piz",
            "leche",
            "almuerz",
            "carnic",
            "salsa",
            "lacte",
        ]
    ):
        return "sector_alimenticio"
    elif any(char in value for char in ["arriendo", "arren", "inmobi", "alquil"]):
        return "arriendos_alquiler_e_inmobiliarios"
    elif any(char in value for char in ["obra", "const", "interior","acabado"]):
        return "construccion_obras_y_afines"
    elif any(
        char in value for char in ["confec", "modist", "sastr", "costur", "prenda"]
    ):
        return "confeccion_y_afines"
    elif any(char in value for char in ["miscel","papel","boutique","floriste","flores","detall","sorpre","perfum","artesani","variedades"]):
        return "comercios_varios_y_detallistas"
    elif any(
        char in value for char in ["venta", "vta", "comercio", "mercanc", "comerci"]
    ):
        return "comercio_y_ventas_general"
    elif any(char in value for char in ["tienda", "almac", "supermer", "abarro","mercado_","ferret"]):
        return "tiendas_almacenes_y_ferreterias"
    elif any(
        char in value
        for char in [
            "mecan",
            "autom",
            "metal",
            "tecnico",
            "t?cni",
            "fabricac",
            "electri",
            "soldad",
            "pint",
            "laton",
            "taller",
            "manteni",
            "reparaci",
            "arregl",
            "ebanist",
            "herr",
            "plome",
            "artesan",
            "mueble",
            "manualida",
            "fundici",
            "guada",
            "llant",
            "vulcaniz",
        ]
    ):
        return "oficios_tecnicos_y_manuales"
    elif any(
        char in value for char in ["parquead", "parki", "lavade", "carwash"]
    ):
        return "lavaderos_parqueaderos_y_afines"
    elif any(
        char in value for char in ["transp", "trasp", "taxi", "bus", "vehicu", "moto","carga"]
    ):
        return "transporte_vehiculos_y_afines"
    elif any(
        char in value
        for char in [
            "manicur",
            "peluq",
            "bell",
            "unas",
            "u?as",
            "nail",
            "spa",
            "pein",
            "estili",
            "maquill",
            "barber",
            "masaj",
            "depil",
            "esteti",
            "cosmet",
        ]
    ):
        return "belleza_y_estetica"
    elif any(
        char in value for char in ["enferm", "medic", "salud", "drog", "farm", "hospi","dental","odonto"]
    ):
        return "salud_y_afines"
    elif any(
        char in value
        for char in ["educ", "escol", "refuer", "univ", "coleg", "clase", "docen"]
    ):
        return "servicios_educativos"
    elif any(char in value for char in ["aseo", "limpieza", "aseador"]):
        return "servicios_de_limpieza"
    else:
        return "otros_servicios_y_negocios"


df["actividad_v2"] = df["actividad"].apply(combine_occupation)

# To check all the activities that have "venta" or "vta" or "comercio" or "mercancia" or "comerci" in the name
# pd.Series([x for x in df["actividad"] if any(char in x for char in ["venta", "vta", "comercio","mercancia","comerci"])]).unique()

In [31]:
# 45. re order the columns of the df
df = df[["cliente", "analista", "tipo_credito", "actividad", "actividad_v2", "monto", "cuota", "plazo", "oficina", "zona", "regional", "fecha_desembolso", "visita_analista_credito",
         "calificacion_cartera", "relaciones_laborales", "estado", "year",
       "categoria", "hallazgo", "tipo_hallazgo", "riesgo","riesgo_int","riesgo_bin"]]

In [32]:
df.sample(5)

Unnamed: 0,cliente,analista,tipo_credito,actividad,actividad_v2,monto,cuota,plazo,oficina,zona,regional,fecha_desembolso,visita_analista_credito,calificacion_cartera,relaciones_laborales,estado,year,categoria,hallazgo,tipo_hallazgo,riesgo,riesgo_int,riesgo_bin
389,Josefina Bedoya Mejia,MRC,Nuevo,tienda_josefina,tiendas_almacenes_y_ferreterias,1080000.0,121226.0,12,pereira_lagos,12,4,2022-06-29,2022-06-28,No,no_especificado,cancelada,2022,sin_hallazgo,sin hallazgo,sin hallazgo,4 - Otro,4,0
2195,Ana Maria Morales Martinez,KSC,Renovacion,corredor_de_seguros,otros_servicios_y_negocios,6562050.0,680923.0,18,zipaquira,6,3,2022-12-28,2022-12-28,No,No,modificado,2023,calidad_de_datos,informacion errada en datos basicos,fallas en la metodologia,3 - Bajo,3,1
1981,Leidy Johana Vega Cortes,HBM,Nuevo,venta_por_catalogo,comercio_y_ventas_general,5625976.0,379736.0,24,chaparral,11,4,2023-06-07,2023-06-06,No,No,normal,2023,sin_hallazgo,sin hallazgo,sin hallazgo,4 - Otro,4,0
1240,Javier Ramirez Zuniga,HZU,Nuevo,servicio_tecnico_gas_domiciliario,oficios_tecnicos_y_manuales,5634062.0,343157.0,33,laureano_gómez,2,1,2022-06-18,2022-06-16,No,no_especificado,cancelada,2022,otras_politicas,formato de vinculacion con campos en blanco y/...,incumple_p&p,2 - Medio,2,1
307,Martha Cecilia Garcia Medina,IOC,Renovacion,venta_de_pollo,agricultura_ganaderia_y_afines,5427250.0,339964.0,30,dosquebradas,12,4,2022-05-18,NaT,No,No,normal,2022,sin_hallazgo,sin hallazgo,sin hallazgo,4 - Otro,4,0
