In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the raw payments export and strip the spaces out of the column names
excel = pd.read_csv('pagos.csv', sep=',', encoding='latin-1')
excel = excel.rename(columns=lambda name: name.replace(' ', ''))
excel.head()

Unnamed: 0,Fecha,Paciente,Valor,Actividad,Nequi,Datafono,Transferencia,Efectivo,%
0,09/06/2023,Manfry Ariza,"$ 60,000",Valoracion,No,Si,No,No,
1,09/06/2023,Nazly Avila,"$ 185,000",Montaje/retiro,Si,No,No,No,
2,09/06/2023,Paula Hernandez,"$ 25,000",Radiografia,No,No,No,Si,
3,09/06/2023,Kevin Santiago Soto,"$ 105,000",Control/Reparaciones,No,No,No,Si,
4,09/06/2023,Laura Cupa,"$ 60,000",Control,No,Si,No,No,


In [3]:
# Summary statistics (count / unique / top / freq) for each column of the raw frame
excel.describe()

Unnamed: 0,Fecha,Paciente,Valor,Actividad,Nequi,Datafono,Transferencia,Efectivo,%
count,10000,147,147,139,146,146,146,146,1
unique,244,120,43,46,2,2,2,2,1
top,17/07/2023,Atenais Peña,"$ 60,000",Control,No,No,No,Si,Sistecredito
freq,42,3,32,34,119,111,141,79,1


In [4]:
# Number of missing values in each column
excel.isna().sum()

Fecha               0
Paciente         9853
Valor            9853
Actividad        9861
Nequi            9854
Datafono         9854
Transferencia    9854
Efectivo         9854
%                9999
dtype: int64

In [5]:
# Keep only the rows that carry at least two non-NA cells; rows holding
# nothing but the date are discarded
non_na_per_row = excel.notna().sum(axis=1)
excel = excel[non_na_per_row >= 2].copy()
excel.isna().sum()

Fecha              0
Paciente           0
Valor              0
Actividad          8
Nequi              1
Datafono           1
Transferencia      1
Efectivo           1
%                146
dtype: int64

In [6]:
# (rows, columns) of the frame at this point
print(excel.shape)

(147, 9)


In [7]:
# Delete the "%" column, as it only has 1 non-NA value.
# errors='ignore' keeps the cell re-runnable: on a second run the column is
# already gone and a plain drop would raise KeyError
excel = excel.drop(columns='%', errors='ignore')
excel.isna().sum()

Fecha            0
Paciente         0
Valor            0
Actividad        8
Nequi            1
Datafono         1
Transferencia    1
Efectivo         1
dtype: int64

In [8]:
# Show every row that still has at least one missing value
excel.loc[excel.isnull().any(axis='columns')]

Unnamed: 0,Fecha,Paciente,Valor,Actividad,Nequi,Datafono,Transferencia,Efectivo
163,13/06/2023,Johana Martinez,"$ 20,000",,Si,No,No,No
164,13/06/2023,Jeimy,"$ 30,000",,No,No,No,Si
165,13/06/2023,Halam Diaz,"$ 58,500",,No,No,No,Si
532,22/06/2023,Yeimy Quintero,"$ 60,000",,No,No,No,Si
534,22/06/2023,Leimar Diaz,"$ 50,000",,No,No,No,Si
536,22/06/2023,Paola Vargas,"$ 80,000",,No,No,No,Si
538,22/06/2023,Cristian Ovalle,"$ 180,000",,No,No,No,Si
9999,17/07/2023,Total desde 09/06/23,"$ 18,528,500",,,,,


In [9]:
# Delete the spreadsheet's summary row.
# The row is recognised by its label in the "Paciente" column ("Total desde ...")
# rather than by a hard-coded index, so the cell still works if the export
# changes size and can be re-run without raising KeyError.
is_summary_row = excel['Paciente'].str.startswith('Total desde', na=False)
# .copy() gives an independent frame so later column assignments do not warn
excel = excel.loc[~is_summary_row].copy()
excel[excel.isna().any(axis=1)]

Unnamed: 0,Fecha,Paciente,Valor,Actividad,Nequi,Datafono,Transferencia,Efectivo
163,13/06/2023,Johana Martinez,"$ 20,000",,Si,No,No,No
164,13/06/2023,Jeimy,"$ 30,000",,No,No,No,Si
165,13/06/2023,Halam Diaz,"$ 58,500",,No,No,No,Si
532,22/06/2023,Yeimy Quintero,"$ 60,000",,No,No,No,Si
534,22/06/2023,Leimar Diaz,"$ 50,000",,No,No,No,Si
536,22/06/2023,Paola Vargas,"$ 80,000",,No,No,No,Si
538,22/06/2023,Cristian Ovalle,"$ 180,000",,No,No,No,Si


In [10]:
# Convert the "Valor" column into integers from the format "$ 12,345,678".
# - regex=True is explicit and "$" sits inside a character class, so it is a
#   literal dollar sign on every pandas version (the default of `regex`
#   changed between pandas 1.x and 2.x, and a bare "$" is a regex anchor).
# - astype(str) first makes the cell re-runnable: once the column is already
#   int, the `.str` accessor would otherwise raise AttributeError.
excel['Valor'] = (
    excel['Valor']
    .astype(str)
    .str.replace(r'[$,\s]', '', regex=True)
    .astype(int)
)

excel.dtypes

Fecha            object
Paciente         object
Valor             int64
Actividad        object
Nequi            object
Datafono         object
Transferencia    object
Efectivo         object
dtype: object

In [11]:
# Re-check the number of missing values in each column
excel.isna().sum()

Fecha            0
Paciente         0
Valor            0
Actividad        7
Nequi            0
Datafono         0
Transferencia    0
Efectivo         0
dtype: int64

In [12]:
# Compare the overall "Valor" total with the amount sitting on rows whose
# "Actividad" is missing; both are printed as currency, followed by the
# share of the total that the unlabelled rows represent
df_actividades_na = excel.loc[excel['Actividad'].isnull()]
total, total_na = excel['Valor'].sum(), df_actividades_na['Valor'].sum()
print(f'Total: ${total:,.2f}')
print(f'Total NA: ${total_na:,.2f}')
print(f'Lost money percentage: {total_na/total*100:.2f}%')

Total: $18,528,500.00
Total NA: $478,500.00
Lost money percentage: 2.58%


## Extract the distinct activities and use them to impute the NaN values in the 'Actividad' column

In [13]:
# Work on an independent copy that only holds rows with a known activity,
# so activity names can be cleaned and standardized
cleaning_df = excel.dropna(subset=['Actividad']).copy()

# Trim leading/trailing whitespace from the activity names
cleaning_df['Actividad'] = cleaning_df['Actividad'].str.strip()

print(cleaning_df['Actividad'].unique())
print(f"\nTotal of activities: {len(cleaning_df['Actividad'].unique())}")

['Valoracion' 'Montaje/retiro' 'Radiografia' 'Control/Reparaciones'
 'Control' 'Bichectomia' 'Limpieza/braquets' 'Botox/Hilos' 'Faja'
 'Lipopapada y Bichectomia' 'Control Caucho' 'Limpieza' 'Reparaciones'
 'Reparacion Y Cauchos' 'Hilos' 'Lipopapada Bichectomia'
 'Tratamiento Odontologico' 'Calsa' 'Retenedores' 'Bichectomia Lipopapada'
 'Protesis' 'Limpieza Exodoncia' 'Control, Reparacion' 'Consulta General'
 'Control, Reparaciones' 'Control, Reparaciones, caucho' 'Lipopapada'
 'Control, Reparacion, Caucho' 'Control, Cauchos' 'Limpieza, Montaje'
 'Resina' '90: Deuda' 'Kit' 'Lipopada, Faja'
 'Control, Reparaciones, Cauchos' 'LipoPapada' 'Extraccion' 'Extracciones'
 'Control, Exodoncia' 'Resinas' 'Hilos, Retenedor' 'Controles, Reparacion'
 'Papada, Encimas' 'Placa Activada' 'Pago Porcentaje']

Total of activities: 45


### Normalize "Actividad" column

In [14]:
# NOTE: this cell overwrites `cleaning_df`. Re-running it without first
# re-running the cell that builds `cleaning_df` leaves `df_clean_rows` empty,
# because the exception rows have already been removed.

# Remove 'Pago Porcentaje' rows (literal substring match, not a regex)
cleaning_df = cleaning_df[~cleaning_df['Actividad'].str.contains(
    'Pago Porcentaje', regex=False)]

# Activity names that must be kept verbatim instead of being split on
# delimiters (they contain spaces or punctuation that belong to the name)
exceptions = [
    'Tratamiento Odontologico', 'Consulta General', 'Placa Activada',
    '90: Deuda'
]

# Separate the exception rows from the rows that will be normalized.
# .copy() makes both results independent frames, so assigning columns to them
# later does not raise SettingWithCopyWarning.
is_exception = cleaning_df['Actividad'].isin(exceptions)
df_clean_rows = cleaning_df[is_exception].copy()
cleaning_df = cleaning_df[~is_exception].copy()

print(f"Activities list size: {cleaning_df['Actividad'].unique().size}")
print(f"Non-cleaning data size: {df_clean_rows.shape}")
print(f"Cleaning data size: {cleaning_df.shape}")

Activities list size: 40
Non-cleaning data size: (4, 8)
Cleaning data size: (134, 8)


In [15]:
# Standardize the format of the activities by collapsing every delimiter into
# a single comma. Delimiters are: a comma, a slash, the standalone word
# "y"/"Y", or plain whitespace; consecutive delimiters collapse into one.
# The word boundaries around [yY] matter: a bare character class such as
# [,/yY ] would also split on a letter "y" that is part of an activity name.
cleaning_df['Actividad'] = cleaning_df['Actividad'].str.replace(
    r'(?:\s*(?:[,/]|\b[yY]\b)\s*|\s+)+', ',', regex=True)

# Make everything lowercase
cleaning_df['Actividad'] = cleaning_df['Actividad'].str.lower()

print(cleaning_df['Actividad'].unique())
print(f"\nTotal of activities: {len(cleaning_df['Actividad'].unique())}")

['valoracion' 'montaje,retiro' 'radiografia' 'control,reparaciones'
 'control' 'bichectomia' 'limpieza,braquets' 'botox,hilos' 'faja'
 'lipopapada,bichectomia' 'control,caucho' 'limpieza' 'reparaciones'
 'reparacion,cauchos' 'hilos' 'calsa' 'retenedores'
 'bichectomia,lipopapada' 'protesis' 'limpieza,exodoncia'
 'control,reparacion' 'control,reparaciones,caucho' 'lipopapada'
 'control,reparacion,caucho' 'control,cauchos' 'limpieza,montaje' 'resina'
 'kit' 'lipopada,faja' 'control,reparaciones,cauchos' 'extraccion'
 'extracciones' 'control,exodoncia' 'resinas' 'hilos,retenedor'
 'controles,reparacion' 'papada,encimas']

Total of activities: 37


In [16]:
# Collapse singular/plural and misspelled variants of an activity name into
# one canonical spelling. Keys are whole-word regex patterns, values are the
# spelling to keep.
replacements = {
    r'\bcontroles\b': 'control',
    r'\breparaciones\b': 'reparacion',
    r'\bextracciones\b': 'extraccion',
    r'\bcaucho\b': 'cauchos',
    r'\blipopada\b': 'lipopapada',
    r'\bresinas\b': 'resina',
    r'\bretenedor\b': 'retenedores'
}

# Rewrite the column using the patterns above
normalized = cleaning_df['Actividad'].replace(to_replace=replacements,
                                              regex=True)
cleaning_df['Actividad'] = normalized

# Distinct individual activities, in order of first appearance
activities = cleaning_df['Actividad'].str.split(',').explode().unique()

print(activities)
print(f"\nTotal of activities: {len(activities)}")

['valoracion' 'montaje' 'retiro' 'radiografia' 'control' 'reparacion'
 'bichectomia' 'limpieza' 'braquets' 'botox' 'hilos' 'faja' 'lipopapada'
 'cauchos' 'calsa' 'retenedores' 'protesis' 'exodoncia' 'resina' 'kit'
 'extraccion' 'papada' 'encimas']

Total of activities: 23


In [17]:
# Split the comma-separated activities into one column per activity
# (Actividad_1, Actividad_2, ...). The number of columns follows the data.
activities_df = cleaning_df['Actividad'].str.split(
    ',', expand=True).rename(columns=lambda x: f'Actividad_{x + 1}')
activity_columns = activities_df.columns.tolist()

# The exception rows were never split: their whole label is their only
# activity. `assign` builds a new frame instead of writing into a slice, so
# no SettingWithCopyWarning is raised and the cell can be re-run.
df_clean_rows = df_clean_rows.assign(Actividad_1=df_clean_rows['Actividad'])

# Join the split columns onto the normalized rows, then append the exception
# rows (their remaining Actividad_n columns are filled with NaN by concat)
rebuilt_df = cleaning_df.join(activities_df, sort=True)
rebuilt_df = pd.concat([rebuilt_df, df_clean_rows])
# NOTE(review): the appended exception rows keep their original
# capitalization in "Actividades", unlike the lowercased normalized rows.

# Rename column Actividad to Actividades
rebuilt_df = rebuilt_df.rename(columns={'Actividad': 'Actividades'})

# Reorder columns: everything up to and including "Actividades", then the
# Actividad_n columns, then the remaining columns in their existing order.
# The positions are derived from the column names so the reorder stays
# correct when the number of Actividad_n columns changes.
all_columns = rebuilt_df.columns.tolist()
split_at = all_columns.index('Actividades') + 1
trailing_columns = [
    name for name in all_columns[split_at:] if name not in activity_columns
]
columns = all_columns[:split_at] + activity_columns + trailing_columns

rebuilt_df = rebuilt_df[columns]

print(f"Data size: {rebuilt_df.shape}")
rebuilt_df.head()

Data size: (138, 11)


Unnamed: 0,Fecha,Paciente,Valor,Actividades,Actividad_1,Actividad_2,Actividad_3,Nequi,Datafono,Transferencia,Efectivo
0,09/06/2023,Manfry Ariza,60000,valoracion,valoracion,,,No,Si,No,No
1,09/06/2023,Nazly Avila,185000,"montaje,retiro",montaje,retiro,,Si,No,No,No
2,09/06/2023,Paula Hernandez,25000,radiografia,radiografia,,,No,No,No,Si
3,09/06/2023,Kevin Santiago Soto,105000,"control,reparacion",control,reparacion,,No,No,No,Si
4,09/06/2023,Laura Cupa,60000,control,control,,,No,Si,No,No


In [18]:
# Number of missing values in each column of the rebuilt frame
rebuilt_df.isna().sum()

Fecha              0
Paciente           0
Valor              0
Actividades        0
Actividad_1        0
Actividad_2       88
Actividad_3      134
Nequi              0
Datafono           0
Transferencia      0
Efectivo           0
dtype: int64

In [19]:
# Convert the "Si"/"No" payment-method columns to boolean.
# The values are mapped explicitly and validated first: casting an object
# column straight to bool would silently turn any unexpected or missing value
# into True, because every non-empty object is truthy.
si_no_to_bool = {'Si': True, 'No': False}
payment_columns = ['Nequi', 'Datafono', 'Transferencia', 'Efectivo']

for column in payment_columns:
    if rebuilt_df[column].dtype == bool:
        # Already converted on a previous run; keeps the cell re-runnable
        continue
    flags = rebuilt_df[column].str.strip().map(si_no_to_bool)
    unexpected = rebuilt_df.loc[flags.isna(), column].unique()
    if len(unexpected) > 0:
        raise ValueError(
            f'Column "{column}" has values other than "Si"/"No": '
            f'{unexpected.tolist()}')
    rebuilt_df[column] = flags.astype(bool)

# Convert "Fecha" in format day/month/year to datetime
rebuilt_df['Fecha'] = pd.to_datetime(rebuilt_df['Fecha'], format='%d/%m/%Y')

# Standardize the "Paciente" column: trim whitespace and use Title Case
rebuilt_df['Paciente'] = rebuilt_df['Paciente'].str.strip().str.title()

rebuilt_df.dtypes

Fecha            datetime64[ns]
Paciente                 object
Valor                     int64
Actividades              object
Actividad_1              object
Actividad_2              object
Actividad_3              object
Nequi                      bool
Datafono                   bool
Transferencia              bool
Efectivo                   bool
dtype: object

In [20]:
# Preview the first rows of the rebuilt frame
rebuilt_df.head()

Unnamed: 0,Fecha,Paciente,Valor,Actividades,Actividad_1,Actividad_2,Actividad_3,Nequi,Datafono,Transferencia,Efectivo
0,2023-06-09,Manfry Ariza,60000,valoracion,valoracion,,,False,True,False,False
1,2023-06-09,Nazly Avila,185000,"montaje,retiro",montaje,retiro,,True,False,False,False
2,2023-06-09,Paula Hernandez,25000,radiografia,radiografia,,,False,False,False,True
3,2023-06-09,Kevin Santiago Soto,105000,"control,reparacion",control,reparacion,,False,False,False,True
4,2023-06-09,Laura Cupa,60000,control,control,,,False,True,False,False


In [21]:
# Write the cleaned frame to 'pagos_cleaned.csv'; index=False leaves the
# row index out of the file
rebuilt_df.to_csv('pagos_cleaned.csv', index=False)