In [1]:
import numpy as np
import pandas as pd
import mysql.connector
from dotenv import dotenv_values

# Credentials are read from the local .env file so they never appear in the notebook.
env_vars = dotenv_values('.env')
user = env_vars.get('USER')
password = env_vars.get('PASSWORD')

# Only report whether a password was found; the value itself is never printed.
print("Password retrieved successfully" if password
      else "Password not found in .env file.")

# Open the connection to the local MySQL server and get a cursor for the DDL below.
db = mysql.connector.connect(host="localhost", user=user, passwd=password)
mycursor = db.cursor()

# Schema of the payments table: one row per payment, with one boolean flag per
# payment method (nequi / datafono / transferencia / efectivo).
create_pagos_table = """
    CREATE TABLE IF NOT EXISTS pagos (
    id INT PRIMARY KEY AUTO_INCREMENT,
    paciente VARCHAR(35) NOT NULL,
    valor INT NOT NULL,
    actividad VARCHAR(35) NOT NULL,
    fecha TIMESTAMP NOT NULL,
    nequi BOOLEAN NOT NULL DEFAULT FALSE,
    datafono BOOLEAN NOT NULL DEFAULT FALSE,
    transferencia BOOLEAN NOT NULL DEFAULT FALSE,
    efectivo BOOLEAN NOT NULL DEFAULT FALSE,
    notas VARCHAR(100) NULL
)"""

# Create the database and table if they are missing, selecting the database in between.
for setup_statement in (
    "CREATE DATABASE IF NOT EXISTS gsalud DEFAULT CHARACTER SET utf8",
    "USE gsalud",
    create_pagos_table,
):
    mycursor.execute(setup_statement)

In [2]:
# Load the payments export and normalize the column names by removing spaces
excel = pd.read_csv('pagos.csv', sep=',', encoding='latin-1')
excel.columns = [column_name.replace(' ', '') for column_name in excel.columns]
excel.head()

Unnamed: 0,Fecha,Paciente,Valor,Actividad,Nequi,Datafono,Transferencia,Efectivo,%
0,09/06/2023,Manfry Ariza,"$ 60,000",Valoracion,No,Si,No,No,
1,09/06/2023,Nazly Avila,"$ 185,000",Montaje/retiro,Si,No,No,No,
2,09/06/2023,Paula Hernandez,"$ 25,000",Radiografia,No,No,No,Si,
3,09/06/2023,Kevin Santiago Soto,"$ 105,000",Control/Reparaciones,No,No,No,Si,
4,09/06/2023,Laura Cupa,"$ 60,000",Control,No,Si,No,No,


In [3]:
# Summary statistics (count / unique / top / freq) for each column of the freshly loaded frame
excel.describe()

Unnamed: 0,Fecha,Paciente,Valor,Actividad,Nequi,Datafono,Transferencia,Efectivo,%
count,10000,147,147,139,146,146,146,146,1
unique,244,120,43,46,2,2,2,2,1
top,17/07/2023,Atenais Peña,"$ 60,000",Control,No,No,No,Si,Sistecredito
freq,42,3,32,34,119,111,141,79,1


In [4]:
# Number of missing (NA) values per column
excel.isna().sum()

Fecha               0
Paciente         9853
Valor            9853
Actividad        9861
Nequi            9854
Datafono         9854
Transferencia    9854
Efectivo         9854
%                9999
dtype: int64

In [5]:
# Discard rows holding fewer than two non-NA values, i.e. rows where only the date is filled in
min_non_na = 2
excel = excel.dropna(thresh=min_non_na)
excel.isna().sum()

Fecha              0
Paciente           0
Valor              0
Actividad          8
Nequi              1
Datafono           1
Transferencia      1
Efectivo           1
%                146
dtype: int64

In [6]:
# Shape (rows, columns) of the frame at this point
print(excel.shape)

(147, 9)


In [7]:
# Drop the "%" column, as it only has 1 non-NA value
excel = excel.drop('%', axis=1)
excel.isna().sum()

Fecha            0
Paciente         0
Valor            0
Actividad        8
Nequi            1
Datafono         1
Transferencia    1
Efectivo         1
dtype: int64

In [8]:
# Show every row that still has at least one NA value
row_has_na = excel.isna().any(axis=1)
excel.loc[row_has_na]

Unnamed: 0,Fecha,Paciente,Valor,Actividad,Nequi,Datafono,Transferencia,Efectivo
163,13/06/2023,Johana Martinez,"$ 20,000",,Si,No,No,No
164,13/06/2023,Jeimy,"$ 30,000",,No,No,No,Si
165,13/06/2023,Halam Diaz,"$ 58,500",,No,No,No,Si
532,22/06/2023,Yeimy Quintero,"$ 60,000",,No,No,No,Si
534,22/06/2023,Leimar Diaz,"$ 50,000",,No,No,No,Si
536,22/06/2023,Paola Vargas,"$ 80,000",,No,No,No,Si
538,22/06/2023,Cristian Ovalle,"$ 180,000",,No,No,No,Si
9999,17/07/2023,Total desde 09/06/23,"$ 18,528,500",,,,,


In [9]:
# Delete the spreadsheet's summary row (index label 9999), then list the rows that still contain NA values
excel = excel.drop(index=9999)
excel.loc[excel.isna().any(axis=1)]

Unnamed: 0,Fecha,Paciente,Valor,Actividad,Nequi,Datafono,Transferencia,Efectivo
163,13/06/2023,Johana Martinez,"$ 20,000",,Si,No,No,No
164,13/06/2023,Jeimy,"$ 30,000",,No,No,No,Si
165,13/06/2023,Halam Diaz,"$ 58,500",,No,No,No,Si
532,22/06/2023,Yeimy Quintero,"$ 60,000",,No,No,No,Si
534,22/06/2023,Leimar Diaz,"$ 50,000",,No,No,No,Si
536,22/06/2023,Paola Vargas,"$ 80,000",,No,No,No,Si
538,22/06/2023,Cristian Ovalle,"$ 180,000",,No,No,No,Si


In [10]:
# Convert the "Valor" column into integers from the format "$ 12,345,678".
#
# regex=False is passed explicitly: "$" is a regular-expression anchor and the
# default value of `regex` in Series.str.replace differs between pandas
# versions, so relying on the default makes the result version-dependent.
#
# The conversion is skipped when the column is already numeric, so this cell can
# be re-run without raising on the `.str` accessor.
if not pd.api.types.is_numeric_dtype(excel['Valor']):
    excel['Valor'] = (
        excel['Valor']
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
        .str.replace(' ', '', regex=False)
        .astype(int)
    )

excel.dtypes

Fecha            object
Paciente         object
Valor             int64
Actividad        object
Nequi            object
Datafono         object
Transferencia    object
Efectivo         object
dtype: object

In [11]:
# Compare the sum of "Valor" over all rows with the sum over the rows that
# contain an NA value, and report both formatted as currency.
incomplete_rows = excel.isna().any(axis=1)
total = excel['Valor'].sum()
total_na = excel.loc[incomplete_rows, 'Valor'].sum()
print(
    f'Total: ${total:,.2f}',
    f'Total NA: ${total_na:,.2f}',
    f'Lost money percentage: {total_na/total*100:.2f}%',
    sep='\n',
)

Total: $18,528,500.00
Total NA: $478,500.00
Lost money percentage: 2.58%


## Extract the distinct activities and use them to impute the NaN values in the 'Actividad' column

In [12]:
# Collect every distinct activity label; the NaN entry (rows without an
# activity) is removed from the resulting one-column frame.
activities = excel['Actividad'].unique()
activities_df = pd.DataFrame({'Actividades': activities}).dropna()
print(activities_df['Actividades'].values)
print(activities_df.shape)

['Valoracion' 'Montaje/retiro' 'Radiografia' 'Control/Reparaciones'
 'Control' 'Bichectomia' 'Limpieza/braquets' 'Botox/Hilos' 'Faja'
 'Lipopapada y Bichectomia' 'Control Caucho' 'Limpieza' 'Reparaciones'
 'Reparacion Y Cauchos' 'Hilos' 'Lipopapada Bichectomia'
 'Tratamiento Odontologico' 'Calsa' 'Retenedores' 'Bichectomia Lipopapada'
 'Protesis' 'Limpieza Exodoncia' 'Control, Reparacion' 'Consulta General'
 'Control, Reparaciones' 'Control, Reparaciones, caucho' 'Lipopapada'
 'Control, Reparacion, Caucho' 'Control, Cauchos' 'Limpieza, Montaje'
 'Resina' '90: Deuda' 'Kit' 'Lipopada, Faja'
 'Control, Reparaciones, Cauchos' 'LipoPapada' 'Extraccion' 'Extracciones'
 'Control, Exodoncia' 'Resinas' 'Hilos, Retenedor' 'Controles, Reparacion'
 ' Control, Reparacion ' 'Papada, Encimas' 'Placa Activada'
 'Pago Porcentaje']
(46, 1)


### Normalize "Actividad" column

### Split the "Actividad" column into "Actividad_1", "Actividad_2", ..., "Actividad_n" columns


In [13]:
# Labels that are exempt from the split rules and are therefore set aside
exceptions = [
    'Tratamiento Odontologico',
    'Consulta General',
    'Placa Activada',
    'Pago Porcentaje',
    '90: Deuda',
]

# Keep only the labels that are not listed as exceptions
is_exception = activities_df['Actividades'].isin(exceptions)
activities_df = activities_df.loc[~is_exception]

activities_df.shape

(41, 1)

In [14]:
# Work on a copy so the assignments below do not raise SettingWithCopyWarning
# and activities_df keeps the original labels.
activities_df_copy = activities_df.copy()

# Delimiters between activities: a comma, a slash, whitespace, or the
# standalone conjunction "y"/"Y". The \b word boundaries are essential: a plain
# character class containing y/Y would also match the letter inside a word and
# split labels such as "Rayos X" in the middle. Consecutive delimiters collapse
# into a single comma.
delimiter_pattern = r'(?:[,/\s]|\b[yY]\b)+'

# Trim surrounding whitespace, standardize the delimiters to a single comma,
# and make everything lowercase.
activities_df_copy['Actividades'] = (
    activities_df_copy['Actividades']
    .str.strip()
    .str.replace(delimiter_pattern, ',', regex=True)
    .str.lower()
)

# Sort the activities alphabetically
activities_df_copy = activities_df_copy.sort_values('Actividades')

print(activities_df_copy['Actividades'].values)

['bichectomia' 'bichectomia,lipopapada' 'botox,hilos' 'calsa' 'control'
 'control,caucho' 'control,cauchos' 'control,exodoncia'
 'control,reparacion' 'control,reparacion' 'control,reparacion,caucho'
 'control,reparaciones' 'control,reparaciones'
 'control,reparaciones,caucho' 'control,reparaciones,cauchos'
 'controles,reparacion' 'extraccion' 'extracciones' 'faja' 'hilos'
 'hilos,retenedor' 'kit' 'limpieza' 'limpieza,braquets'
 'limpieza,exodoncia' 'limpieza,montaje' 'lipopada,faja' 'lipopapada'
 'lipopapada' 'lipopapada,bichectomia' 'lipopapada,bichectomia'
 'montaje,retiro' 'papada,encimas' 'protesis' 'radiografia'
 'reparacion,cauchos' 'reparaciones' 'resina' 'resinas' 'retenedores'
 'valoracion']


In [15]:
# One row per individual activity: split every label on commas, stack the
# pieces, and drop the inner index level so each piece keeps its source row's index.
split_labels = activities_df_copy['Actividades'].str.split(',', expand=True)
stacked_labels = split_labels.stack().reset_index(level=1, drop=True)
activities_df_copy = stacked_labels.rename('Actividades').to_frame()

# Spelling and singular/plural variants, each mapped onto one canonical label
replacements = dict(
    controles='control',
    reparaciones='reparacion',
    extracciones='extraccion',
    caucho='cauchos',
    lipopada='lipopapada',
    resinas='resina',
    retenedor='retenedores',
)

# Replace every variant in the 'Actividades' column with its canonical label
canonical_labels = activities_df_copy['Actividades'].replace(replacements)
activities_df_copy['Actividades'] = canonical_labels

In [16]:
# Reduce to the distinct activity names, ordered alphabetically
activities_df_copy = (
    activities_df_copy
    .drop_duplicates()
    .sort_values('Actividades')
)

print(activities_df_copy['Actividades'].values)

['bichectomia' 'botox' 'braquets' 'calsa' 'cauchos' 'control' 'encimas'
 'exodoncia' 'extraccion' 'faja' 'hilos' 'kit' 'limpieza' 'lipopapada'
 'montaje' 'papada' 'protesis' 'radiografia' 'reparacion' 'resina'
 'retenedores' 'retiro' 'valoracion']


In [7]:

# Final clean-up of `excel`: drop incomplete rows, encode the payment-method
# flags as 0/1, make "Valor" an integer and parse "Fecha" as a datetime.
#
# Every step is written so that it also works when the notebook is run top to
# bottom: earlier cells already drop the "%" column and already convert "Valor"
# to int, which made `del excel['%']` raise KeyError and `.str` on "Valor"
# raise AttributeError.

# errors='ignore' turns the drop into a no-op when "%" is already gone;
# dropna() then removes every row that still has a missing value.
excel = excel.drop(columns='%', errors='ignore').dropna()

# Convert Si into 1 and No into 0. Columns that are already numeric are
# skipped, because mapping 0/1 through this dict would turn them into NaN.
si_no_to_int = {'Si': 1, 'No': 0}
for payment_column in ['Nequi', 'Datafono', 'Transferencia', 'Efectivo']:
    if not pd.api.types.is_numeric_dtype(excel[payment_column]):
        excel[payment_column] = excel[payment_column].map(si_no_to_int)

# Convert "Valor" from "$ 12,345,678" to int. regex=False is explicit because
# "$" is a regex anchor and the default of `regex` depends on the pandas version.
if not pd.api.types.is_numeric_dtype(excel['Valor']):
    excel['Valor'] = (
        excel['Valor']
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
        .str.replace(' ', '', regex=False)
        .astype(int)
    )

# Convert dates in day/month/year format to datetime
excel['Fecha'] = pd.to_datetime(excel['Fecha'], format='%d/%m/%Y')

# Renumber the rows 0..n-1 after the drops above
excel = excel.reset_index(drop=True)

excel

Unnamed: 0,Fecha,Paciente,Valor,Actividad,Nequi,Datafono,Transferencia,Efectivo
0,2023-06-09,Manfry Ariza,60000,Valoracion,0,1,0,0
1,2023-06-09,Nazly Avila,185000,Montaje/retiro,1,0,0,0
2,2023-06-09,Paula Hernandez,25000,Radiografia,0,0,0,1
3,2023-06-09,Kevin Santiago Soto,105000,Control/Reparaciones,0,0,0,1
4,2023-06-09,Laura Cupa,60000,Control,0,1,0,0
...,...,...,...,...,...,...,...,...
134,2023-07-01,Carmen Mojica,50000,Control,0,0,0,1
135,2023-07-01,Carmen Mojica,10000,Control,0,0,1,0
136,2023-07-01,Lizeth Vargas,230000,"Papada, Encimas",0,0,0,1
137,2023-07-01,Maria Paula Correa,100000,Placa Activada,0,1,0,0


In [9]:
# Check the changes by showing the rows of `pagos` whose id is greater than 138.
# NOTE(review): print_sql is not defined in any cell visible here — presumably a
# helper from a cell that was removed; confirm it exists before a
# Restart & Run All.
print_sql("SELECT * FROM pagos WHERE id > 138")

(139, ' Doctora Diana ', -595000, 'pago porcentaje', datetime.datetime(2023, 7, 1, 0, 0), 0, 0, 0, 1, None)
(158, ' Katherine Veloza ', 300000, 'bichectomia', datetime.datetime(2023, 6, 10, 0, 0), 0, 0, 0, 1, None)
(159, ' Katherine Veloza ', 370000, 'lipopapada', datetime.datetime(2023, 6, 10, 0, 0), 0, 0, 0, 1, None)
(160, ' Aura Yamile ', 300000, 'bichectomia', datetime.datetime(2023, 6, 10, 0, 0), 0, 1, 0, 0, None)
(161, ' Aura Yamile ', 480000, 'lipopapada', datetime.datetime(2023, 6, 10, 0, 0), 0, 1, 0, 0, None)
(162, ' Yolima Montero ', 300000, 'bichectomia', datetime.datetime(2023, 6, 14, 0, 0), 0, 1, 0, 0, None)
(163, ' Yolima Montero ', 550000, 'lipopapada', datetime.datetime(2023, 6, 14, 0, 0), 0, 1, 0, 0, None)


In [10]:
# Close the MySQL connection held in `db`; database calls after this cell need a new connection
db.close()

In [14]:
# Write the cleaned frame to pagos_clean.csv, leaving out the index column
excel.to_csv('pagos_clean.csv', index=False)