# Conexión MySQL con VSC

In [1]:
import pymysql
import pandas as pd

try:
    # Conexión a la base de datos del simulador
    connection = pymysql.connect(
        host='212.227.90.6',
        user='Equipo12',
        password='E1q2u3i4p5o12',
        database='Equip_12',
        port=3306
    )
    print("✅ Conexión establecida con MySQL Server")

    # Crear un cursor
    cursor = connection.cursor()

    # Consultar las tablas disponibles
    cursor.execute("SHOW TABLES;")
    tables = cursor.fetchall()

    # Diccionario para almacenar los dataframes
    dataframes = {}

    for tabla in tables:
        table_name = tabla[0]
        print(f"📥 Cargando tabla: {table_name}")
        query = f"SELECT * FROM {table_name};"
        df = pd.read_sql(query, connection)
        dataframes[table_name] = df
        print(f"✅ DataFrame para '{table_name}' creado con {df.shape[0]} filas")

except pymysql.MySQLError as e:
    print("❌ Error al conectar con MySQL:", e)


✅ Conexión establecida con MySQL Server
📥 Cargando tabla: RRHH
✅ DataFrame para 'RRHH' creado con 740 filas


  df = pd.read_sql(query, connection)


In [2]:
df_rrhh = dataframes['RRHH']

In [3]:
df_rrhh.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 740 entries, 0 to 739
Data columns (total 21 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   ID                       740 non-null    int64 
 1   Reason_absence           740 non-null    int64 
 2   Month_absence            740 non-null    int64 
 3   Day_week                 740 non-null    int64 
 4   Seasons                  740 non-null    int64 
 5   Transportation_expense   740 non-null    int64 
 6   Distance_Residence_Work  740 non-null    int64 
 7   Service_time             740 non-null    int64 
 8   Age                      740 non-null    int64 
 9   Work_load_Average_day    740 non-null    object
 10  Hit_target               740 non-null    int64 
 11  Disciplinary_failure     740 non-null    object
 12  Education                740 non-null    object
 13  Son                      740 non-null    object
 14  Social_drinker           740 non-null    o

# Data Cleaning
## Tratamiento de valores faltantes

No hay valores nulos, por lo tanto no se hace ninguna modificación

In [4]:
df_rrhh.isna().sum()

ID                         0
Reason_absence             0
Month_absence              0
Day_week                   0
Seasons                    0
Transportation_expense     0
Distance_Residence_Work    0
Service_time               0
Age                        0
Work_load_Average_day      0
Hit_target                 0
Disciplinary_failure       0
Education                  0
Son                        0
Social_drinker             0
Social_smoker              0
Pet                        0
Weight                     0
Height                     0
Body_mass_index            0
Absenteeism_hours          0
dtype: int64

# Corrección de tipos de datos

Antes de eliminar duplicados, se recomienda la corrección de tipos de datos:

- Reemplazar comas por puntos (Work_load_Average_day) debería ser float (número)
- Disciplinary_failure, Social_drinker, Social_smoker, Education, Son y Pet aunque sus valores son numéricos y deberían ser enteros (int) o categorías (category), están codificados como texto.


In [21]:
df_rrhh.dtypes

ID                          int64
Reason_absence              int64
Month_absence               int64
Day_week                    int64
Seasons                     int64
Transportation_expense      int64
Distance_Residence_Work     int64
Service_time                int64
Age                         int64
Work_load_Average_day      object
Hit_target                  int64
Disciplinary_failure       object
Education                  object
Son                        object
Social_drinker             object
Social_smoker              object
Pet                        object
Weight                      int64
Height                      int64
Body_mass_index             int64
Absenteeism_hours           int64
dtype: object

In [22]:
df_rrhh["Work_load_Average_day"] = df_rrhh["Work_load_Average_day"].str.replace(",", ".").astype(float)

In [23]:
columnas_a_convertir = ["Disciplinary_failure", "Education", "Son", "Social_drinker", "Social_smoker", "Pet"]
df_rrhh[columnas_a_convertir] = df_rrhh[columnas_a_convertir].astype(int)

In [38]:
df_rrhh.dtypes

ID                           int64
Reason_absence               int64
Month_absence                int64
Day_week                     int64
Seasons                      int64
Transportation_expense       int64
Distance_Residence_Work      int64
Service_time                 int64
Age                          int64
Work_load_Average_day      float64
Hit_target                   int64
Disciplinary_failure         int64
Education                    int64
Son                          int64
Social_drinker               int64
Social_smoker                int64
Pet                          int64
Weight                       int64
Height                       int64
Body_mass_index              int64
Absenteeism_hours            int64
dtype: object

# Eliminación o corrección de duplicados

In [24]:
# Duplicados completos
df_rrhh.duplicated().sum()

np.int64(34)

In [34]:
df_rrhh[df_rrhh.duplicated()].sort_values(by='ID')

# pd.set_option('display.max_columns', None) 
# # para ver todas las columnas y que no se corten por el limite de Panda

Unnamed: 0,ID,Reason_absence,Month_absence,Day_week,Seasons,Transportation_expense,Distance_Residence_Work,Service_time,Age,Work_load_Average_day,Hit_target,Disciplinary_failure,Education,Son,Social_drinker,Social_smoker,Pet,Weight,Height,Body_mass_index,Absenteeism_hours
386,3,27,2,4,2,179,51,18,38,251.818,96,0,1,0,1,0,0,89,170,31,3
383,3,27,2,4,2,179,51,18,38,251.818,96,0,1,0,1,0,0,89,170,31,3
388,3,27,2,4,2,179,51,18,38,251.818,96,0,1,0,1,0,0,89,170,31,3
387,3,27,2,6,2,179,51,18,38,251.818,96,0,1,0,1,0,0,89,170,31,3
389,3,27,2,6,2,179,51,18,38,251.818,96,0,1,0,1,0,0,89,170,31,3
454,3,23,7,6,1,179,51,18,38,239.554,97,0,1,0,1,0,0,89,170,31,2
434,3,27,3,5,2,179,51,18,38,222.196,99,0,1,0,1,0,0,89,170,31,3
431,3,27,3,5,2,179,51,18,38,222.196,99,0,1,0,1,0,0,89,170,31,3
563,3,27,2,6,2,179,51,18,38,264.249,97,0,1,0,1,0,0,89,170,31,2
582,3,27,3,4,2,179,51,18,38,222.196,99,0,1,0,1,0,0,89,170,31,2


In [37]:
# Lista de filas duplicadas exactas a eliminar
filas_a_eliminar = [383, 388, 389, 431, 577, 562, 589, 592]

# Eliminar esas filas del DataFrame
df_rrhh = df_rrhh.drop(index=filas_a_eliminar).reset_index(drop=True)

In [39]:
# comprobación de que ya no están los duplicados:
df_rrhh[df_rrhh.duplicated()].sort_values(by='ID')

Unnamed: 0,ID,Reason_absence,Month_absence,Day_week,Seasons,Transportation_expense,Distance_Residence_Work,Service_time,Age,Work_load_Average_day,Hit_target,Disciplinary_failure,Education,Son,Social_drinker,Social_smoker,Pet,Weight,Height,Body_mass_index,Absenteeism_hours
386,3,27,2,6,2,179,51,18,38,251.818,96,0,1,0,1,0,0,89,170,31,3
385,3,27,2,4,2,179,51,18,38,251.818,96,0,1,0,1,0,0,89,170,31,3
430,3,27,3,5,2,179,51,18,38,222.196,99,0,1,0,1,0,0,89,170,31,3
450,3,23,7,6,1,179,51,18,38,239.554,97,0,1,0,1,0,0,89,170,31,2
558,3,27,2,6,2,179,51,18,38,264.249,97,0,1,0,1,0,0,89,170,31,2
565,3,27,2,5,2,179,51,18,38,264.249,97,0,1,0,1,0,0,89,170,31,2
564,3,27,2,4,2,179,51,18,38,264.249,97,0,1,0,1,0,0,89,170,31,2
559,3,27,2,2,2,179,51,18,38,264.249,97,0,1,0,1,0,0,89,170,31,2
576,3,27,3,4,2,179,51,18,38,222.196,99,0,1,0,1,0,0,89,170,31,2
655,3,28,12,6,4,179,51,18,38,236.629,93,0,1,0,1,0,0,89,170,31,1


# Validación de valores atípicos (outliers)

In [7]:
df_rrhh.describe()

Unnamed: 0,ID,Reason_absence,Month_absence,Day_week,Seasons,Transportation_expense,Distance_Residence_Work,Service_time,Age,Hit_target,Weight,Height,Body_mass_index,Absenteeism_hours
count,740.0,740.0,740.0,740.0,740.0,740.0,740.0,740.0,740.0,740.0,740.0,740.0,740.0,740.0
mean,18.017568,19.216216,6.324324,3.914865,2.544595,221.32973,29.631081,12.554054,36.45,94.587838,79.035135,172.114865,26.677027,6.924324
std,11.021247,8.433406,3.436287,1.421675,1.111831,66.952223,14.836788,4.384873,6.478772,3.779313,12.883211,6.034995,4.285452,13.330998
min,1.0,0.0,0.0,2.0,1.0,118.0,5.0,1.0,27.0,81.0,56.0,163.0,19.0,0.0
25%,9.0,13.0,3.0,3.0,2.0,179.0,16.0,9.0,31.0,93.0,69.0,169.0,24.0,2.0
50%,18.0,23.0,6.0,4.0,3.0,225.0,26.0,13.0,37.0,95.0,83.0,170.0,25.0,3.0
75%,28.0,26.0,9.0,5.0,4.0,260.0,50.0,16.0,40.0,97.0,89.0,172.0,31.0,8.0
max,36.0,28.0,12.0,6.0,4.0,388.0,52.0,29.0,58.0,100.0,108.0,196.0,38.0,120.0


Se han encontrado posibles valores extremos en:
- Transportation_expense (máx. 388), la media es 221. Esto sugiere que algunos empleados tienen gastos de transporte muy por encima del promedio. Podrían ser outliers, o casos especiales (viven lejos, taxis, etc.).
- Absenteeism_hours (máx. 120)
- Work_load_Average_day (máx. 310), muy cercano al máximo permitido en carga diaria

# Estandarización de formatos

In [8]:
df_rrhh['Month_absence'].value_counts()

Month_absence
3     87
2     72
10    71
7     67
5     64
11    63
6     54
8     54
4     53
9     53
1     50
12    49
0      3
Name: count, dtype: int64

En la columna Month_absence se ha detectado el valor 0, que no corresponde a ningún mes válido (1–12). Solo hay 3 registros con este valor. Clasificado como " mes desconocido"


In [40]:
# Convertir la columna a tipo string para permitir texto
df_rrhh['Month_absence'] = df_rrhh['Month_absence'].astype(str)

# Reemplazar '0' por 'mes desconocido'
df_rrhh['Month_absence'] = df_rrhh['Month_absence'].replace('0', 'mes desconocido')


In [41]:
df_rrhh['Month_absence'].value_counts()


Month_absence
3                  85
10                 71
2                  68
7                  67
5                  64
11                 63
6                  54
8                  54
9                  53
4                  51
1                  50
12                 49
mes desconocido     3
Name: count, dtype: int64

In [9]:
df_rrhh['Disciplinary_failure'].value_counts()
# está ok

Disciplinary_failure
0    700
1     40
Name: count, dtype: int64

In [10]:
df_rrhh['Education'].value_counts()
# está ok

Education
1    611
3     79
2     46
4      4
Name: count, dtype: int64

In [11]:
df_rrhh['Social_drinker'].value_counts()
# está ok

Social_drinker
1    420
0    320
Name: count, dtype: int64

In [12]:
df_rrhh['Social_smoker'].value_counts()
# está ok

Social_smoker
0    686
1     54
Name: count, dtype: int64