In [1]:
# Importa las librerías
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.decomposition import PCA

In [2]:
# Carga el archivo CSV
EmpleadosAttrition = pd.read_csv("/content/empleadosRETO.csv")  # Verifica la ruta si es necesario
EmpleadosAttrition.head()  # Muestra las primeras filas para verificar

Unnamed: 0,Age,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,Gender,...,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsInCurrentRole,YearsSinceLastPromotion,Attrition
0,50,Travel_Rarely,Research & Development,1 km,2,Medical,1,997,4,Male,...,22,4,3,80,32,1,2,4,1,No
1,36,Travel_Rarely,Research & Development,6 km,2,Medical,1,178,2,Male,...,20,4,4,80,7,0,3,2,0,No
2,21,Travel_Rarely,Sales,7 km,1,Marketing,1,1780,2,Male,...,13,3,2,80,1,3,3,0,1,Yes
3,52,Travel_Rarely,Research & Development,7 km,4,Life Sciences,1,1118,2,Male,...,19,3,4,80,18,4,3,6,4,No
4,33,Travel_Rarely,Research & Development,15 km,1,Medical,1,582,2,Male,...,12,3,4,80,15,2,4,6,7,Yes


In [3]:
# Elimina columnas irrelevantes
EmpleadosAttrition.drop(columns=["EmployeeCount", "EmployeeNumber", "Over18", "StandardHours"], inplace=True)

In [6]:
# Detecta el formato de fecha automáticamente
EmpleadosAttrition['HiringDate'] = pd.to_datetime(EmpleadosAttrition['HiringDate'], infer_datetime_format=True, errors='coerce')
EmpleadosAttrition['Year'] = EmpleadosAttrition['HiringDate'].dt.year

  EmpleadosAttrition['HiringDate'] = pd.to_datetime(EmpleadosAttrition['HiringDate'], infer_datetime_format=True, errors='coerce')


In [7]:
# Convierte HiringDate a fecha y crea columna Year
EmpleadosAttrition['HiringDate'] = pd.to_datetime(EmpleadosAttrition['HiringDate'])
EmpleadosAttrition['Year'] = EmpleadosAttrition['HiringDate'].dt.year

In [8]:
# Calcula YearsAtCompany
EmpleadosAttrition['YearsAtCompany'] = 2018 - EmpleadosAttrition['Year']

In [9]:
# Renombra DistanceFromHome y elimina "km"
EmpleadosAttrition['DistanceFromHome_km'] = EmpleadosAttrition['DistanceFromHome']
EmpleadosAttrition['DistanceFromHome'] = EmpleadosAttrition['DistanceFromHome_km'].str.replace("km", "").astype(int)
EmpleadosAttrition.drop(columns=['Year', 'HiringDate', 'DistanceFromHome_km'], inplace=True)

In [10]:
# Calcula el salario promedio por departamento
SueldoPromedioDepto = EmpleadosAttrition.groupby('Department')['MonthlyIncome'].mean().reset_index()
SueldoPromedioDepto.columns = ['Department', 'SueldoPromedio']
SueldoPromedioDepto  # Muestra el resultado

Unnamed: 0,Department,SueldoPromedio
0,Human Resources,6239.888889
1,Research & Development,6804.149813
2,Sales,7188.25


In [11]:
# Escala MonthlyIncome entre 0 y 1
scaler = MinMaxScaler()
EmpleadosAttrition['MonthlyIncome'] = scaler.fit_transform(EmpleadosAttrition[['MonthlyIncome']])

In [12]:
# Convierte variables categóricas a numéricas
label_encoders = {}
for column in ['BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus', 'Attrition']:
    le = LabelEncoder()
    EmpleadosAttrition[column] = le.fit_transform(EmpleadosAttrition[column])
    label_encoders[column] = le  # Guarda el encoder para referencia

In [14]:
# Muestra los tipos de datos de cada columna
print(EmpleadosAttrition.dtypes)

Age                           int64
BusinessTravel                int64
Department                    int64
DistanceFromHome              int64
Education                     int64
EducationField                int64
EnvironmentSatisfaction       int64
Gender                        int64
JobInvolvement                int64
JobLevel                      int64
JobRole                       int64
JobSatisfaction               int64
MaritalStatus                 int64
MonthlyIncome               float64
NumCompaniesWorked            int64
OverTime                     object
PercentSalaryHike             int64
PerformanceRating             int64
RelationshipSatisfaction      int64
TotalWorkingYears             int64
TrainingTimesLastYear         int64
WorkLifeBalance               int64
YearsInCurrentRole            int64
YearsSinceLastPromotion       int64
Attrition                     int64
YearsAtCompany              float64
dtype: object


In [15]:
from sklearn.preprocessing import LabelEncoder

# Lista de columnas categóricas que deben ser convertidas
categorical_columns = ['BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus', 'Attrition']

# Convierte cada columna categórica a numérico
for column in categorical_columns:
    if EmpleadosAttrition[column].dtype == 'object':  # Solo aplica si la columna es de tipo object
        le = LabelEncoder()
        EmpleadosAttrition[column] = le.fit_transform(EmpleadosAttrition[column])

In [18]:
# Selecciona solo las columnas numéricas
EmpleadosAttrition = EmpleadosAttrition.select_dtypes(include=[np.number])

In [19]:
# Verifica si hay valores nulos
print(EmpleadosAttrition.isnull().sum())

# Opcional: Llena los valores nulos con la media de la columna
EmpleadosAttrition.fillna(EmpleadosAttrition.mean(), inplace=True)

Age                         0
BusinessTravel              0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EnvironmentSatisfaction     0
Gender                      0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
NumCompaniesWorked          0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsInCurrentRole          0
YearsSinceLastPromotion     0
Attrition                   0
YearsAtCompany              1
dtype: int64


In [20]:
# Calcula la correlación después de asegurar que todas las columnas sean numéricas
correlation_matrix = EmpleadosAttrition.corr()
selected_features = correlation_matrix['Attrition'][abs(correlation_matrix['Attrition']) >= 0.1].index
EmpleadosAttritionFinal = EmpleadosAttrition[selected_features]

In [21]:
from sklearn.decomposition import PCA

# Aplica PCA al conjunto de datos, excluyendo la columna 'Attrition' (la variable de salida)
pca = PCA(n_components=0.8)
pca_components = pca.fit_transform(EmpleadosAttritionFinal.drop(columns=['Attrition']))

In [22]:
# Añadir componentes principales al DataFrame EmpleadosAttritionFinal
for i in range(pca_components.shape[1]):
    EmpleadosAttritionFinal[f'C{i}'] = pca_components[:, i]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  EmpleadosAttritionFinal[f'C{i}'] = pca_components[:, i]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  EmpleadosAttritionFinal[f'C{i}'] = pca_components[:, i]


In [23]:
# Guarda el DataFrame final en un archivo CSV
EmpleadosAttritionFinal.to_csv("EmpleadosAttritionFinal.csv", index=False)