## <font color='157699'> Librerías

In [1]:
import warnings
warnings.filterwarnings("ignore")
warnings.filterwarnings("default")

from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression,  f_classif, mutual_info_classif, chi2
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegressionCV, Ridge

import pandas as pd
import a_funciones as funciones

## <font color='157699'> Importación de datos

In [2]:
# Datos del 2015
df_2015 = pd.read_csv("processed_data_2015.csv", sep =",")
# Datos del 2016
df_2016 = pd.read_csv("processed_data_2016.csv", sep =",")

In [3]:
df_2015['EmployeeID'] = df_2015['EmployeeID'].astype(str)
df_2015['PercentSalaryHike'] = df_2015['PercentSalaryHike'] / 100

## <font color='157699'> Eliminación de variables

In [4]:
df_2015.drop(['PerformanceRating', 'EnvironmentSatisfaction', 'JobInvolvement', 'retiro_2016'], axis=1, inplace=True)

### <font color='157699'> Dumizar variables categóricas

In [5]:
df_2015['JobSatisfaction'] = df_2015['JobSatisfaction'].astype(str)
df_2015['WorkLifeBalance'] = df_2015['WorkLifeBalance'].astype(str)
df_2015['NumCompaniesWorked'] = df_2015['NumCompaniesWorked'].astype(int)
df_2015['Education'] = df_2015['Education'].astype(str)

In [6]:
columnas_dumizar = ['BusinessTravel', 'Department', 'JobRole',
                    'JobSatisfaction', 'WorkLifeBalance', 'Education']


df_dummy = pd.get_dummies(df_2015[columnas_dumizar])
df_dummy.head()

Unnamed: 0,BusinessTravel_Non-Travel,BusinessTravel_Travel_Frequently,BusinessTravel_Travel_Rarely,Department_Human Resources,Department_Research & Development,Department_Sales,JobRole_Healthcare Representative,JobRole_Human Resources,JobRole_Laboratory Technician,JobRole_Manager,...,JobSatisfaction_4.0,WorkLifeBalance_1.0,WorkLifeBalance_2.0,WorkLifeBalance_3.0,WorkLifeBalance_4.0,Education_1,Education_2,Education_3,Education_4,Education_5
0,0,0,1,0,0,1,1,0,0,0,...,1,0,1,0,0,0,1,0,0,0
1,0,1,0,0,1,0,0,0,0,0,...,0,0,0,0,1,1,0,0,0,0
2,0,1,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
3,1,0,0,0,1,0,0,1,0,0,...,1,0,0,1,0,0,0,0,0,1
4,0,0,1,0,1,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0


In [7]:
y = df_2015['renuncia2016']

In [8]:
x_numeric = df_2015.select_dtypes(int)
del x_numeric['renuncia2016']

In [9]:
df_normalizada = funciones.normalize_dataframe(x_numeric)

In [10]:
df_final = pd.concat([df_normalizada, df_dummy], axis=1)
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4308 entries, 0 to 4307
Data columns (total 35 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Age                                4308 non-null   float64
 1   DistanceFromHome                   4308 non-null   float64
 2   MonthlyIncome                      4308 non-null   float64
 3   NumCompaniesWorked                 4308 non-null   float64
 4   TrainingTimesLastYear              4308 non-null   float64
 5   YearsAtCompany                     4308 non-null   float64
 6   YearsSinceLastPromotion            4308 non-null   float64
 7   BusinessTravel_Non-Travel          4308 non-null   uint8  
 8   BusinessTravel_Travel_Frequently   4308 non-null   uint8  
 9   BusinessTravel_Travel_Rarely       4308 non-null   uint8  
 10  Department_Human Resources         4308 non-null   uint8  
 11  Department_Research & Development  4308 non-null   uint8

## <font color='056938'>Métodos Wrapper 

### <font color='157699'> RFE (Recursive Feature Elimination)

In [11]:
# Función recursiva de selección de características

def recursive_feature_selection(X,y,model,k): # model=modelo que me va a servir de estimador en este caso de regresión logística
  rfe = RFE(model, n_features_to_select=k, step=1)# step=1 cada cuanto el toma la sucesión de tomar una caracteristica
  fit = rfe.fit(X, y)
  X_new = fit.support_
  print("Num Features: %s" % (fit.n_features_))
  print("Selected Features: %s" % (fit.support_))
  print("Feature Ranking: %s" % (fit.ranking_))

  return X_new

In [12]:

#  Estimador en este caso para regresión logística (problema de clasificación binaria)
model = LogisticRegressionCV()

# Obtener columnas seleciconadas
X_new = recursive_feature_selection(df_final, y, model, 28)

# Nuevo conjunto de datos
df_new = df_final.iloc[:,X_new]
df_new.head()

Num Features: 28
Selected Features: [ True False  True  True  True  True  True  True  True False  True  True
  True  True False  True  True  True  True  True  True  True  True False
 False  True  True  True  True  True False  True  True False  True]
Feature Ranking: [1 7 1 1 1 1 1 1 1 3 1 1 1 1 4 1 1 1 1 1 1 1 1 6 5 1 1 1 1 1 8 1 1 2 1]


Unnamed: 0,Age,MonthlyIncome,NumCompaniesWorked,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,BusinessTravel_Non-Travel,BusinessTravel_Travel_Frequently,Department_Human Resources,Department_Research & Development,...,JobRole_Sales Representative,JobSatisfaction_1.0,JobSatisfaction_4.0,WorkLifeBalance_1.0,WorkLifeBalance_2.0,WorkLifeBalance_3.0,WorkLifeBalance_4.0,Education_2,Education_3,Education_5
0,0.785714,0.637546,0.111111,1.0,0.025,0.0,0,0,0,0,...,0,0,1,0,1,0,0,1,0,0
1,0.309524,0.167457,0.0,0.5,0.125,0.066667,0,1,0,1,...,0,0,0,0,0,0,1,0,0,0
2,0.333333,0.964666,0.111111,0.333333,0.125,0.0,0,1,0,1,...,0,0,0,1,0,0,0,0,0,0
3,0.47619,0.385045,0.333333,0.833333,0.2,0.466667,1,0,0,1,...,0,0,1,0,0,1,0,0,0,1
4,0.333333,0.070195,0.444444,0.333333,0.15,0.0,0,0,0,1,...,0,1,0,0,0,1,0,0,0,0


In [13]:
df_new.columns

Index(['Age', 'MonthlyIncome', 'NumCompaniesWorked', 'TrainingTimesLastYear',
       'YearsAtCompany', 'YearsSinceLastPromotion',
       'BusinessTravel_Non-Travel', 'BusinessTravel_Travel_Frequently',
       'Department_Human Resources', 'Department_Research & Development',
       'Department_Sales', 'JobRole_Healthcare Representative',
       'JobRole_Laboratory Technician', 'JobRole_Manager',
       'JobRole_Manufacturing Director', 'JobRole_Research Director',
       'JobRole_Research Scientist', 'JobRole_Sales Executive',
       'JobRole_Sales Representative', 'JobSatisfaction_1.0',
       'JobSatisfaction_4.0', 'WorkLifeBalance_1.0', 'WorkLifeBalance_2.0',
       'WorkLifeBalance_3.0', 'WorkLifeBalance_4.0', 'Education_2',
       'Education_3', 'Education_5'],
      dtype='object')

In [14]:
dummies = df_new[['BusinessTravel_Non-Travel', 'BusinessTravel_Travel_Frequently',
       'Department_Human Resources', 'Department_Research & Development',
       'Department_Sales', 'JobRole_Healthcare Representative',
       'JobRole_Laboratory Technician', 'JobRole_Manager',
       'JobRole_Manufacturing Director', 'JobRole_Research Director',
       'JobRole_Research Scientist', 'JobRole_Sales Executive',
       'JobRole_Sales Representative', 'JobSatisfaction_1.0',
       'JobSatisfaction_4.0', 'WorkLifeBalance_1.0', 'WorkLifeBalance_2.0',
       'WorkLifeBalance_3.0', 'WorkLifeBalance_4.0', 'Education_2',
       'Education_3', 'Education_5']]

numericas = x_numeric[['Age', 'DistanceFromHome', 'MonthlyIncome', 'TrainingTimesLastYear',
       'YearsAtCompany', 'YearsSinceLastPromotion']]
numericas

Unnamed: 0,Age,DistanceFromHome,MonthlyIncome,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion
0,51,6,131160,6,1,0
1,31,10,41890,3,5,1
2,32,17,193280,2,5,0
3,38,2,83210,5,8,7
4,32,10,23420,2,6,0
...,...,...,...,...,...,...
4303,29,7,21800,2,4,0
4304,33,11,71400,2,5,0
4305,33,1,51470,2,9,1
4306,32,23,24680,2,3,1


In [15]:
# Guardar los resultados en archivos CSV
dummies.to_csv('v_cat.csv', index=False)
numericas.to_csv('v_num.csv', index=False)

# Imprimir la ubicación de los archivos generados
print("Los archivos se guardaron como:")
print("v_cat.csv")
print("v_num.csv")

Los archivos se guardaron como:
v_cat.csv
v_num.csv
