In [30]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error, r2_score



import numpy as np

import tensorflow as tf

In [31]:
### analisis breve de los datos contenidos en la base de datos ### 

url = 'https://raw.githubusercontent.com/Mauascm/Model_PayEmUP/main/Salary2USA.csv'
data = pd.read_csv(url)

#### tomando solamente una muestra de los datos para tener una prueba rápida.
#data = data.sample(frac=0.5, random_state=42)

# Ver las primeras filas de los datos
print(data.head())

# Ver información general sobre los datos
print(data.info())

# Ver estadísticas descriptivas de las variables numéricas
print(data.describe())

# Ver la cantidad de valores únicos en cada columna
print(data.nunique())

# Ver la cantidad de valores nulos en cada columna
print(data.isnull().sum())

# Eliminar las filas con valores nulos
##data = data.dropna()

# Reemplazar los valores NA con la moda de la columna
for column in data.columns:
    data[column].fillna(data[column].mode()[0], inplace=True)


### ---------------------------- ###

          CASE_NUMBER CASE_STATUS CASE_RECEIVED_DATE DECISION_DATE   
0  I-200-14073-248840      denied          3/14/2014     3/21/2014  \
1       A-15061-55212      denied          3/19/2015     3/19/2015   
2  I-200-13256-001092      denied          9/13/2013     9/23/2013   
3  I-200-14087-353657      denied          3/28/2014      4/7/2014   
4  I-203-14259-128844      denied          9/16/2014     9/23/2014   

                                       EMPLOYER_NAME   
0                ADVANCED TECHNOLOGY GROUP USA, INC.  \
1                     SAN FRANCISCO STATE UNIVERSITY   
2                                    CAROUSEL SCHOOL   
3  HARLINGEN CONSOLIDATED INDEPENDENT SCHOOL DIST...   
4                        SIGNAL SCIENCES CORPORATION   

   PREVAILING_WAGE_SUBMITTED PREVAILING_WAGE_SUBMITTED_UNIT   
0                  6217100.0                           year  \
1                  5067600.0                           year   
2                  4947000.0                         

In [36]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

"""
Columnas socio-demográficas:

EMPLOYER_NAME
WORK_CITY
COUNTRY_OF_CITIZENSHIP
WORK_STATE
WORK_POSTAL_CODE
FULL_TIME_POSITION_Y_N
VISA_CLASS
Columnas académicas o profesionales:

PREVAILING_WAGE_SUBMITTED
PREVAILING_WAGE_SUBMITTED_UNIT
PAID_WAGE_SUBMITTED
PAID_WAGE_SUBMITTED_UNIT
JOB_TITLE
EDUCATION_LEVEL_REQUIRED
COLLEGE_MAJOR_REQUIRED
EXPERIENCE_REQUIRED_Y_N
EXPERIENCE_REQUIRED_NUM_MONTHS
PREVAILING_WAGE_SOC_CODE
PREVAILING_WAGE_SOC_TITLE
PREVAILING_WAGE_PER_YEAR
PAID_WAGE_PER_YEAR
JOB_TITLE_SUBGROUP

"""

#socio_demographic_cols = ['CASE_STATUS', 'EMPLOYER_NAME', 'PREVAILING_WAGE_SUBMITTED', 'PREVAILING_WAGE_SUBMITTED_UNIT', 'WORK_CITY', 'WORK_STATE', 'FULL_TIME_POSITION_Y_N', 'VISA_CLASS']
#academic_cols = ['PREVAILING_WAGE_SOC_CODE', 'PREVAILING_WAGE_SOC_TITLE', 'JOB_TITLE_SUBGROUP']

# Definir las columnas socio-demográficas y académicas
socio_demographic_cols = ['EMPLOYER_NAME', 'WORK_CITY', 'COUNTRY_OF_CITIZENSHIP', 'WORK_STATE', 'WORK_POSTAL_CODE', 'FULL_TIME_POSITION_Y_N', 'VISA_CLASS']
academic_cols = ['PREVAILING_WAGE_SUBMITTED', 'PREVAILING_WAGE_SUBMITTED_UNIT', 'PAID_WAGE_SUBMITTED', 'PAID_WAGE_SUBMITTED_UNIT', 'JOB_TITLE', 'EDUCATION_LEVEL_REQUIRED', 'COLLEGE_MAJOR_REQUIRED', 'EXPERIENCE_REQUIRED_Y_N', 'EXPERIENCE_REQUIRED_NUM_MONTHS', 'PREVAILING_WAGE_SOC_CODE', 'PREVAILING_WAGE_SOC_TITLE', 'PREVAILING_WAGE_PER_YEAR', 'PAID_WAGE_PER_YEAR', 'JOB_TITLE_SUBGROUP']


# Crear los transformadores para las columnas numéricas y categóricas
num_transformer = StandardScaler()
cat_transformer = OneHotEncoder(drop='first', handle_unknown='ignore')

# Crear un preprocesador que aplique las transformaciones a las columnas correspondientes
preprocessor_socio_demographic = ColumnTransformer(
    transformers=[
        ('cat', cat_transformer, socio_demographic_cols)])

preprocessor_academic = ColumnTransformer(
    transformers=[
        ('num', num_transformer, ['PREVAILING_WAGE_SUBMITTED']),
        ('cat', cat_transformer, academic_cols)])

# Crear un pipeline que aplique el preprocesador, el transformador polinomial y luego ajuste el modelo
pipeline_socio_demographic = Pipeline(steps=[('preprocessor', preprocessor_socio_demographic),
                                              ('regressor', RandomForestRegressor())])

pipeline_academic = Pipeline(steps=[('preprocessor', preprocessor_academic),
                                    ('regressor', RandomForestRegressor())])


# La variable objetivo es el salario pagado
salary = data['PAID_WAGE_PER_YEAR']

# Dividir los datos en conjuntos de entrenamiento y prueba
X_train_socio_demographic, X_test_socio_demographic, y_train_socio_demographic, y_test_socio_demographic = train_test_split(data[socio_demographic_cols], salary, test_size=0.2, random_state=42)
X_train_academic, X_test_academic, y_train_academic, y_test_academic = train_test_split(data[academic_cols], salary, test_size=0.2, random_state=42)

print('Datos preprocesados y divididos en conjuntos de entrenamiento y prueba.')


Datos preprocesados y divididos en conjuntos de entrenamiento y prueba.


In [37]:
# Entrenar los modelos
pipeline_socio_demographic.fit(X_train_socio_demographic, y_train_socio_demographic)
y_pred_socio_demographic = pipeline_socio_demographic.predict(X_test_socio_demographic)

pipeline_academic.fit(X_train_academic, y_train_academic)
y_pred_academic = pipeline_academic.predict(X_test_academic)

# Calcular el error cuadrático medio
mse_socio_demographic = mean_squared_error(y_test_socio_demographic, y_pred_socio_demographic)
mse_academic = mean_squared_error(y_test_academic, y_pred_academic)

print('Error cuadrático medio para el modelo socio-demográfico:', mse_socio_demographic)
print('Error cuadrático medio para el modelo académico:', mse_academic)

# Calcular RMSE
rmse_socio_demographic = np.sqrt(mse_socio_demographic)
rmse_academic = np.sqrt(mse_academic)

# Calcular MAE
mae_socio_demographic = mean_absolute_error(y_test_socio_demographic, y_pred_socio_demographic)
mae_academic = mean_absolute_error(y_test_academic, y_pred_academic)

# Calcular R²
r2_socio_demographic = r2_score(y_test_socio_demographic, y_pred_socio_demographic)
r2_academic = r2_score(y_test_academic, y_pred_academic)

print('RMSE para el modelo socio-demográfico:', rmse_socio_demographic)
print('RMSE para el modelo académico:', rmse_academic)

print('MAE para el modelo socio-demográfico:', mae_socio_demographic)
print('MAE para el modelo académico:', mae_academic)

print('R² para el modelo socio-demográfico:', r2_socio_demographic)
print('R² para el modelo académico:', r2_academic)




Error cuadrático medio para el modelo socio-demográfico: 748557086.49947
Error cuadrático medio para el modelo académico: 295720904.3180138
RMSE para el modelo socio-demográfico: 27359.77131665157
RMSE para el modelo académico: 17196.537567720246
MAE para el modelo socio-demográfico: 13354.383233562825
MAE para el modelo académico: 3343.697096042564
R² para el modelo socio-demográfico: 0.42699340527469964
R² para el modelo académico: 0.773631121221813
