In [12]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer


import numpy as np

import tensorflow as tf

In [13]:
### analisis breve de los datos contenidos en la base de datos ### 

url = 'https://raw.githubusercontent.com/Mauascm/Model_PayEmUP/main/Salary2USA.csv'
data = pd.read_csv(url)

#### tomando solamente una muestra de los datos para tener una prueba rápida.
data = data.sample(frac=0.25, random_state=42)

# Ver las primeras filas de los datos
print(data.head())

# Ver información general sobre los datos
print(data.info())

# Ver estadísticas descriptivas de las variables numéricas
print(data.describe())

# Ver la cantidad de valores únicos en cada columna
print(data.nunique())


### ---------------------------- ###

               CASE_NUMBER          CASE_STATUS CASE_RECEIVED_DATE   
34190   I-200-13267-221220            certified          9/24/2013  \
137917  I-200-13039-257242            certified           2/8/2013   
65281   I-200-13262-493824            certified         10/29/2013   
27111        A-14162-77222    certified-expired          8/11/2014   
137879  I-200-14190-296562  certified-withdrawn           7/9/2014   

       DECISION_DATE            EMPLOYER_NAME  PREVAILING_WAGE_SUBMITTED   
34190     10/23/2013     JPMORGAN CHASE & CO.                    94890.0  \
137917     2/14/2013      IGT SOLUTIONS, INC.                    50960.0   
65281      11/7/2013           EMC CORPORTION                    77937.0   
27111     12/30/2014      CISCO SYSTEMS, INC.                    98927.0   
137879     1/12/2015  OREGON STATE UNIVERSITY                    51000.0   

       PREVAILING_WAGE_SUBMITTED_UNIT  PAID_WAGE_SUBMITTED   
34190                            year             115000.0  

In [18]:
# Definir las columnas socio-demográficas y académicas
socio_demographic_cols = ['CASE_STATUS', 'EMPLOYER_NAME', 'PREVAILING_WAGE_SUBMITTED', 'PREVAILING_WAGE_SUBMITTED_UNIT', 'WORK_CITY', 'COUNTRY_OF_CITIZENSHIP', 'WORK_STATE', 'WORK_POSTAL_CODE', 'FULL_TIME_POSITION_Y_N', 'VISA_CLASS']
academic_cols = ['EDUCATION_LEVEL_REQUIRED', 'COLLEGE_MAJOR_REQUIRED', 'EXPERIENCE_REQUIRED_Y_N', 'EXPERIENCE_REQUIRED_NUM_MONTHS', 'PREVAILING_WAGE_SOC_CODE', 'PREVAILING_WAGE_SOC_TITLE', 'JOB_TITLE_SUBGROUP']

# Crear los transformadores para las columnas numéricas y categóricas
num_transformer = StandardScaler()
cat_transformer = OneHotEncoder(drop='first')

# Crear un preprocesador que aplique las transformaciones a las columnas correspondientes
preprocessor_socio_demographic = ColumnTransformer(
    transformers=[
        ('num', num_transformer, ['PREVAILING_WAGE_SUBMITTED']),
        ('cat', cat_transformer, socio_demographic_cols)])

preprocessor_academic = ColumnTransformer(
    transformers=[
        ('num', num_transformer, ['EXPERIENCE_REQUIRED_NUM_MONTHS']),
        ('cat', cat_transformer, academic_cols)])

# Crear un pipeline que aplique el preprocesador y luego ajuste el modelo
pipeline_socio_demographic = Pipeline(steps=[('preprocessor', preprocessor_socio_demographic)])
pipeline_academic = Pipeline(steps=[('preprocessor', preprocessor_academic)])

# Aplicar las transformaciones a los datos
data_preprocessed_socio_demographic = pipeline_socio_demographic.fit_transform(data[socio_demographic_cols])
data_preprocessed_academic = pipeline_academic.fit_transform(data[academic_cols])

# La variable objetivo es el salario pagado
salary = data['PAID_WAGE_PER_YEAR']

# Dividir los datos en conjuntos de entrenamiento y prueba
X_train_socio_demographic, X_test_socio_demographic, y_train_socio_demographic, y_test_socio_demographic = train_test_split(data_preprocessed_socio_demographic, salary, test_size=0.2, random_state=42)
X_train_academic, X_test_academic, y_train_academic, y_test_academic = train_test_split(data_preprocessed_academic, salary, test_size=0.2, random_state=42)

print('Datos preprocesados y divididos en conjuntos de entrenamiento y prueba.')



Datos preprocesados y divididos en conjuntos de entrenamiento y prueba.


In [21]:
# Crear un imputador que rellena los valores faltantes con la media
imputer = SimpleImputer(strategy='mean')

# Aplicar el imputador a los datos de entrenamiento
X_train_socio_demographic_imputed = imputer.fit_transform(X_train_socio_demographic)
X_train_academic_imputed = imputer.fit_transform(X_train_academic)

# Crear los modelos
model_socio_demographic = LinearRegression()
model_academic = LinearRegression()

# Entrenar los modelos
model_socio_demographic.fit(X_train_socio_demographic_imputed, y_train_socio_demographic)
model_academic.fit(X_train_academic_imputed, y_train_academic)

# Evaluar los modelos
train_score_socio_demographic = model_socio_demographic.score(X_train_socio_demographic_imputed, y_train_socio_demographic)
test_score_socio_demographic = model_socio_demographic.score(X_test_socio_demographic, y_test_socio_demographic)

train_score_academic = model_academic.score(X_train_academic_imputed, y_train_academic)
test_score_academic = model_academic.score(X_test_academic, y_test_academic)

print(f'Exactitud del modelo socio-demográfico en el conjunto de entrenamiento: {train_score_socio_demographic:.2f}')
print(f'Exactitud del modelo socio-demográfico en el conjunto de prueba: {test_score_socio_demographic:.2f}')

print(f'Exactitud del modelo académico en el conjunto de entrenamiento: {train_score_academic:.2f}')
print(f'Exactitud del modelo académico en el conjunto de prueba: {test_score_academic:.2f}')

ValueError: Input X contains NaN.
LinearRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values