In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error, r2_score

import tensorflow as tf


import numpy as np

import tensorflow as tf

In [26]:
### analisis breve de los datos contenidos en la base de datos ### 

url = 'https://raw.githubusercontent.com/Mauascm/Model_PayEmUP/main/WellCleanedSalaries.csv'
data = pd.read_csv(url)

#### tomando solamente una muestra de los datos para tener una prueba rápida.
data = data.sample(frac=0.1, random_state=42)

# Ver las primeras filas de los datos
print(data.head())

# Ver información general sobre los datos
print(data.info())

# Ver estadísticas descriptivas de las variables numéricas
print(data.describe())

# Ver la cantidad de valores únicos en cada columna
print(data.nunique())

# Ver la cantidad de valores nulos en cada columna
print(data.isnull().sum())

# Eliminar las filas con valores nulos
##data = data.dropna()

# Reemplazar los valores NA con la moda de la columna
for column in data.columns:
    data[column].fillna(data[column].mode()[0], inplace=True)


### ---------------------------- ###

               CASE_NUMBER          CASE_STATUS CASE_RECEIVED_DATE   
34190   I-200-13267-221220            certified          9/24/2013  \
137917  I-200-13039-257242            certified           2/8/2013   
65281   I-200-13262-493824            certified         10/29/2013   
27111        A-14162-77222    certified-expired          8/11/2014   
137879  I-200-14190-296562  certified-withdrawn           7/9/2014   

       DECISION_DATE            EMPLOYER_NAME PREVAILING_WAGE_SUBMITTED_UNIT   
34190     10/23/2013     JPMORGAN CHASE & CO.                           year  \
137917     2/14/2013      IGT SOLUTIONS, INC.                           year   
65281      11/7/2013           EMC CORPORTION                           year   
27111     12/30/2014      CISCO SYSTEMS, INC.                           year   
137879     1/12/2015  OREGON STATE UNIVERSITY                           year   

        PREVAILING_WAGE_SUBMITTED                 JOB_TITLE   WORK_CITY   
34190                  

In [27]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

"""
Columnas socio-demográficas:

EMPLOYER_NAME
WORK_CITY
COUNTRY_OF_CITIZENSHIP
WORK_STATE
WORK_POSTAL_CODE --> Deleted
FULL_TIME_POSITION_Y_N
VISA_CLASS


Columnas académicas o profesionales:

PREVAILING_WAGE_SUBMITTED
PREVAILING_WAGE_SUBMITTED_UNIT
PAID_WAGE_SUBMITTED --> Deleted
PAID_WAGE_SUBMITTED_UNIT --> Deleted
JOB_TITLE
EDUCATION_LEVEL_REQUIRED
COLLEGE_MAJOR_REQUIRED
EXPERIENCE_REQUIRED_Y_N --> Deleted 
EXPERIENCE_REQUIRED_NUM_MONTHS
PREVAILING_WAGE_SOC_CODE
PREVAILING_WAGE_SOC_TITLE
PREVAILING_WAGE_PER_YEAR
PAID_WAGE_PER_YEAR
JOB_TITLE_SUBGROUP

"""

#socio_demographic_cols = ['CASE_STATUS', 'EMPLOYER_NAME', 'PREVAILING_WAGE_SUBMITTED', 'PREVAILING_WAGE_SUBMITTED_UNIT', 'WORK_CITY', 'WORK_STATE', 'FULL_TIME_POSITION_Y_N', 'VISA_CLASS']
#academic_cols = ['PREVAILING_WAGE_SOC_CODE', 'PREVAILING_WAGE_SOC_TITLE', 'JOB_TITLE_SUBGROUP']

# Definir las columnas socio-demográficas y académicas
socio_demographic_cols = ['EMPLOYER_NAME', 'WORK_CITY', 'COUNTRY_OF_CITIZENSHIP', 'WORK_STATE', 'FULL_TIME_POSITION_Y_N', 'VISA_CLASS']
academic_cols = ['PREVAILING_WAGE_SUBMITTED', 'PREVAILING_WAGE_SUBMITTED_UNIT', 'JOB_TITLE', 'EDUCATION_LEVEL_REQUIRED', 'COLLEGE_MAJOR_REQUIRED', 'EXPERIENCE_REQUIRED_NUM_MONTHS', 'PREVAILING_WAGE_SOC_CODE', 'PREVAILING_WAGE_SOC_TITLE', 'PREVAILING_WAGE_PER_YEAR', 'PAID_WAGE_PER_YEAR', 'JOB_TITLE_SUBGROUP']


# Crear los transformadores para las columnas numéricas y categóricas
num_transformer = StandardScaler()
cat_transformer = OneHotEncoder(drop='first', handle_unknown='ignore')

# Crear un preprocesador que aplique las transformaciones a las columnas correspondientes
preprocessor_socio_demographic = ColumnTransformer(
    transformers=[
        ('cat', cat_transformer, socio_demographic_cols)])

preprocessor_academic = ColumnTransformer(
    transformers=[
        ('num', num_transformer, ['PREVAILING_WAGE_SUBMITTED']),
        ('cat', cat_transformer, academic_cols)])

# Crear un pipeline que aplique el preprocesador, el transformador polinomial y luego ajuste el modelo
pipeline_socio_demographic = Pipeline(steps=[('preprocessor', preprocessor_socio_demographic),
                                              ('regressor', RandomForestRegressor())])

pipeline_academic = Pipeline(steps=[('preprocessor', preprocessor_academic),
                                    ('regressor', RandomForestRegressor())])


# La variable objetivo es el salario pagado
salary = data['PAID_WAGE_PER_YEAR']

# Dividir los datos en conjuntos de entrenamiento y prueba
X_train_socio_demographic, X_test_socio_demographic, y_train_socio_demographic, y_test_socio_demographic = train_test_split(data[socio_demographic_cols], salary, test_size=0.2, random_state=42)
X_train_academic, X_test_academic, y_train_academic, y_test_academic = train_test_split(data[academic_cols], salary, test_size=0.2, random_state=42)

print('Datos preprocesados y divididos en conjuntos de entrenamiento y prueba.')


Datos preprocesados y divididos en conjuntos de entrenamiento y prueba.


In [28]:
# Entrenar los modelos
pipeline_socio_demographic.fit(X_train_socio_demographic, y_train_socio_demographic)
y_pred_socio_demographic = pipeline_socio_demographic.predict(X_test_socio_demographic)

pipeline_academic.fit(X_train_academic, y_train_academic)
y_pred_academic = pipeline_academic.predict(X_test_academic)

# Calcular el error cuadrático medio
mse_socio_demographic = mean_squared_error(y_test_socio_demographic, y_pred_socio_demographic)
mse_academic = mean_squared_error(y_test_academic, y_pred_academic)

print('Error cuadrático medio para el modelo socio-demográfico:', mse_socio_demographic)
print('Error cuadrático medio para el modelo académico:', mse_academic)

# Calcular RMSE
rmse_socio_demographic = np.sqrt(mse_socio_demographic)
rmse_academic = np.sqrt(mse_academic)

# Calcular MAE
mae_socio_demographic = mean_absolute_error(y_test_socio_demographic, y_pred_socio_demographic)
mae_academic = mean_absolute_error(y_test_academic, y_pred_academic)

# Calcular R²
r2_socio_demographic = r2_score(y_test_socio_demographic, y_pred_socio_demographic)
r2_academic = r2_score(y_test_academic, y_pred_academic)

print('RMSE para el modelo socio-demográfico:', rmse_socio_demographic)
print('RMSE para el modelo académico:', rmse_academic)

print('MAE para el modelo socio-demográfico:', mae_socio_demographic)
print('MAE para el modelo académico:', mae_academic)

print('R² para el modelo socio-demográfico:', r2_socio_demographic)
print('R² para el modelo académico:', r2_academic)



Error cuadrático medio para el modelo socio-demográfico: 799195053.3772154
Error cuadrático medio para el modelo académico: 388855951.28872114
RMSE para el modelo socio-demográfico: 28270.03808588194
RMSE para el modelo académico: 19719.430805393982
MAE para el modelo socio-demográfico: 16287.463843567852
MAE para el modelo académico: 5849.794143365213
R² para el modelo socio-demográfico: 0.34129851283637436
R² para el modelo académico: 0.679502529046047




In [32]:
# %%
# Start counters
count_academic = 0
count_socio_demographic = 0
count_same = 0
totales = 0

# Iterate over each row of the test data
for i in range(len(y_test_academic)):
    # Obtain the predictions for each model
    pred_academic = y_pred_academic[i]
    pred_socio_demographic = y_pred_socio_demographic[i]

    # Obtain the real salary
    real_salary = y_test_academic.iloc[i]
    print(real_salary)
    print(pred_academic)
    print(pred_socio_demographic)

    # Compare the predictions and the real salary to see which model is better
    if pred_academic > pred_socio_demographic and pred_academic > real_salary:
        count_academic += 1
    elif pred_socio_demographic > pred_academic and pred_socio_demographic > real_salary:
        count_socio_demographic += 1
    else:
        count_same += 1
totales = count_academic + count_socio_demographic + count_same

# Show the results (Quantity + Percentage)
print('Veces que eligió el sueldo académico:', count_academic, '-->', count_academic / totales * 100, '%')
print('Veces que eligió el sueldo socio-demográfico:', count_socio_demographic, '-->',
      count_socio_demographic / totales * 100, '%')
print('Veces que se quedó con su mismo sueldo:', count_same, '-->', count_same / totales * 100, '%')

#%%

49774.4
84603.7296
67169.0634153846
128773.0
114756.351
68628.30961333333
73861.0
75547.1592
88063.65764693636
112757.0
113157.72
60904.126
100000.0
100000.0
120649.72
34819.0
34825.066799999986
115755.28636170212
95000.0
81376.09
81943.18
57000.0
47771.92
65268.670773809536
80000.0
80000.0
103538.18
100000.0
86531.27
68812.15875
51000.0
71573.2292
61003.17571428571
110969.0
88120.66
109983.19900000001
64000.0
63854.32
70078.62
88587.0
89155.69
90555.96833333332
76117.0
76267.63
60543.97
154000.0
88090.3316
102246.09666666666
60000.0
60000.0
76647.5
73923.0
73923.0
103538.18
60000.0
60000.0
67521.21251068376
75000.0
75000.0
74416.74881493507
80000.0
80001.5
68123.46822222223
80000.0
80241.6892
62110.65
150000.0
140981.18
107215.99222222224
43460.0
39190.06
66304.89676190476
62000.0
62005.22
69848.985
155000.0
147950.23
96404.73836666666
66000.0
66040.0008
68628.30961333333
40768.0
43681.7287
140089.90483730158
67288.0
67508.6
83335.22503994309
55000.0
55149.88
62057.37761904762
107000.

In [114]:
import gym
from gym import spaces
import numpy as np

# Create the environment
class JobChangeEnv(gym.Env):
    def __init__(self, data, pipeline_socio_demographic, pipeline_academic):
        super(JobChangeEnv, self).__init__()

        # Define action and observation space
        self.action_space = spaces.Discrete(3)
        self.observation_space = spaces.Box(low=0, high=100, shape=(2,))

        # Store the data and models
        self.data = data
        self.pipeline_socio_demographic = pipeline_socio_demographic
        self.pipeline_academic = pipeline_academic

        # Initialize the current index to the first row of the data
        self.current_index = 0
    
    def step(self, action):
        # Get the current row of data
        current_row = self.data.iloc[[self.current_index]]

        # Get the current salary
        current_salary = current_row['PAID_WAGE_PER_YEAR'].values[0]

        # Get the features for the models
        socio_demographic_features = current_row[socio_demographic_cols]
        academic_features = current_row[academic_cols]

        # Get the new salary offers from the models
        socio_demographic_offer = self.pipeline_socio_demographic.predict(socio_demographic_features)[0]
        academic_offer = self.pipeline_academic.predict(academic_features)[0]


        # Calculate the reward based on the action taken by the agent
        if action == 0:  # Stay with the current job
            reward = current_salary
        elif action == 1:  # Take the socio-demographic job offer
            reward = socio_demographic_offer
        else:  # Take the academic job offer
            reward = academic_offer

        # Update the current index
        self.current_index += 1

        # Check if we have reached the end of the data
        done = self.current_index >= len(self.data)
        
        return np.array([[current_salary, socio_demographic_offer]]), reward, done, {}


    def reset(self):
    # Reset the state of the environment to an initial state
        self.current_index = 0
        return np.array(self.data.iloc[self.current_index]['PAID_WAGE_PER_YEAR'])


    def render(self, mode='human'):
        pass

In [117]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import random
from collections import deque

class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95    # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.model = self._build_model()

    def _build_model(self):
        model = Sequential()
        model.add(Dense(24, input_shape=(1,), activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
        return model


    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)

        act_values = self.model.predict(np.expand_dims(state, axis=0))
        return np.argmax(act_values[0])

    def replay(self, batch_size):
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            state = np.reshape(state, [1, state_size])
            next_state = np.array(next_state).astype('float32')
            target = self.model.predict(np.expand_dims(state, axis=0))
            if done:
                target[0][action] = reward
            else:
                Q_future = max(self.model.predict(np.expand_dims(next_state, axis=0))[0])
                target[0][action] = reward + Q_future * self.gamma
            self.model.train_on_batch(np.expand_dims(state, axis=0), target)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay



In [119]:
actions = list(pipeline_academic.named_steps.keys())
state_size = (1,)
action_size = len(actions)
agent = DQNAgent(state_size, action_size)
done = False
batch_size = 32
max_steps_per_episode = 10

n_episodes = 1000  # Set the desired number of episodes


# Training loop
for e in range(n_episodes):
    state = env.reset()
    state = np.reshape(state, state_size)
    for time in range(max_steps_per_episode):
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        next_state = np.reshape(next_state, state_size)
        agent.remember(state, action, reward, next_state, done)
        state = next_state
        if done:
            print("Episode: {}/{}, score: {}".format(e + 1, n_episodes, time))
            break
    if len(agent.memory) > batch_size:
        agent.replay(batch_size)






ValueError: cannot reshape array of size 3 into shape (1,)