In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error, r2_score

import tensorflow as tf


import numpy as np

import tensorflow as tf

In [2]:
### analisis breve de los datos contenidos en la base de datos ### 

url = 'https://raw.githubusercontent.com/Mauascm/Model_PayEmUP/main/WellCleanedSalaries.csv'
data = pd.read_csv(url)

#### tomando solamente una muestra de los datos para tener una prueba rápida.
#data = data.sample(frac=0.1, random_state=42)

# Ver las primeras filas de los datos
print(data.head())

# Ver información general sobre los datos
print(data.info())

# Ver estadísticas descriptivas de las variables numéricas
print(data.describe())

# Ver la cantidad de valores únicos en cada columna
print(data.nunique())

# Ver la cantidad de valores nulos en cada columna
print(data.isnull().sum())

# Eliminar las filas con valores nulos
##data = data.dropna()

# Reemplazar los valores NA con la moda de la columna
for column in data.columns:
    data[column].fillna(data[column].mode()[0], inplace=True)


### ---------------------------- ###

          CASE_NUMBER CASE_STATUS CASE_RECEIVED_DATE DECISION_DATE   
0  I-200-14073-248840      denied          3/14/2014     3/21/2014  \
1       A-15061-55212      denied          3/19/2015     3/19/2015   
2  I-200-13256-001092      denied          9/13/2013     9/23/2013   
3  I-200-14087-353657      denied          3/28/2014      4/7/2014   
4  I-203-14259-128844      denied          9/16/2014     9/23/2014   

                                       EMPLOYER_NAME   
0                ADVANCED TECHNOLOGY GROUP USA, INC.  \
1                     SAN FRANCISCO STATE UNIVERSITY   
2                                    CAROUSEL SCHOOL   
3  HARLINGEN CONSOLIDATED INDEPENDENT SCHOOL DIST...   
4                        SIGNAL SCIENCES CORPORATION   

  PREVAILING_WAGE_SUBMITTED_UNIT  PREVAILING_WAGE_SUBMITTED   
0                           year                   62171.00  \
1                           year                   50676.00   
2                           year                   49

In [3]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

"""
Columnas socio-demográficas:

EMPLOYER_NAME
WORK_CITY
COUNTRY_OF_CITIZENSHIP
WORK_STATE
WORK_POSTAL_CODE --> Deleted
FULL_TIME_POSITION_Y_N
VISA_CLASS


Columnas académicas o profesionales:

PREVAILING_WAGE_SUBMITTED
PREVAILING_WAGE_SUBMITTED_UNIT
PAID_WAGE_SUBMITTED --> Deleted
PAID_WAGE_SUBMITTED_UNIT --> Deleted
JOB_TITLE
EDUCATION_LEVEL_REQUIRED
COLLEGE_MAJOR_REQUIRED
EXPERIENCE_REQUIRED_Y_N --> Deleted 
EXPERIENCE_REQUIRED_NUM_MONTHS
PREVAILING_WAGE_SOC_CODE
PREVAILING_WAGE_SOC_TITLE
PREVAILING_WAGE_PER_YEAR
PAID_WAGE_PER_YEAR
JOB_TITLE_SUBGROUP

"""

#socio_demographic_cols = ['CASE_STATUS', 'EMPLOYER_NAME', 'PREVAILING_WAGE_SUBMITTED', 'PREVAILING_WAGE_SUBMITTED_UNIT', 'WORK_CITY', 'WORK_STATE', 'FULL_TIME_POSITION_Y_N', 'VISA_CLASS']
#academic_cols = ['PREVAILING_WAGE_SOC_CODE', 'PREVAILING_WAGE_SOC_TITLE', 'JOB_TITLE_SUBGROUP']

# Definir las columnas socio-demográficas y académicas
socio_demographic_cols = ['EMPLOYER_NAME', 'WORK_CITY', 'COUNTRY_OF_CITIZENSHIP', 'WORK_STATE', 'FULL_TIME_POSITION_Y_N', 'VISA_CLASS']
academic_cols = ['PREVAILING_WAGE_SUBMITTED', 'PREVAILING_WAGE_SUBMITTED_UNIT', 'JOB_TITLE', 'EDUCATION_LEVEL_REQUIRED', 'COLLEGE_MAJOR_REQUIRED', 'EXPERIENCE_REQUIRED_NUM_MONTHS', 'PREVAILING_WAGE_SOC_CODE', 'PREVAILING_WAGE_SOC_TITLE', 'PREVAILING_WAGE_PER_YEAR', 'PAID_WAGE_PER_YEAR', 'JOB_TITLE_SUBGROUP']


# Crear los transformadores para las columnas numéricas y categóricas
num_transformer = StandardScaler()
cat_transformer = OneHotEncoder(drop='first', handle_unknown='ignore')

# Crear un preprocesador que aplique las transformaciones a las columnas correspondientes
preprocessor_socio_demographic = ColumnTransformer(
    transformers=[
        ('cat', cat_transformer, socio_demographic_cols)])

preprocessor_academic = ColumnTransformer(
    transformers=[
        ('num', num_transformer, ['PREVAILING_WAGE_SUBMITTED']),
        ('cat', cat_transformer, academic_cols)])

# Crear un pipeline que aplique el preprocesador, el transformador polinomial y luego ajuste el modelo
pipeline_socio_demographic = Pipeline(steps=[('preprocessor', preprocessor_socio_demographic),
                                              ('regressor', RandomForestRegressor())])

pipeline_academic = Pipeline(steps=[('preprocessor', preprocessor_academic),
                                    ('regressor', RandomForestRegressor())])


# La variable objetivo es el salario pagado
salary = data['PAID_WAGE_PER_YEAR']

# Dividir los datos en conjuntos de entrenamiento y prueba
X_train_socio_demographic, X_test_socio_demographic, y_train_socio_demographic, y_test_socio_demographic = train_test_split(data[socio_demographic_cols], salary, test_size=0.2, random_state=42)
X_train_academic, X_test_academic, y_train_academic, y_test_academic = train_test_split(data[academic_cols], salary, test_size=0.2, random_state=42)

print('Datos preprocesados y divididos en conjuntos de entrenamiento y prueba.')


Datos preprocesados y divididos en conjuntos de entrenamiento y prueba.


In [4]:
# Entrenar los modelos
pipeline_socio_demographic.fit(X_train_socio_demographic, y_train_socio_demographic)
y_pred_socio_demographic = pipeline_socio_demographic.predict(X_test_socio_demographic)

pipeline_academic.fit(X_train_academic, y_train_academic)
y_pred_academic = pipeline_academic.predict(X_test_academic)

# Calcular el error cuadrático medio
mse_socio_demographic = mean_squared_error(y_test_socio_demographic, y_pred_socio_demographic)
mse_academic = mean_squared_error(y_test_academic, y_pred_academic)

print('Error cuadrático medio para el modelo socio-demográfico:', mse_socio_demographic)
print('Error cuadrático medio para el modelo académico:', mse_academic)

# Calcular RMSE
rmse_socio_demographic = np.sqrt(mse_socio_demographic)
rmse_academic = np.sqrt(mse_academic)

# Calcular MAE
mae_socio_demographic = mean_absolute_error(y_test_socio_demographic, y_pred_socio_demographic)
mae_academic = mean_absolute_error(y_test_academic, y_pred_academic)

# Calcular R²
r2_socio_demographic = r2_score(y_test_socio_demographic, y_pred_socio_demographic)
r2_academic = r2_score(y_test_academic, y_pred_academic)

print('RMSE para el modelo socio-demográfico:', rmse_socio_demographic)
print('RMSE para el modelo académico:', rmse_academic)

print('MAE para el modelo socio-demográfico:', mae_socio_demographic)
print('MAE para el modelo académico:', mae_academic)

print('R² para el modelo socio-demográfico:', r2_socio_demographic)
print('R² para el modelo académico:', r2_academic)



Error cuadrático medio para el modelo socio-demográfico: 779649851.7969688
Error cuadrático medio para el modelo académico: 313732116.0899513
RMSE para el modelo socio-demográfico: 27922.210725459558
RMSE para el modelo académico: 17712.484752003355
MAE para el modelo socio-demográfico: 13434.276995282678
MAE para el modelo académico: 3386.6796473099002
R² para el modelo socio-demográfico: 0.4031924689332529
R² para el modelo académico: 0.7598438719786365


In [5]:
# %%
# Start counters
count_academic = 0
count_socio_demographic = 0
count_same = 0
totales = 0

# Iterate over each row of the test data
for i in range(len(y_test_academic)):
    # Obtain the predictions for each model
    pred_academic = y_pred_academic[i]
    pred_socio_demographic = y_pred_socio_demographic[i]

    # Obtain the real salary
    real_salary = y_test_academic.iloc[i]
    print(real_salary)
    print(pred_academic)
    print(pred_socio_demographic)

    # Compare the predictions and the real salary to see which model is better
    if pred_academic > pred_socio_demographic and pred_academic > real_salary:
        count_academic += 1
    elif pred_socio_demographic > pred_academic and pred_socio_demographic > real_salary:
        count_socio_demographic += 1
    else:
        count_same += 1
totales = count_academic + count_socio_demographic + count_same

# Show the results (Quantity + Percentage)
print('Veces que eligió el sueldo académico:', count_academic, '-->', count_academic / totales * 100, '%')
print('Veces que eligió el sueldo socio-demográfico:', count_socio_demographic, '-->',
      count_socio_demographic / totales * 100, '%')
print('Veces que se quedó con su mismo sueldo:', count_same, '-->', count_same / totales * 100, '%')

#%%

115000.0
115000.0
107902.48884903846
62000.0
61987.36
68328.0
95000.0
95000.0
91665.56482642502
98927.0
98927.0
107547.54649010063
100008.0
91610.11
86813.15479871856
92851.0
92851.0
85510.02392499888
180000.0
146367.58
135706.51234981418
120000.0
117383.79269999999
140005.60528562267
50000.0
50000.0
60000.0
74380.0
76541.12
66020.35367521367
80000.0
84278.89
111380.49839948177
48150.0
48150.0
48845.55107142859
90300.0
90300.0
81873.31056888685
140000.0
140000.0
101091.09126984127
70533.0
70624.48
71412.83691098768
333900.0
278100.64
145538.1960559269
29521.0
48629.47
50030.55311904761
109762.0
109762.0
109766.14789109513
88000.0
88000.0
101725.08777777778
90000.0
90000.0
70366.40561904763
174720.0
144487.408
100825.37076923077
95160.0
96234.5386
65704.5672489661
67454.0
64677.27
66852.98968772954
65000.0
65673.6
64842.18063126193
67101.0
64149.36
60798.202000000005
60000.0
60000.0
51720.22222222222
85000.0
85000.0
64092.83333333334
68328.0
68574.5
87304.82365079364
120000.0
120000.0
9

In [8]:
import numpy as np

# Discretize salaries into 100 bins
bins = np.linspace(data['PAID_WAGE_PER_YEAR'].min(), data['PAID_WAGE_PER_YEAR'].max(), 100)
y_test_academic_binned = np.digitize(y_test_academic, bins)
y_pred_academic_binned = np.digitize(y_pred_academic, bins)
y_pred_socio_demographic_binned = np.digitize(y_pred_socio_demographic, bins)

# Initialize Q-table
salaries = list(range(100))  # now we have 100 possible salaries
states = [(s1, s2, s3) for s1 in salaries for s2 in salaries for s3 in salaries]
actions = [0, 1, 2]  # 0: choose academic, 1: choose socio-demographic, 2: stay the same
Q = np.zeros((len(states), len(actions)))

# Parameters
alpha = 0.5  # learning rate
gamma = 0.9  # discount factor
epsilon = 0.1  # exploration rate
n_episodes = 100  # number of episodes

# Q-learning
for _ in range(n_episodes):
    for i in range(len(y_test_academic_binned)):
        # Current state
        state = (y_test_academic_binned[i], y_pred_academic_binned[i], y_pred_socio_demographic_binned[i])
        state_index = states.index(state)

        # Choose action
        if np.random.rand() < epsilon:  # exploration
            action = np.random.choice(actions)
        else:  # exploitation
            action = np.argmax(Q[state_index])

        # Get reward
        reward = state[action] - state[0]

        # Update Q-table
        Q[state_index, action] = (1 - alpha) * Q[state_index, action] + \
                                  alpha * (reward + gamma * np.max(Q[state_index]))

# Print Q-table
print(Q)


[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 ...
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]


In [10]:
# Save Q-table
np.save('q_table.npy', Q)


In [9]:
# Initialize counters
count_academic = 0
count_socio_demographic = 0
count_same = 0
totales = 0

# Iterate over each row of the test data
for i in range(len(y_test_academic_binned)):
    # Current state
    state = (y_test_academic_binned[i], y_pred_academic_binned[i], y_pred_socio_demographic_binned[i])
    state_index = states.index(state)

    # Choose action
    action = np.argmax(Q[state_index])

    # Update counters
    if action == 0:
        count_academic += 1
    elif action == 1:
        count_socio_demographic += 1
    else:
        count_same += 1

totales = count_academic + count_socio_demographic + count_same

# Show the results (Quantity + Percentage)
print('Veces que eligió el sueldo académico:', count_academic, '-->', count_academic / totales * 100, '%')
print('Veces que eligió el sueldo socio-demográfico:', count_socio_demographic, '-->', count_socio_demographic / totales * 100, '%')
print('Veces que se quedó con su mismo sueldo:', count_same, '-->', count_same / totales * 100, '%')


Veces que eligió el sueldo académico: 25804 --> 77.12816834050693 %
Veces que eligió el sueldo socio-demográfico: 683 --> 2.0414873266379723 %
Veces que se quedó con su mismo sueldo: 6969 --> 20.830344332855095 %


In [114]:
import gym
from gym import spaces
import numpy as np

# Create the environment
class JobChangeEnv(gym.Env):
    def __init__(self, data, pipeline_socio_demographic, pipeline_academic):
        super(JobChangeEnv, self).__init__()

        # Define action and observation space
        self.action_space = spaces.Discrete(3)
        self.observation_space = spaces.Box(low=0, high=100, shape=(2,))

        # Store the data and models
        self.data = data
        self.pipeline_socio_demographic = pipeline_socio_demographic
        self.pipeline_academic = pipeline_academic

        # Initialize the current index to the first row of the data
        self.current_index = 0
    
    def step(self, action):
        # Get the current row of data
        current_row = self.data.iloc[[self.current_index]]

        # Get the current salary
        current_salary = current_row['PAID_WAGE_PER_YEAR'].values[0]

        # Get the features for the models
        socio_demographic_features = current_row[socio_demographic_cols]
        academic_features = current_row[academic_cols]

        # Get the new salary offers from the models
        socio_demographic_offer = self.pipeline_socio_demographic.predict(socio_demographic_features)[0]
        academic_offer = self.pipeline_academic.predict(academic_features)[0]


        # Calculate the reward based on the action taken by the agent
        if action == 0:  # Stay with the current job
            reward = current_salary
        elif action == 1:  # Take the socio-demographic job offer
            reward = socio_demographic_offer
        else:  # Take the academic job offer
            reward = academic_offer

        # Update the current index
        self.current_index += 1

        # Check if we have reached the end of the data
        done = self.current_index >= len(self.data)
        
        return np.array([[current_salary, socio_demographic_offer]]), reward, done, {}


    def reset(self):
    # Reset the state of the environment to an initial state
        self.current_index = 0
        return np.array(self.data.iloc[self.current_index]['PAID_WAGE_PER_YEAR'])


    def render(self, mode='human'):
        pass

In [117]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import random
from collections import deque

class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95    # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.model = self._build_model()

    def _build_model(self):
        model = Sequential()
        model.add(Dense(24, input_shape=(1,), activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
        return model


    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)

        act_values = self.model.predict(np.expand_dims(state, axis=0))
        return np.argmax(act_values[0])

    def replay(self, batch_size):
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            state = np.reshape(state, [1, state_size])
            next_state = np.array(next_state).astype('float32')
            target = self.model.predict(np.expand_dims(state, axis=0))
            if done:
                target[0][action] = reward
            else:
                Q_future = max(self.model.predict(np.expand_dims(next_state, axis=0))[0])
                target[0][action] = reward + Q_future * self.gamma
            self.model.train_on_batch(np.expand_dims(state, axis=0), target)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay



In [119]:
actions = list(pipeline_academic.named_steps.keys())
state_size = (1,)
action_size = len(actions)
agent = DQNAgent(state_size, action_size)
done = False
batch_size = 32
max_steps_per_episode = 10

n_episodes = 1000  # Set the desired number of episodes


# Training loop
for e in range(n_episodes):
    state = env.reset()
    state = np.reshape(state, state_size)
    for time in range(max_steps_per_episode):
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        next_state = np.reshape(next_state, state_size)
        agent.remember(state, action, reward, next_state, done)
        state = next_state
        if done:
            print("Episode: {}/{}, score: {}".format(e + 1, n_episodes, time))
            break
    if len(agent.memory) > batch_size:
        agent.replay(batch_size)






ValueError: cannot reshape array of size 3 into shape (1,)