# Salary Prediction in Mexico

## Importing

In [45]:
import pandas as pd
import numpy as np
import os
from sklearn.linear_model import Lasso, Ridge, LinearRegression
import matplotlib as plt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV

## Dataframes

### Dataframe from kaggle with target variable <=50k, >50k

In [2]:
dataset_path = ".\\Documents\\MachineLearningProject\\Dataset\\salary.csv"
df_50k = pd.read_csv(dataset_path)
df_50k.shape
df_50k.sample()


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
23128,64,Local-gov,198728,Some-college,10,Never-married,Transport-moving,Unmarried,White,Male,0,0,40,United-States,<=50K


Filter the dataset to only have mexican data

In [3]:
df_50k = df_50k[df_50k["native-country"].str.contains("Mexico", case=False, na=False)]
df_50k.shape

(643, 15)

### Dataframes from the Mexican government

In [4]:
path = '.\\Documents\\MachineLearningProject\\Profesionistas y Tecnicos\\Evolucion-poblacion-ocupada-trabajadores-totales\\Evolucion-poblacion-ocupada-trabajadores-totales.csv'
professionals_df = pd.read_csv(path)
professionals_df.sample()
professionals_df.columns

Index(['Category ID', 'Category', 'Quarter ID', 'Quarter', 'Workforce',
       'Monthly Wage', 'Time', 'type', 'type ID'],
      dtype='object')

Filter unnecesary or redundance data.

In [5]:
columns_to_pop = ['type', 'type ID', 'Workforce', 'Time', 'Quarter']
for column in columns_to_pop:
    professionals_df.pop(column)
professionals_df

Unnamed: 0,Category ID,Category,Quarter ID,Monthly Wage
0,1,"Funcionarios, Directores y Jefes",20101,5652.376449
1,1,"Funcionarios, Directores y Jefes",20102,5176.334410
2,1,"Funcionarios, Directores y Jefes",20103,5528.734344
3,1,"Funcionarios, Directores y Jefes",20104,5201.022173
4,1,"Funcionarios, Directores y Jefes",20111,4678.875804
...,...,...,...,...
213,4,"Comerciantes, Empleados en Ventas y Agentes de...",20232,4363.811313
214,4,"Comerciantes, Empleados en Ventas y Agentes de...",20233,4502.061632
215,4,"Comerciantes, Empleados en Ventas y Agentes de...",20234,4617.991412
216,4,"Comerciantes, Empleados en Ventas y Agentes de...",20241,4925.742507


In [6]:
category_values = professionals_df['Category'].unique()
category_keys = professionals_df['Category ID'].unique()

category_dict = {}
for i in range(len(category_values)):
    category_dict[category_keys[i]] = category_values[i]

In [7]:
print(category_dict)

{1: 'Funcionarios, Directores y Jefes', 2: 'Profesionistas y Técnicos', 3: 'Trabajadores Auxiliares en Actividades Administrativas', 4: 'Comerciantes, Empleados en Ventas y Agentes de Ventas'}


So now we can pop the Category column.

In [8]:
professionals_df.pop('Category')
professionals_df.sample()

Unnamed: 0,Category ID,Quarter ID,Monthly Wage
210,4,20223,4099.784322


Dataframe salary per sex and scolar years

In [9]:
sex_and_education_paths = ['.\\Documents\\MachineLearningProject\\actividades administrativas\\Salario-segun-sexo-y-escolaridad-en-segundo-trimestre-de-2024-trabajadores-totales (2)\\Salario-segun-sexo-y-escolaridad-en-segundo-trimestre-de-2024-trabajadores-totales.csv', 
                           '.\\Documents\\MachineLearningProject\\Comerciantes y Venta\\Salario-segun-sexo-y-escolaridad-en-segundo-trimestre-de-2024-trabajadores-totales (1)\\Salario-segun-sexo-y-escolaridad-en-segundo-trimestre-de-2024-trabajadores-totales.csv',
                           '.\\Documents\\MachineLearningProject\\Funcionarios directores y jefes\\Salario-segun-sexo-y-escolaridad-en-segundo-trimestre-de-2024-trabajadores-totales (3)\\Salario-segun-sexo-y-escolaridad-en-segundo-trimestre-de-2024-trabajadores-totales.csv',
                           '.\\Documents\\MachineLearningProject\\Profesionistas y Tecnicos\\Salario-segun-sexo-y-escolaridad-en-segundo-trimestre-de-2024-trabajadores-totales\\Salario-segun-sexo-y-escolaridad-en-segundo-trimestre-de-2024-trabajadores-totales.csv'
                        ]

sex_and_education_dfs = []
for path in sex_and_education_paths:
    sex_and_education_dfs.append(pd.read_csv(path))

sex_and_education_df = pd.concat(sex_and_education_dfs, axis = 0)
sex_and_education_df.sample()

Unnamed: 0,Category ID,Category,Schooling Years Range ID,Schooling Years Range,Sex ID,Sex,Monthly Wage,Workforce,Number of Records,Workforce Total,percentage
6,2,Profesionistas y Técnicos,4,10 a 12 Años de Escolaridad,2,Mujer,6041.53889,573505,1995,1836071,31.235448


We notice in Sex ID 1 is for male and 2 for female.

Here we pop the unnecesary columns.

In [10]:
columns_to_pop = ['Category', 'Workforce', 'Number of Records', 'percentage', 'Sex', 'Workforce Total']
for column in columns_to_pop:
    sex_and_education_df.pop(column)

In [11]:
sex_and_education_df.sample()

Unnamed: 0,Category ID,Schooling Years Range ID,Schooling Years Range,Sex ID,Monthly Wage
9,1,6,16 a 18 Años de Escolaridad,2,11237.044526


I am going to determine the schooling years range keys.

In [12]:
schooling_years_range_values = sex_and_education_df['Schooling Years Range'].unique()

In [13]:
schooling_years_range_keys = sex_and_education_df['Schooling Years Range ID'].unique()

In [14]:
schooling_years_range = {}
for i in range(len(schooling_years_range_keys)):
    schooling_years_range[schooling_years_range_keys[i]] = schooling_years_range_values[i]

In [15]:
print(schooling_years_range)

{2: '4 a 6 Años de Escolaridad', 3: '7 a 9 Años de Escolaridad', 4: '10 a 12 Años de Escolaridad', 5: '13 a 15 Años de Escolaridad', 6: '16 a 18 Años de Escolaridad', 7: 'Más de 18 Años de Escolaridad', 1: '0 a 3 Años de Escolaridad'}


Now I can pop that column.

In [16]:
sex_and_education_df.pop('Schooling Years Range')
sex_and_education_df.sample()

Unnamed: 0,Category ID,Schooling Years Range ID,Sex ID,Monthly Wage
0,4,1,1,3809.480121


Salary per state dataframe.

In [17]:
salary_per_state_paths = ['.\\Documents\\MachineLearningProject\\actividades administrativas\\Salarios-en-2024-T2 (2)\\Salarios-en-2024-T2.csv',
                          '.\\Documents\\MachineLearningProject\\Comerciantes y Venta\\Salarios-en-2024-T2 (1)\\Salarios-en-2024-T2.csv',
                          '.\\Documents\\MachineLearningProject\\Funcionarios directores y jefes\\Salarios-en-2024-T2 (3)\\Salarios-en-2024-T2.csv',
                          '.\\Documents\\MachineLearningProject\\Profesionistas y Tecnicos\\Salarios-en-2024-T2\\Salarios-en-2024-T2.csv']

salary_per_state_dfs = []
for path in salary_per_state_paths:
    salary_per_state_dfs.append(pd.read_csv(path))

salary_per_state_df = pd.concat(salary_per_state_dfs, axis = 0)

In [18]:
salary_per_state_df.sample()

Unnamed: 0,Category ID,Category,State ID,State,Quarter ID,Quarter,Monthly Wage,Monthly Wage Growth,Monthly Wage Growth Value,lastSalary
14,1,"Officers, Directors and Heads",15,Estado de México,20242,2024-Q2,6841.159987,-0.224136,-1976.308916,8817.468904


Pop unnecesary columns.

In [19]:
columns_to_pop = ['Category', 'Monthly Wage Growth', 'lastSalary', 'Monthly Wage Growth Value', 'Quarter']
for column in columns_to_pop:
    salary_per_state_df.pop(column)

In [20]:
salary_per_state_df.sample()

Unnamed: 0,Category ID,State ID,State,Quarter ID,Monthly Wage
31,3,32,Zacatecas,20242,7072.647041


We are doing another dictionary for state and state id.

In [21]:
state_values = salary_per_state_df['State'].unique()

In [22]:
satate_keys = salary_per_state_df['State ID'].unique()

In [23]:
state_dict = {}
for i in range(len(satate_keys)):
    state_dict[satate_keys[i]] = state_values[i]

In [24]:
print(state_dict)

{1: 'Aguascalientes', 2: 'Baja California', 3: 'Baja California Sur', 4: 'Campeche', 5: 'Coahuila de Zaragoza', 6: 'Colima', 7: 'Chiapas', 8: 'Chihuahua', 9: 'Ciudad de México', 10: 'Durango', 11: 'Guanajuato', 12: 'Guerrero', 13: 'Hidalgo', 14: 'Jalisco', 15: 'Estado de México', 16: 'Michoacán de Ocampo', 17: 'Morelos', 18: 'Nayarit', 19: 'Nuevo León', 20: 'Oaxaca', 21: 'Puebla', 22: 'Querétaro', 23: 'Quintana Roo', 24: 'San Luis Potosí', 25: 'Sinaloa', 26: 'Sonora', 27: 'Tabasco', 28: 'Tamaulipas', 29: 'Tlaxcala', 30: 'Veracruz de Ignacio de la Llave', 31: 'Yucatán', 32: 'Zacatecas'}


Now we're popping the state column.

In [25]:
salary_per_state_df.pop('State')

0                      Aguascalientes
1                     Baja California
2                 Baja California Sur
3                            Campeche
4                Coahuila de Zaragoza
                   ...               
27                         Tamaulipas
28                           Tlaxcala
29    Veracruz de Ignacio de la Llave
30                            Yucatán
31                          Zacatecas
Name: State, Length: 128, dtype: object

Now we have this dataframes from the mexican government's page:

In [26]:
salary_per_state_df.sample(5)

Unnamed: 0,Category ID,State ID,Quarter ID,Monthly Wage
1,2,2,20242,8085.182218
28,4,29,20242,3923.989935
24,2,25,20242,11360.12222
19,3,20,20242,6450.577241
15,3,16,20242,7551.039399


In [27]:
professionals_df.sample(5)

Unnamed: 0,Category ID,Quarter ID,Monthly Wage
169,4,20121,2557.19214
37,1,20192,8098.726575
32,1,20181,7687.20622
7,1,20114,4880.940481
59,2,20103,9152.277655


In [28]:
sex_and_education_df.sample(5)

Unnamed: 0,Category ID,Schooling Years Range ID,Sex ID,Monthly Wage
0,1,2,1,8256.462548
10,4,6,1,7378.434186
1,2,2,1,7026.412147
9,4,5,2,4630.523775
8,3,6,1,6640.638373


With this ID dictionaries:

In [47]:
print(schooling_years_range)
print(state_dict)
print(category_dict)

{2: '4 a 6 Años de Escolaridad', 3: '7 a 9 Años de Escolaridad', 4: '10 a 12 Años de Escolaridad', 5: '13 a 15 Años de Escolaridad', 6: '16 a 18 Años de Escolaridad', 7: 'Más de 18 Años de Escolaridad', 1: '0 a 3 Años de Escolaridad'}
{1: 'Aguascalientes', 2: 'Baja California', 3: 'Baja California Sur', 4: 'Campeche', 5: 'Coahuila de Zaragoza', 6: 'Colima', 7: 'Chiapas', 8: 'Chihuahua', 9: 'Ciudad de México', 10: 'Durango', 11: 'Guanajuato', 12: 'Guerrero', 13: 'Hidalgo', 14: 'Jalisco', 15: 'Estado de México', 16: 'Michoacán de Ocampo', 17: 'Morelos', 18: 'Nayarit', 19: 'Nuevo León', 20: 'Oaxaca', 21: 'Puebla', 22: 'Querétaro', 23: 'Quintana Roo', 24: 'San Luis Potosí', 25: 'Sinaloa', 26: 'Sonora', 27: 'Tabasco', 28: 'Tamaulipas', 29: 'Tlaxcala', 30: 'Veracruz de Ignacio de la Llave', 31: 'Yucatán', 32: 'Zacatecas'}
{1: 'Funcionarios, Directores y Jefes', 2: 'Profesionistas y Técnicos', 3: 'Trabajadores Auxiliares en Actividades Administrativas', 4: 'Comerciantes, Empleados en Ventas y

## Machine learning and parameter tunning

Here I split the data in X and y

In [57]:
dfs = [salary_per_state_df, professionals_df, sex_and_education_df]
X = []
y = []
for df in dfs:
    X.append(df.drop('Monthly Wage', axis = 1))
    y.append(df['Monthly Wage'])

Here I train-test splitted the data.

In [67]:
X_train = []
X_test = []
y_train = []
y_test = []
for i in X:
    X_tr, X_te, y_tr, y_te = train_test_split(X_state, y_state, test_size = 0.2, random_state = 45)
    X_train.append(X_tr)
    X_test.append(X_te)
    y_train.append(y_tr)
    y_test.append(y_te)

Here I do all the pipelines of scaling, training the data and finally make a grid search for the best params.

In [71]:


search_space = [
  {'regressor': [LinearRegression()]},
  {'regressor': [Lasso()], 'regressor__alpha': [0.1, 1, 10, 20, 30]},
  {'regressor': [Ridge()], 'regressor__alpha': [0.1, 1, 10, 20, 30]}
]
pipeline = Pipeline([('scale', StandardScaler()), ('regressor', LinearRegression())])

gs = GridSearchCV(estimator= pipeline, param_grid=search_space)

for i in range(len(X_train)):
  gs.fit(X_train[i], y_train[i])
  best_model = gs.best_estimator_
  print("{}Best model:".format(i), best_model.named_steps['regressor'])
  print("{}Best score:".format(i), gs.best_score_)


ls = Lasso(alpha=40)

for i in range(len(X_train)):
  ls.fit(X_train[i], y_train[i])
  print(i, ls.score(X_test[i], y_test[i]))



0Best model: Ridge(alpha=1)
0Best score: 0.47549893309589597
1Best model: Ridge(alpha=1)
1Best score: 0.47549893309589597
2Best model: Ridge(alpha=1)
2Best score: 0.47549893309589597
0 0.278676994191268
1 0.278676994191268
2 0.278676994191268


## Conclusions

I was thinking on combining the predictions each dataframe can make by doing, but by looking at the performance of each models I came to the conclussion that this data cannot be used for the purposes of predicting a mexican salary.