## Loading the Dataset

In [None]:
# Code to read csv file into Colaboratory:
# !pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
import pandas as pd

# Authenticate and create the PyDrive client.
# Step 1: authenticate the current user through google.colab's auth helper.
auth.authenticate_user()
gauth = GoogleAuth()
# Step 2: hand PyDrive the application-default credentials.
gauth.credentials = GoogleCredentials.get_application_default()
# `drive` is the client used by the download cell further down.
drive = GoogleDrive(gauth)

In [None]:
# Shareable Google Drive link to the dataset
link = 'https://drive.google.com/file/d/1xPGOMsOXflvfzd0gcYt6pQ1XHEpCyl8l/view?usp=share_link' # The shareable link

# The Drive file id is the path segment right after "/d/" in the link.
# It is stored as `file_id` so the built-in id() is not shadowed, and it is
# located by the "/d/" marker rather than by position from the end, so a
# link without the trailing "/view..." part still works.
file_id = link.split("/d/")[1].split("/")[0]

# Download the file into the local runtime under a fixed name
downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile('full_dataset_2019.csv')

# Load the downloaded CSV into a DataFrame
data = pd.read_csv('full_dataset_2019.csv')

## Prepping the data

In [None]:
# Cast every column to string first, then turn P6790 back into a float column
datos = data.astype(str).astype({"P6790": float})

In [None]:
# Setting up the y variable: every "2" in P6440 is rewritten to "0"
# (regex=True, so the pattern is matched inside the string values)
datos['P6440'] = datos['P6440'].replace("2", "0", regex=True)

# Recode the string labels of `base` into numeric category codes,
# applied one after another in the order listed
base_codes = (("Area", 1), ("Cabecera", 2), ("Resto", 3))
for base_label, base_code in base_codes:
    datos['base'] = datos['base'].replace(base_label, base_code, regex=True)

In [None]:
# Train/test split of the data
from sklearn.model_selection import train_test_split

# Lines for OneHotEncoder
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV

# Line for Decision Tree 
from sklearn.tree import DecisionTreeClassifier

# Line for F1 Score
from sklearn.metrics import f1_score

# Line for Matthews Correlation Coefficient
from sklearn.metrics import matthews_corrcoef

# Line for cross validation
from sklearn.model_selection import cross_validate

#Line for confusion matrix
from sklearn.metrics import ConfusionMatrixDisplay

# Lines for Standardscaler
from sklearn.preprocessing import StandardScaler

## Dividing our data by Categories

The feature columns are grouped into the following subsets:


*  Unexpected expenses
*  Geographical
*  Working characteristics
*  Preferences and opinions
*  Sociodemographic characteristics
*  Other activities






In [None]:
# Dependent variable: the P6440 column of the prepared frame
y = datos['P6440']

# Accumulators that the per-subset cells below append to
f1_scores, combinations, mcc = [], [], []

# One-hot encoder that turns the categorical variables into binary columns;
# handle_unknown='ignore' keeps transform from raising on unseen categories
ohe = OneHotEncoder(handle_unknown='ignore')

# Subset names (labels only; they should not affect the modelling)
names_subsets = [
    'unex_exp',
    'geog',
    'work_chars',
    'prefs_ops',
    'sociodem_chara',
    'other',
]

# Categorical columns, listed subset by subset.
# The numerical column P6790 is intentionally not used.
categorical = [
    'P6915', 'P7240', 'P6125',
    'P388', 'DPTO', 'base', 'CLASE',
    'P6870', 'P6880', 'P1881', 'P6240',
    'P7090', 'P7130', 'P7170S1', 'P7170S6', 'P514', 'P515',
    'P6020', 'P6050', 'P6070', 'P6160', 'P6170', 'P6210',
    'P7480S1', 'P7480S3', 'P7480S4', 'P7480S5', 'P7480S6',
    'P7480S7', 'P7480S8', 'P7480S9', 'P7480S10', 'P7480S11',
]


### **Unexpected expenses**

In [None]:
# Unexpected expenses
unex_exp = datos[['P6915', # In case of illness, how would you cover medical costs and medication?
            'P7240', # If out of work, where would you mainly get the resources for your and/or your household's expenses?
            'P6125' # In the last 12 months, did you skip the doctor or a hospital stay because you could not pay for it?
            ]]

# Pipeline: one-hot encode the answers, then fit a decision tree.
# random_state pins the tree's internal randomness so reruns give the same scores.
pipe_tree = Pipeline([
    ('preproc', ohe),
    ('model', DecisionTreeClassifier(random_state=573))
    ])

# Hold out 30% of the rows for evaluation
X_train, X_test, y_train, y_test = train_test_split(unex_exp,y,test_size = 0.30, random_state=573)
# Fit once on the training split and predict the held-out rows
y_pred_tree = pipe_tree.fit(X_train, y_train).predict(X_test)
# Macro F1 score (scikit-learn metrics take y_true first, then y_pred)
f1_scores.append(f1_score(y_test, y_pred_tree, average="macro"))
# Matthews correlation coefficient
mcc.append(matthews_corrcoef(y_test, y_pred_tree))

In [None]:
# Display the scores collected so far as a (F1 list, MCC list) pair
f1_scores, mcc

([0.7654705785956], [0.5469401237934076])

### **Geographical**

In [None]:
# Geographical
geog = datos[['P388', # Department where this job is mainly carried out
        'DPTO', # Department
        'base', # Area / Cabecera / Resto
        'CLASE' # Urban - rural
        ]]

# Pipeline: one-hot encode the answers, then fit a decision tree.
# random_state pins the tree's internal randomness so reruns give the same scores.
pipe_tree = Pipeline([
    ('preproc', ohe),
    ('model', DecisionTreeClassifier(random_state=573))
    ])

# Hold out 30% of the rows for evaluation
X_train, X_test, y_train, y_test = train_test_split(geog,y,test_size = 0.30, random_state=573)
# Fit once on the training split and predict the held-out rows
y_pred_tree = pipe_tree.fit(X_train, y_train).predict(X_test)
# Macro F1 score (scikit-learn metrics take y_true first, then y_pred)
f1_scores.append(f1_score(y_test, y_pred_tree, average="macro"))
# Matthews correlation coefficient
mcc.append(matthews_corrcoef(y_test, y_pred_tree))

# Let's see the outcomes of the models
f1_scores, mcc

([0.7654705785956, 0.5644500002003721],
 [0.5469401237934076, 0.1809084831438166])

### **Working characteristics**

In [None]:
# Working characteristics
work_chars = datos[['P6870', # How many people in total work at the company, business, industry, office, firm, farm or site where ... works?
              'P6880', # Where do you mainly carry out your work?
              'P1881', # What means of transport do you mainly use to get to your workplace?
              'P6240' # Which activity took up most of ...'s time last week?
              ]]

# Pipeline: one-hot encode the answers, then fit a decision tree.
# random_state pins the tree's internal randomness so reruns give the same scores.
pipe_tree = Pipeline([
    ('preproc', ohe),
    ('model', DecisionTreeClassifier(random_state=573))
    ])

# Hold out 30% of the rows for evaluation
X_train, X_test, y_train, y_test = train_test_split(work_chars,y,test_size = 0.30, random_state=573)
# Fit once on the training split and predict the held-out rows
y_pred_tree = pipe_tree.fit(X_train, y_train).predict(X_test)
# Macro F1 score (scikit-learn metrics take y_true first, then y_pred)
f1_scores.append(f1_score(y_test, y_pred_tree, average="macro"))
# Matthews correlation coefficient
mcc.append(matthews_corrcoef(y_test, y_pred_tree))

# Let's see the outcomes of the models
f1_scores, mcc

([0.7772182738232414], [0.5550886096283816])

### **Preferences and opinions**

In [None]:
# Preferences and opinions
prefs_ops = datos[['P7090', # Besides the hours currently worked, does ... want to work more hours?
             'P7130', # Do you want to change your current job?
             'P7170S1', # Are you satisfied with your current job?
             'P7170S6', # Is ... satisfied with the current working hours?
             'P514', # Do you consider your current job to be stable?
             'P515' # Are your work schedule and your family responsibilities compatible?
             ]]

# Pipeline: one-hot encode the answers, then fit a decision tree.
# random_state pins the tree's internal randomness so reruns give the same scores.
pipe_tree = Pipeline([
    ('preproc', ohe),
    ('model', DecisionTreeClassifier(random_state=573))
    ])

# Hold out 30% of the rows for evaluation
X_train, X_test, y_train, y_test = train_test_split(prefs_ops,y,test_size = 0.30, random_state=573)
# Fit once on the training split and predict the held-out rows
y_pred_tree = pipe_tree.fit(X_train, y_train).predict(X_test)
# Macro F1 score (scikit-learn metrics take y_true first, then y_pred)
f1_scores.append(f1_score(y_test, y_pred_tree, average="macro"))
# Matthews correlation coefficient
mcc.append(matthews_corrcoef(y_test, y_pred_tree))

# Let's see the outcomes of the models
f1_scores, mcc

([0.7654705785956, 0.5644500002003721, 0.8923697117414314, 0.5587601617821668],
 [0.5469401237934076,
  0.1809084831438166,
  0.785184414269175,
  0.16782743231208658])

### **Sociodemographic characteristics**

In [None]:
# Sociodemographic characteristics
sociodem_chara = datos[['P6020', # Sex
                  'P6050', # What is ...'s relationship to the head of the household?
                  'P6070', # Currently married / not married and living with a partner for less than two years / ...
                  'P6160', # Can you read and write?
                  'P6170', # Does ... currently attend school, college or university?
                  'P6210', # Highest educational level attained by ... and the last year or grade passed at that level
                  ]]

# Pipeline: one-hot encode the answers, then fit a decision tree.
# random_state pins the tree's internal randomness so reruns give the same scores.
pipe_tree = Pipeline([
    ('preproc', ohe),
    ('model', DecisionTreeClassifier(random_state=573))
    ])

# Hold out 30% of the rows for evaluation
X_train, X_test, y_train, y_test = train_test_split(sociodem_chara,y,test_size = 0.30, random_state=573)
# Fit once on the training split and predict the held-out rows
y_pred_tree = pipe_tree.fit(X_train, y_train).predict(X_test)
# Macro F1 score (scikit-learn metrics take y_true first, then y_pred)
f1_scores.append(f1_score(y_test, y_pred_tree, average="macro"))
# Matthews correlation coefficient
mcc.append(matthews_corrcoef(y_test, y_pred_tree))

# Let's see the outcomes of the models
f1_scores, mcc

([0.7654705785956,
  0.5644500002003721,
  0.8923697117414314,
  0.5587601617821668,
  0.6380802450124744],
 [0.5469401237934076,
  0.1809084831438166,
  0.785184414269175,
  0.16782743231208658,
  0.28778419908307157])

### **Other activities**

In [None]:
# Other activities
other = datos[['P7480S1', # Helping to raise animals
               'P7480S3', # Doing chores in one's own household
               'P7480S4', # Doing household chores in other households or institutions
               'P7480S5', # Caring for or looking after children
               'P7480S6', # Caring for elderly and/or disabled people
               'P7480S7', # Making clothing or textiles for household members
               'P7480S8', # Attending courses or training events
               'P7480S9', # Working on self-construction of housing
               'P7480S10', # Doing community or volunteer work on community or public buildings or works
               'P7480S11' # Taking part in other community activities and/or volunteer work
               ]]

# Pipeline: one-hot encode the answers, then fit a decision tree.
# random_state pins the tree's internal randomness so reruns give the same scores.
pipe_tree = Pipeline([
    ('preproc', ohe),
    ('model', DecisionTreeClassifier(random_state=573))
    ])

# Hold out 30% of the rows for evaluation
X_train, X_test, y_train, y_test = train_test_split(other,y,test_size = 0.30, random_state=573)
# Fit once on the training split and predict the held-out rows
y_pred_tree = pipe_tree.fit(X_train, y_train).predict(X_test)
# Macro F1 score (scikit-learn metrics take y_true first, then y_pred)
f1_scores.append(f1_score(y_test, y_pred_tree, average="macro"))
# Matthews correlation coefficient
mcc.append(matthews_corrcoef(y_test, y_pred_tree))

# Let's see the outcomes of the models
f1_scores, mcc


([0.7654705785956,
  0.5644500002003721,
  0.8923697117414314,
  0.5587601617821668,
  0.6380802450124744,
  0.42504791054299557],
 [0.5469401237934076,
  0.1809084831438166,
  0.785184414269175,
  0.16782743231208658,
  0.28778419908307157,
  0.09821352071871092])

### **Summarizing our Results**

In [None]:
# Summary table: one row per subset with its F1 score and MCC
pd.DataFrame(dict(subsets=names_subsets, f1_scores=f1_scores, mcc=mcc))

Unnamed: 0,subsets,f1_scores,mcc
0,unex_exp,0.765471,0.54694
1,geog,0.56445,0.180908
2,work_chars,0.89237,0.785184
3,prefs_ops,0.55876,0.167827
4,sociodem_chara,0.63808,0.287784
5,other,0.425048,0.098214


The six groups of variables, which describe working people and their working environments, are evaluated separately with the best model, which is the decision tree. According to the F1 evaluation metric, the variables that act as the best input for the chosen model are those associated with working characteristics (0.89) and unexpected expenses (0.77). The MCC gives the same ranking: the model that outperforms all others is the one that takes working characteristics as input (0.79).