In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import r2_score
from sklearn.preprocessing import MinMaxScaler

dummy_employed = pd.read_csv('dummy_employed.csv')
dummy_wk_incomes = pd.read_csv('dummy_wk_incomes.csv')


In [2]:
y = dummy_employed['Monthly Income']
X = dummy_employed.drop(columns=['Monthly Income'], axis=1)

scaler = MinMaxScaler()
X=pd.DataFrame(scaler.fit_transform(X),
            columns=X.columns, index=X.index) 

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

rt = DecisionTreeRegressor(criterion = 'mse', max_depth=100, min_samples_split=8, splitter='best', min_samples_leaf= 100)

model_r = rt.fit(X_train, y_train)

y_pred = model_r.predict(X_test)

In [4]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print('R Squared Score is:', r2_score(y_test, y_pred))

Mean Absolute Error: 2927.019725719349
Mean Squared Error: 30537755.371197402
Root Mean Squared Error: 5526.097662111791
R Squared Score is: 0.18134664395291678


In [5]:
y = dummy_wk_incomes['Monthly Income']
X = dummy_wk_incomes.drop(columns=['Monthly Income'], axis=1)

scaler = MinMaxScaler()
X=pd.DataFrame(scaler.fit_transform(X),
            columns=X.columns, index=X.index) 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

rt = DecisionTreeRegressor(min_samples_split=8, max_depth=100, splitter='best', min_samples_leaf= 100)

model_r = rt.fit(X_train, y_train)

y_pred = model_r.predict(X_test)

In [6]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print('R Squared Score is:', r2_score(y_test, y_pred))

Mean Absolute Error: 3369.507344612625
Mean Squared Error: 39766029.669677876
Root Mean Squared Error: 6306.0312138204545
R Squared Score is: 0.2685455500527367


##  Linear Multiple Regression 

In [7]:
import statsmodels.api as sm

model = sm.OLS(y, sm.add_constant(X))
results = model.fit()

print(results.summary())

                            OLS Regression Results                            
Dep. Variable:         Monthly Income   R-squared:                       0.273
Model:                            OLS   Adj. R-squared:                  0.272
Method:                 Least Squares   F-statistic:                     215.9
Date:                Wed, 29 Mar 2023   Prob (F-statistic):               0.00
Time:                        18:02:53   Log-Likelihood:            -1.2815e+06
No. Observations:              126055   AIC:                         2.563e+06
Df Residuals:                  125835   BIC:                         2.566e+06
Df Model:                         219                                         
Covariance Type:            nonrobust                                         
                                                                                                                                           coef    std err          t      P>|t|      [0.025      0.975]
---------

## Classification Problem

In [35]:
new_employed = pd.read_csv('Data/new_employed.csv')
wk_incomes = pd.read_csv('Data/wk_incomes.csv')

In [36]:
new_employed['Monthly Income'].describe()

count    180370.000000
mean       7754.908993
std        6196.597671
min           0.000000
25%        5590.000000
50%        6880.000000
75%        8600.000000
max      300000.000000
Name: Monthly Income, dtype: float64

In [37]:
new_employed['Income Level']= new_employed['Monthly Income']
new_employed.head()

Unnamed: 0.1,Unnamed: 0,Residence,Gender,Age,Literacy,Scholarship,Career,Marital Status,Urban or Rural,Economic Zone,By Position,By Economic Sector,Monthly Income,Income Level
0,0,Ciudad de México,Hombre,59,Sí,Profesional,Biología,Está soltero(a),Muestra urbana,Zona 2 Resto del paÃ­s,Trabajadores subordinados y remunerados,Servicios,6880.0,6880.0
1,2,Ciudad de México,Hombre,30,Sí,Profesional,Planes multidisciplinarios o generales del cam...,Está soltero(a),Muestra urbana,Zona 2 Resto del paÃ­s,Trabajadores subordinados y remunerados,Servicios,16000.0,16000.0
2,4,Ciudad de México,Hombre,42,Sí,Profesional,Telecomunicaciones,Está separado(a),Muestra urbana,Zona 2 Resto del paÃ­s,Trabajadores subordinados y remunerados,Servicios,6880.0,6880.0
3,5,Ciudad de México,Mujer,36,Sí,Profesional,Gastronomía y servicios de alimentos,Está casado(a),Muestra urbana,Zona 2 Resto del paÃ­s,Trabajadores subordinados y remunerados,Servicios,6880.0,6880.0
4,6,Ciudad de México,Hombre,43,Sí,Profesional,Hospitalidad y turismo,Está casado(a),Muestra urbana,Zona 2 Resto del paÃ­s,Trabajadores subordinados y remunerados,Servicios,6880.0,6880.0


In [38]:
level_1 = new_employed['Income Level'].quantile(.10)
level_2 = new_employed['Income Level'].quantile(.20)
level_3 = new_employed['Income Level'].quantile(.30)
level_4 = new_employed['Income Level'].quantile(.40)
level_5 = new_employed['Income Level'].quantile(.50)
level_6 = new_employed['Income Level'].quantile(.60)
level_7 = new_employed['Income Level'].quantile(.70)
level_8 = new_employed['Income Level'].quantile(.80)
level_9 = new_employed['Income Level'].quantile(.90)


In [40]:
new_employed.loc[(new_employed['Monthly Income'] <= level_1, 'Income Level')] = 'Level 1'
new_employed

Unnamed: 0.1,Unnamed: 0,Residence,Gender,Age,Literacy,Scholarship,Career,Marital Status,Urban or Rural,Economic Zone,By Position,By Economic Sector,Monthly Income,Income Level
0,0,Ciudad de México,Hombre,59,Sí,Profesional,Biología,Está soltero(a),Muestra urbana,Zona 2 Resto del paÃ­s,Trabajadores subordinados y remunerados,Servicios,6880.0,6880
1,2,Ciudad de México,Hombre,30,Sí,Profesional,Planes multidisciplinarios o generales del cam...,Está soltero(a),Muestra urbana,Zona 2 Resto del paÃ­s,Trabajadores subordinados y remunerados,Servicios,16000.0,16000
2,4,Ciudad de México,Hombre,42,Sí,Profesional,Telecomunicaciones,Está separado(a),Muestra urbana,Zona 2 Resto del paÃ­s,Trabajadores subordinados y remunerados,Servicios,6880.0,6880
3,5,Ciudad de México,Mujer,36,Sí,Profesional,Gastronomía y servicios de alimentos,Está casado(a),Muestra urbana,Zona 2 Resto del paÃ­s,Trabajadores subordinados y remunerados,Servicios,6880.0,6880
4,6,Ciudad de México,Hombre,43,Sí,Profesional,Hospitalidad y turismo,Está casado(a),Muestra urbana,Zona 2 Resto del paÃ­s,Trabajadores subordinados y remunerados,Servicios,6880.0,6880
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180365,389425,Zacatecas,Hombre,62,Sí,Primaria,,Está casado(a),Muestra complemento y rural,Zona 2 Resto del paÃ­s,Trabajadores subordinados y remunerados,Construcción,7740.0,7740
180366,389428,Zacatecas,Mujer,31,Sí,Preparatoria o bachillerato,,Está viudo(a),Muestra complemento y rural,Zona 2 Resto del paÃ­s,Trabajadores subordinados y remunerados,Servicios,7740.0,7740
180367,389429,Zacatecas,Mujer,18,Sí,Preparatoria o bachillerato,,Está soltero(a),Muestra complemento y rural,Zona 2 Resto del paÃ­s,Trabajadores subordinados y remunerados,Comercio,7955.0,7955
180368,389430,Zacatecas,Hombre,54,Sí,Primaria,,Está casado(a),Muestra complemento y rural,Zona 2 Resto del paÃ­s,Trabajadores por cuenta propia,Servicios,6880.0,6880


In [44]:
new_employed.loc[(new_employed['Monthly Income'] <= level_2, 'Income Level')] = 'Level 2'
new_employed.loc[(new_employed['Monthly Income'] <= level_3, 'Income Level')] = 'Level 3'
new_employed.loc[(new_employed['Monthly Income'] <= level_4, 'Income Level')] = 'Level 4'
new_employed.loc[(new_employed['Monthly Income'] <= level_5, 'Income Level')] = 'Level 5'
new_employed.loc[(new_employed['Monthly Income'] <= level_6, 'Income Level')] = 'Level 6'
new_employed.loc[(new_employed['Monthly Income'] <= level_7, 'Income Level')] = 'Level 7'
new_employed.loc[(new_employed['Monthly Income'] <= level_8, 'Income Level')] = 'Level 8'
new_employed.loc[(new_employed['Monthly Income'] <= level_9, 'Income Level')] = 'Level 9'
new_employed.loc[(new_employed['Monthly Income'] > level_9, 'Income Level')] = 'Level 10'

In [45]:
new_employed

Unnamed: 0.1,Unnamed: 0,Residence,Gender,Age,Literacy,Scholarship,Career,Marital Status,Urban or Rural,Economic Zone,By Position,By Economic Sector,Monthly Income,Income Level
0,0,Ciudad de México,Hombre,59,Sí,Profesional,Biología,Está soltero(a),Muestra urbana,Zona 2 Resto del paÃ­s,Trabajadores subordinados y remunerados,Servicios,6880.0,Level 9
1,2,Ciudad de México,Hombre,30,Sí,Profesional,Planes multidisciplinarios o generales del cam...,Está soltero(a),Muestra urbana,Zona 2 Resto del paÃ­s,Trabajadores subordinados y remunerados,Servicios,16000.0,Level 10
2,4,Ciudad de México,Hombre,42,Sí,Profesional,Telecomunicaciones,Está separado(a),Muestra urbana,Zona 2 Resto del paÃ­s,Trabajadores subordinados y remunerados,Servicios,6880.0,Level 9
3,5,Ciudad de México,Mujer,36,Sí,Profesional,Gastronomía y servicios de alimentos,Está casado(a),Muestra urbana,Zona 2 Resto del paÃ­s,Trabajadores subordinados y remunerados,Servicios,6880.0,Level 9
4,6,Ciudad de México,Hombre,43,Sí,Profesional,Hospitalidad y turismo,Está casado(a),Muestra urbana,Zona 2 Resto del paÃ­s,Trabajadores subordinados y remunerados,Servicios,6880.0,Level 9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180365,389425,Zacatecas,Hombre,62,Sí,Primaria,,Está casado(a),Muestra complemento y rural,Zona 2 Resto del paÃ­s,Trabajadores subordinados y remunerados,Construcción,7740.0,Level 9
180366,389428,Zacatecas,Mujer,31,Sí,Preparatoria o bachillerato,,Está viudo(a),Muestra complemento y rural,Zona 2 Resto del paÃ­s,Trabajadores subordinados y remunerados,Servicios,7740.0,Level 9
180367,389429,Zacatecas,Mujer,18,Sí,Preparatoria o bachillerato,,Está soltero(a),Muestra complemento y rural,Zona 2 Resto del paÃ­s,Trabajadores subordinados y remunerados,Comercio,7955.0,Level 9
180368,389430,Zacatecas,Hombre,54,Sí,Primaria,,Está casado(a),Muestra complemento y rural,Zona 2 Resto del paÃ­s,Trabajadores por cuenta propia,Servicios,6880.0,Level 9


In [46]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier 
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn import tree

In [51]:
y = new_employed['Income Level']
X = new_employed.drop(columns=['Monthly Income', 'Income Level'], axis=1)

rel_columns = ['Age','Gender', 'Literacy', 'Scholarship','Career', 'Marital Status', 'Urban or Rural', 'Residence','Economic Zone', 'By Position','By Economic Sector']
X = pd.get_dummies(X[rel_columns], drop_first=True, dtype=float)

scaler = MinMaxScaler()
X=pd.DataFrame(scaler.fit_transform(X),
            columns=X.columns, index=X.index) 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state= 42)

In [52]:
clf = DecisionTreeClassifier(criterion='entropy')

clf.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy')

In [54]:
y_preds = clf.predict(X_test)

print('Accuracy: ', accuracy_score(y_test, y_preds))

Accuracy:  0.8546875866274879


## with known incomes

In [59]:
wk_incomes['Income Level']= wk_incomes['Monthly Income']

level_1 = wk_incomes['Income Level'].quantile(.10)
level_2 = wk_incomes['Income Level'].quantile(.20)
level_3 = wk_incomes['Income Level'].quantile(.30)
level_4 = wk_incomes['Income Level'].quantile(.40)
level_5 = wk_incomes['Income Level'].quantile(.50)
level_6 = wk_incomes['Income Level'].quantile(.60)
level_7 = wk_incomes['Income Level'].quantile(.70)
level_8 = wk_incomes['Income Level'].quantile(.80)
level_9 = wk_incomes['Income Level'].quantile(.90)

wk_incomes.loc[(wk_incomes['Monthly Income'] <= level_1, 'Income Level')] = 'Level 1'
wk_incomes.loc[(wk_incomes['Monthly Income'] <= level_2, 'Income Level')] = 'Level 2'
wk_incomes.loc[(wk_incomes['Monthly Income'] <= level_3, 'Income Level')] = 'Level 3'
wk_incomes.loc[(wk_incomes['Monthly Income'] <= level_4, 'Income Level')] = 'Level 4'
wk_incomes.loc[(wk_incomes['Monthly Income'] <= level_5, 'Income Level')] = 'Level 5'
wk_incomes.loc[(wk_incomes['Monthly Income'] <= level_6, 'Income Level')] = 'Level 6'
wk_incomes.loc[(wk_incomes['Monthly Income'] <= level_7, 'Income Level')] = 'Level 7'
wk_incomes.loc[(wk_incomes['Monthly Income'] <= level_8, 'Income Level')] = 'Level 8'
wk_incomes.loc[(wk_incomes['Monthly Income'] <= level_9, 'Income Level')] = 'Level 9'
wk_incomes.loc[(wk_incomes['Monthly Income'] > level_9, 'Income Level')] = 'Level 10'

y = wk_incomes['Income Level']
X = wk_incomes.drop(columns=['Monthly Income', 'Income Level'], axis=1)

rel_columns = ['Age','Gender', 'Literacy', 'Scholarship','Career', 'Marital Status', 'Urban or Rural', 'Residence','Economic Zone', 'By Position','By Economic Sector']
X = pd.get_dummies(X[rel_columns], drop_first=True, dtype=float)

scaler = MinMaxScaler()
X=pd.DataFrame(scaler.fit_transform(X),
            columns=X.columns, index=X.index) 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state= 42)

clf = DecisionTreeClassifier(criterion='entropy')

clf.fit(X_train, y_train)

y_preds = clf.predict(X_test)

print('Accuracy: ', accuracy_score(y_test, y_preds))

Accuracy:  0.858159029008118
