In [1]:
import warnings

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import missingno as msno

import statsmodels.api as sm
import statsmodels.formula.api as smf

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_curve, roc_auc_score

import funciones_auxiliares as aux

# Warning suppression is left disabled on purpose; uncomment to silence library warnings.
# warnings.filterwarnings('ignore')

# Global plotting defaults: apply the seaborn theme, then set the default figure size.
sns.set_theme()
plt.rcParams.update({'figure.figsize': (12, 6)})

In [2]:
# Load the raw income dataset; the relative path is resolved against the working directory.
df = pd.read_csv('income-db.csv')

In [3]:
# The source file marks missing entries with a literal '?'. Turn those into NaN
# and keep only the rows that have no missing value in any column.
df_depurada = df.replace('?', np.nan).dropna()

In [64]:
# Recoding tables. Every table has the shape {new_value: [old_values, ...]}:
# recodificar_variable() replaces each old value with the key it is listed under.
# Values that appear in no list are left untouched.

# Occupation -> collar type.
REEMPLAZO_OCCUPATION = {'white-collar': ['Prof-specialty', 'Exec-managerial', 'Adm-clerical', 'Sales',
                                         'Tech-support'],
                        'blue-collar': ['Craft-repair', 'Machine-op-inspct', 'Transport-moving',
                                        'Handlers-cleaners', 'Farming-fishing', 'Protective-serv',
                                        'Priv-house-serv'],
                        'others': ['Other-service', 'Armed-Forces']}

# Work class -> employer type.
REEMPLAZO_WORKCLASS = {'federal-gov': ['Federal-gov'],
                       'state-level-gov': ['State-gov', 'Local-gov'],
                       'self-employed': ['Self-emp-inc', 'Self-emp-not-inc'],
                       'unemployed': ['Never-worked', 'Without-pay']}

# Education -> schooling level.
REEMPLAZO_EDUCATION = {'preschool': ['Preschool'],
                       'elementary-school': ['1st-4th', '5th-6th'],
                       'high-school': ['7th-8th', '9th', '10th', '11th', '12th', 'HS-grad'],
                       'college': ['Assoc-voc', 'Assoc-acdm', 'Some-college'],
                       'university': ['Bachelors', 'Masters', 'Prof-school', 'Doctorate']}

# Marital status -> civil status.
REEMPLAZO_MARITAL = {'married': ['Married-civ-spouse', 'Married-spouse-absent', 'Married-AF-spouse'],
                     'divorced': ['Divorced'],
                     'separated': ['Separated'],
                     'widowed': ['Widowed']}

# Native country -> continent. The country spellings (e.g. "Columbia",
# "Trinadad&Tobago", "Holand-Netherlands", "Hong") are matched literally, so
# they must stay exactly as written here.
# "Honduras" is a Central American country and is therefore grouped under
# 'america' (it had been listed under 'europe').
REEMPLAZO_COUNTRY = {'america': ["United-States", "Mexico", "Puerto-Rico", "Canada", "El-Salvador",
                                 "Cuba", "Jamaica", "Dominican-Republic", "Guatemala", "Columbia",
                                 "Haiti", "Nicaragua", "Peru", "Ecuador", "Trinadad&Tobago",
                                 "Outlying-US(Guam-USVI-etc)", "Honduras"],
                     'asia': ["Philippines", "India", "China", "Japan", "Vietnam", "Taiwan",
                              "Iran", "Hong", "Thailand", "Cambodia", "Laos"],
                     'europe': ["Germany", "England", "Italy", "Poland", "Portugal", "Greece",
                                "France", "Ireland", "Yugoslavia", "Scotland",
                                "Hungary", "Holand-Netherlands"],
                     # No source label maps to this group; kept as an explicit empty entry.
                     'oceania': [],
                     # NOTE(review): "South" is a truncated label; it is unclear whether it stands
                     # for South Africa or South Korea -- confirm against the dataset documentation.
                     'africa': ["South"]}

# Binary encodings for the target and for sex.
REEMPLAZO_INCOME = {0: ['<=50K'], 1: ['>50K']}
REEMPLAZO_SEX = {0: ['Male'], 1: ['Female']}

# All recoding tables, in the order in which recodificar_enunciado_uno() applies them.
RECODIFICACION_ENUNCIADO_1 = [REEMPLAZO_OCCUPATION, REEMPLAZO_WORKCLASS, REEMPLAZO_EDUCATION,
                              REEMPLAZO_MARITAL, REEMPLAZO_COUNTRY, REEMPLAZO_INCOME, REEMPLAZO_SEX]

def recodificar_variable(df, diccionario_cambio):
    """Return a copy of ``df`` in which old labels are replaced by grouped labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to recode. It is copied first and never modified.
    diccionario_cambio : dict
        Maps each new value to the list of old values it stands for.

    Returns
    -------
    pandas.DataFrame
        The recoded copy.

    Notes
    -----
    The replacement is not restricted to one column: every cell of ``df`` that
    equals one of the old values is replaced. Groups are applied one after
    another, in the dictionary's iteration order.
    """
    recodificado = df.copy()
    for valor_nuevo in diccionario_cambio:
        valores_antiguos = diccionario_cambio[valor_nuevo]
        recodificado = recodificado.replace(valores_antiguos, valor_nuevo)

    return recodificado

def recodificar_enunciado_uno(df):
    """Apply every recoding table in ``RECODIFICACION_ENUNCIADO_1`` to ``df``.

    The tables are applied in list order, each one on the result of the
    previous one, via ``recodificar_variable``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to recode. It is copied first and never modified.

    Returns
    -------
    pandas.DataFrame
        The fully recoded copy.
    """
    resultado = df.copy()
    for tabla_recodificacion in RECODIFICACION_ENUNCIADO_1:
        resultado = recodificar_variable(resultado, tabla_recodificacion)

    return resultado

def codificar_a_one_hot(df, nombre_columna, serie_columna):
    """Swap one categorical column of ``df`` for its indicator (dummy) columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the column ``nombre_columna``. Not modified.
    nombre_columna : str
        Name of the column to remove from the result.
    serie_columna : pandas.Series
        Values to encode; joined back to ``df`` on the index.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` without ``nombre_columna`` and with one indicator column
        per category of ``serie_columna`` except the first, which is dropped as
        the reference level. Indicator columns are named after the category
        values themselves, with no prefix.
    """
    # drop_first=True omits the first category so the remaining indicators are
    # not perfectly collinear with an intercept.
    indicadoras = pd.get_dummies(serie_columna, drop_first=True)
    con_indicadoras = df.copy().join(indicadoras)

    return con_indicadoras.drop(columns=nombre_columna)

def one_hot_todas_las_categoricas(df):
    """One-hot encode every categorical column of ``df``.

    The categorical columns are whatever
    ``aux.separar_df_a_numericas_categoricas`` returns as its second element;
    each one is replaced through ``codificar_a_one_hot``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to encode. It is copied first and never modified here.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with each categorical column replaced by its indicator
        columns.
    """
    codificado = df.copy()

    # Only the categorical part of the split is needed; the numeric part is ignored.
    _, columnas_categoricas = aux.separar_df_a_numericas_categoricas(df)
    for nombre, valores in columnas_categoricas.items():
        codificado = codificar_a_one_hot(codificado, nombre, valores)

    return codificado


In [65]:
# Step 1: collapse the raw category labels into the coarser groups of the recoding tables.
df_recodificada = recodificar_enunciado_uno(df_depurada)
# Step 2: replace every remaining categorical column with its indicator columns.
df_final = one_hot_todas_las_categoricas(df_recodificada)

In [73]:
# Build the model formula: `income` explained by every other column of df_final.
# Several column names (e.g. 'educational-num', 'capital-gain', 'Own-child',
# 'state-level-gov') are not valid Python identifiers; written bare, the formula
# parser would read the hyphen as an operator instead of part of the name.
# Wrapping each name in Q("...") makes the formula refer to the column literally.
columnas_predictoras = df_final.drop(columns='income').columns
variables_independientes = ' + '.join(f'Q("{columna}")' for columna in columnas_predictoras)
formula_final = f'income ~ {variables_independientes}'

In [66]:
# Fit a logistic regression of `income` on the predictors named in `formula_final`,
# using the statsmodels formula interface on df_final.
# NOTE(review): this cell's execution counter is lower than that of the cell defining
# `formula_final`, so any saved result may come from an older formula -- re-run top to bottom.
modelo = smf.logit(formula_final, df_final).fit()

Unnamed: 0,age,fnlwgt,educational-num,gender,capital-gain,capital-loss,hours-per-week,income,federal-gov,self-employed,...,Own-child,Unmarried,Wife,Asian-Pac-Islander,Black,Other,White,america,asia,europe
0,25,226802,7,0,0,0,40,0,0,0,...,1,0,0,0,1,0,0,1,0,0
1,38,89814,9,0,0,0,50,0,0,0,...,0,0,0,0,0,0,1,1,0,0
2,28,336951,12,0,0,0,40,1,0,0,...,0,0,0,0,0,0,1,1,0,0
3,44,160323,10,0,7688,0,40,1,0,0,...,0,0,0,0,1,0,0,1,0,0
5,34,198693,6,0,0,0,30,0,0,0,...,0,0,0,0,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,257302,12,1,0,0,38,0,0,0,...,0,0,1,0,0,0,1,1,0,0
48838,40,154374,9,0,0,0,40,1,0,0,...,0,0,0,0,0,0,1,1,0,0
48839,58,151910,9,1,0,0,40,0,0,0,...,0,1,0,0,0,0,1,1,0,0
48840,22,201490,9,0,0,0,20,0,0,0,...,1,0,0,0,0,0,1,1,0,0
