# Regresión lineal del logaritmo del salario

In [15]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler

In [16]:
# Build the modelling frame: a copy of df without the columns listed below.
# drop() returns a new DataFrame, so df itself is not modified.
df1 = df.drop(
    columns=[
        'terminocontrato',
        'educ_publica',
        'DIRECTORIO',
        'ORDEN',
        'HOGAR',
        'orden_conyugue',
        'conyuge_en_hogar',
        'ocupados',
    ]
)


In [17]:
# One-hot encode the categorical columns of df1; dtype=int makes the
# indicator columns 0/1 integers.
df1 = pd.get_dummies(data=df1, dtype=int)

In [18]:
# Keep only rows whose INGLABO is strictly positive so that its logarithm is
# defined. The comparison also discards negative and missing (NaN) values,
# not just zeros.
# The explicit copy makes df1 an independent frame, so adding a column below
# never goes through pandas' chained-assignment (SettingWithCopy) path.
df1 = df1[df1['INGLABO'] > 0].copy()
# Squared 'educacion' term, added as an extra regressor.
df1['educacion**2'] = df1['educacion'] ** 2

In [19]:
# Discard every row that contains at least one missing value.
df1 = df1.dropna(axis=0, how='any')

## Análisis de correlación de variables con el logaritmo del salario

In [20]:
def correlacion(datos, variable: str, umbral: float = 0.1):
    """Return the correlations with ``variable`` that are at least ``umbral``.

    Args:
        datos: DataFrame whose pairwise column correlations are computed
            with ``DataFrame.corr``.
        variable: Label of the column to correlate against all the others.
        umbral: Minimum correlation to keep. Only the lower bound is
            checked, so strong negative correlations are not returned.

    Returns:
        Series indexed by column label, sorted from highest to lowest
        correlation and restricted to values ``>= umbral``. The entry for
        ``variable`` itself is part of the result.

    Raises:
        KeyError: If ``variable`` is not a label of the correlation matrix.
    """
    correlaciones = datos.corr().loc[variable, :].sort_values(ascending=False)
    # NaN correlations compare as False here, so they are excluded as well.
    return correlaciones[correlaciones >= umbral]
    

In [21]:
# Show which columns correlate most with INGLABO. The correlation is taken on
# INGLABO as stored in df1; the log transform is only applied further down.
correlacion(datos=df1, variable='INGLABO')

INGLABO                          1.000000
educacion**2                     0.470052
educacion                        0.408543
carroparticular                  0.364283
tipocupacion_empleadogobierno    0.306786
computador                       0.287030
internet                         0.222275
tipocontrato_Verbal              0.213507
lavadora                         0.195068
television                       0.177770
tipocupacion_empleador           0.161884
estado_civil_Casados             0.156220
#_cuartos_vivienda               0.138805
nevera                           0.121666
mesestrabajoultimoaño            0.115561
Name: INGLABO, dtype: float64

In [22]:
# Target: natural logarithm of INGLABO (the author cites the economics
# literature as the reason for modelling log wages).
y = np.log(df1['INGLABO'].values)
# Feature matrix: every column except the target and the dummy columns that,
# per the author's note, have no observations. Each label is listed once;
# repeating a label in drop() adds nothing.
X = df1.drop(
    columns=[
        'INGLABO',
        'tipocupacion_familiarnoremu',
        'tipocupacion_empleadonoremu',
        'tipocupacion_otro',
    ]
).values

## Modelo de regresión lineal normal

In [23]:
def regresion_lineal(X, y):
    """Fit a linear regression on a standardized split and score it on held-out data.

    The data is split with ``test_size=0.33`` and ``random_state=42``. The
    scaler is fitted on the training rows only and then applied to both
    partitions, so no test information reaches the fitted model.

    Args:
        X: Feature matrix accepted by ``train_test_split``.
        y: Target values aligned with the rows of ``X``.

    Returns:
        dict with the test-set ``'rmse'``, ``'mse'`` and ``'r2'``, plus the
        number of training observations. Errors are in the units of ``y``.
    """
    X_entrena, X_prueba, y_entrena, y_prueba = train_test_split(
        X, y, test_size=0.33, random_state=42
    )

    # Standardization parameters come from the training partition only.
    escalador = StandardScaler().fit(X_entrena)
    estimador = LinearRegression().fit(escalador.transform(X_entrena), y_entrena)

    predicciones = estimador.predict(escalador.transform(X_prueba))

    # The MSE is computed once; the RMSE is its square root.
    error_cuadratico = mean_squared_error(y_prueba, predicciones)
    return {
        'rmse': np.sqrt(error_cuadratico),
        'mse': error_cuadratico,
        'r2': r2_score(y_prueba, predicciones),
        'numero de observaciones de entrenamiento': len(y_entrena),
    }

In [24]:
# Fit and evaluate the plain linear model, then print its hold-out metrics.
modelo = regresion_lineal(X=X, y=y)
print(modelo)


{'rmse': 0.610304340234178, 'mse': 0.3724713877086753, 'r2': 0.4809610187469888, 'numero de observaciones de entrenamiento': 5666}


## Modelo ridge

In [25]:
from sklearn.linear_model import Ridge

In [26]:
def regresion_ridge(X, y, alpha):
    """Fit a Ridge regression on a standardized split and score it on held-out data.

    The data is split with ``test_size=0.33`` and ``random_state=42``. The
    scaler is fitted on the training rows only and then applied to both
    partitions.

    Args:
        X: Feature matrix accepted by ``train_test_split``.
        y: Target values aligned with the rows of ``X``.
        alpha: Regularization strength handed to ``Ridge``.

    Returns:
        dict with the test-set ``'rmse'``, ``'mse'`` and ``'r2'``, plus the
        number of training observations. Errors are in the units of ``y``.
    """
    X_entrena, X_prueba, y_entrena, y_prueba = train_test_split(
        X, y, test_size=0.33, random_state=42
    )

    # Standardization parameters come from the training partition only.
    escalador = StandardScaler().fit(X_entrena)
    estimador = Ridge(alpha=alpha).fit(escalador.transform(X_entrena), y_entrena)

    predicciones = estimador.predict(escalador.transform(X_prueba))

    # The MSE is computed once; the RMSE is its square root.
    error_cuadratico = mean_squared_error(y_prueba, predicciones)
    return {
        'rmse': np.sqrt(error_cuadratico),
        'mse': error_cuadratico,
        'r2': r2_score(y_prueba, predicciones),
        'numero de observaciones de entrenamiento': len(y_entrena),
    }

In [27]:
# Evaluate the Ridge model with regularization strength 1.
regresion_ridge(X, y, alpha=1)

{'rmse': 0.6097199837289877,
 'mse': 0.37175845855847706,
 'r2': 0.48195448571394395,
 'numero de observaciones de entrenamiento': 5666}

## Modelo Lasso

In [28]:
from sklearn.linear_model import Lasso

In [29]:
def regresion_lasso(X, y, alpha):
    """Fit a Lasso regression on a standardized split and score it on held-out data.

    The data is split with ``test_size=0.33`` and ``random_state=42``. The
    scaler is fitted on the training rows only and then applied to both
    partitions.

    Args:
        X: Feature matrix accepted by ``train_test_split``.
        y: Target values aligned with the rows of ``X``.
        alpha: Regularization strength handed to ``Lasso``.

    Returns:
        dict with the test-set ``'rmse'``, ``'mse'`` and ``'r2'``, plus the
        number of training observations. Errors are in the units of ``y``.
    """
    X_entrena, X_prueba, y_entrena, y_prueba = train_test_split(
        X, y, test_size=0.33, random_state=42
    )

    # Standardization parameters come from the training partition only.
    escalador = StandardScaler().fit(X_entrena)
    # Lasso model (L1 penalty), fitted on the standardized training rows.
    estimador = Lasso(alpha=alpha).fit(escalador.transform(X_entrena), y_entrena)

    predicciones = estimador.predict(escalador.transform(X_prueba))

    # The MSE is computed once; the RMSE is its square root.
    error_cuadratico = mean_squared_error(y_prueba, predicciones)
    return {
        'rmse': np.sqrt(error_cuadratico),
        'mse': error_cuadratico,
        'r2': r2_score(y_prueba, predicciones),
        'numero de observaciones de entrenamiento': len(y_entrena),
    }

In [30]:
# Evaluate the Lasso model with regularization strength 0.005.
regresion_lasso(X, y, alpha=0.005)

{'rmse': 0.611585434787389,
 'mse': 0.37403674404407966,
 'r2': 0.4787796942629142,
 'numero de observaciones de entrenamiento': 5666}