In [35]:
import pandas as pd
import numpy as np
from sklearn.linear_model import HuberRegressor

In [19]:
df = pd.read_csv('VeinteDatos.csv')
df.loc[df['ID'] == 1, 'Salario'] = 5000
df.loc[df['ID'] == 2, 'Salario'] = 6000
df.loc[df['ID'] == 6, 'Edad'] = 25
df.loc[df['ID'] == 17, 'Edad'] = 32
df.fillna({'Salario': df['Salario'].mean()}, inplace=True)
display(df)

Unnamed: 0,ID,Nombre,Edad,Salario,Fecha_Ingreso
0,1,Ana,23.0,5000.0,2020-01-10
1,2,Luis,35.0,6000.0,2019-06-15
2,3,Carlos,30.0,55941.176471,2020-03-25
3,4,María,22.0,58000.0,
4,5,Elena,28.0,49000.0,2018-07-30
5,6,Jorge,25.0,62000.0,2017-09-14
6,7,Sofía,34.0,72000.0,2016-11-22
7,8,Andrés,45.0,61000.0,2021-04-01
8,9,Laura,29.0,52000.0,2015-05-19
9,10,Pedro,41.0,67000.0,2014-08-25


In [20]:
#rangos intercuartilicos para indentificar datos atipicos
Q1 = df['Salario'].quantile(0.25)
Q3 = df['Salario'].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

#identificacion de datos atipicos
outliers = df[(df['Salario'] < lower_bound) | (df['Salario'] > upper_bound)]
display(outliers)

Unnamed: 0,ID,Nombre,Edad,Salario,Fecha_Ingreso
0,1,Ana,23.0,5000.0,2020-01-10
1,2,Luis,35.0,6000.0,2019-06-15


In [21]:
#eliminar outliers
df_cleaned = df[(df['Salario'] >= lower_bound) & (df['Salario'] <= upper_bound)]
display(df_cleaned)

Unnamed: 0,ID,Nombre,Edad,Salario,Fecha_Ingreso
2,3,Carlos,30.0,55941.176471,2020-03-25
3,4,María,22.0,58000.0,
4,5,Elena,28.0,49000.0,2018-07-30
5,6,Jorge,25.0,62000.0,2017-09-14
6,7,Sofía,34.0,72000.0,2016-11-22
7,8,Andrés,45.0,61000.0,2021-04-01
8,9,Laura,29.0,52000.0,2015-05-19
9,10,Pedro,41.0,67000.0,2014-08-25
10,11,Paula,38.0,55000.0,2013-12-30
11,12,Diego,25.0,53000.0,2012-02-11


In [22]:
#Transformacion de datos
#cada valor de salario le aplico el logaritmo
df['Salario_Log'] = np.log(df['Salario'])
display(df)
#se puede aplciar raiz cuadrada pero eso depende de como lo querramos manejar
#tendencia de estabilidad en el eje x, (asintota horizontal)

Unnamed: 0,ID,Nombre,Edad,Salario,Fecha_Ingreso,Salario_Log
0,1,Ana,23.0,5000.0,2020-01-10,8.517193
1,2,Luis,35.0,6000.0,2019-06-15,8.699515
2,3,Carlos,30.0,55941.176471,2020-03-25,10.932056
3,4,María,22.0,58000.0,,10.968198
4,5,Elena,28.0,49000.0,2018-07-30,10.799576
5,6,Jorge,25.0,62000.0,2017-09-14,11.03489
6,7,Sofía,34.0,72000.0,2016-11-22,11.184421
7,8,Andrés,45.0,61000.0,2021-04-01,11.018629
8,9,Laura,29.0,52000.0,2015-05-19,10.858999
9,10,Pedro,41.0,67000.0,2014-08-25,11.112448


In [23]:
#imputacion de valores atipicos
#significa cambiar valores de datos por valores medios o estadares para no verse perjudicado a los demas datos
#cambio por una mediana
df['Salario_Capped'] = np.where(
    (df['Salario'] < lower_bound) | (df['Salario'] > upper_bound),
    df['Salario'].mean(),
    df['Salario'] #que columna vamos a reemplazar
)
display(df)

Unnamed: 0,ID,Nombre,Edad,Salario,Fecha_Ingreso,Salario_Log,Salario_Capped
0,1,Ana,23.0,5000.0,2020-01-10,8.517193,55941.176471
1,2,Luis,35.0,6000.0,2019-06-15,8.699515,55941.176471
2,3,Carlos,30.0,55941.176471,2020-03-25,10.932056,55941.176471
3,4,María,22.0,58000.0,,10.968198,58000.0
4,5,Elena,28.0,49000.0,2018-07-30,10.799576,49000.0
5,6,Jorge,25.0,62000.0,2017-09-14,11.03489,62000.0
6,7,Sofía,34.0,72000.0,2016-11-22,11.184421,72000.0
7,8,Andrés,45.0,61000.0,2021-04-01,11.018629,61000.0
8,9,Laura,29.0,52000.0,2015-05-19,10.858999,52000.0
9,10,Pedro,41.0,67000.0,2014-08-25,11.112448,67000.0


In [24]:
#binned
#agrupacion de intervalos par suavizar datos atipicos
df['Salario_Binned'] = pd.cut(df['Salario'], bins=5)
display(df)

Unnamed: 0,ID,Nombre,Edad,Salario,Fecha_Ingreso,Salario_Log,Salario_Capped,Salario_Binned
0,1,Ana,23.0,5000.0,2020-01-10,8.517193,55941.176471,"(4932.0, 18600.0]"
1,2,Luis,35.0,6000.0,2019-06-15,8.699515,55941.176471,"(4932.0, 18600.0]"
2,3,Carlos,30.0,55941.176471,2020-03-25,10.932056,55941.176471,"(45800.0, 59400.0]"
3,4,María,22.0,58000.0,,10.968198,58000.0,"(45800.0, 59400.0]"
4,5,Elena,28.0,49000.0,2018-07-30,10.799576,49000.0,"(45800.0, 59400.0]"
5,6,Jorge,25.0,62000.0,2017-09-14,11.03489,62000.0,"(59400.0, 73000.0]"
6,7,Sofía,34.0,72000.0,2016-11-22,11.184421,72000.0,"(59400.0, 73000.0]"
7,8,Andrés,45.0,61000.0,2021-04-01,11.018629,61000.0,"(59400.0, 73000.0]"
8,9,Laura,29.0,52000.0,2015-05-19,10.858999,52000.0,"(45800.0, 59400.0]"
9,10,Pedro,41.0,67000.0,2014-08-25,11.112448,67000.0,"(59400.0, 73000.0]"


In [28]:
#uso de modelos robustos
#se ocupa sklearn
x = df[['Edad']]
y = df['Salario']

In [38]:
huber = HuberRegressor()
huber.fit(x,y)
y_pred = huber.predict(x)
print(y_pred)

[54472.32225396 61707.7430119  58692.98436276 53869.37052413
 57487.0809031  55678.22571362 61104.79128207 67737.26031017
 58090.03263293 65325.45339086 63516.59820138 55678.22571362
 60501.83955224 56884.12917327 59898.88782241 62310.69474172
 59898.88782241 64119.54993121 59295.93609258 64722.50166104]
