In [1]:
import pandas as pd
import numpy as np

In [12]:
# Synthetic employee table used by every later cell: 20 rows with a few
# deliberately missing names, salaries and hire dates, plus two very low
# salaries (5000 and 6000) that act as outliers.
data = {
    'ID': range(1, 21),
    'Nombre': ['Ana', 'Luis', 'Carlos', 'María', 'Elena', 'Jorge', 'Sofía', 'Andrés', 'Laura', 'Pedro',
               'Paula', 'Diego', 'Carmen', 'Marta', None, 'Pablo', 'Lucía', 'Manuel', 'Carla', 'David'],
    'Edad': [23, 35, 30, 22, 28, 25, 34, 45, 29, 41, 38, 25, 33, 27, 32, 36, 32, 39, 31, 40],
    # NOTE(review): the 10th salary was written as 670000, but every recorded
    # output of this notebook (imputed mean 55941.18, median 59500, highest bin
    # edge 73000) corresponds to 67000, so the extra zero is treated as a typo.
    # Confirm against the data source.
    'Salario': [5000, 6000, np.nan, 58000, 49000, 62000, 72000, 61000, 52000, 67000,
                55000, 53000, np.nan, 69000, 64000, 71000, 68000, 73000, np.nan, 66000],
    'Fecha_Ingreso': ['2020-01-10', '2019-06-15', '2020-03-25', None, '2018-07-30', '2017-09-14',
                      '2016-11-22', '2021-04-01', '2015-05-19', '2014-08-25', '2013-12-30', '2012-02-11',
                      '2011-10-17', None, '2020-09-09', '2018-04-05', '2019-11-22', '2010-07-27', '2020-05-13', '2017-01-29']
}

# Keep the untouched frame so this cell is idempotent and the raw values
# (including the NaN salaries) stay available for inspection.
raw_df = pd.DataFrame(data)

# Fill missing salaries with the mean of the observed ones. A new frame is
# derived instead of mutating in place, so re-running the cell cannot compound
# state. Only 'Salario' is imputed; 'Nombre' and 'Fecha_Ingreso' keep their gaps.
df = raw_df.fillna({'Salario': raw_df['Salario'].mean()})

# Show a preview rather than dumping the whole frame.
df.head(10)

Unnamed: 0,ID,Nombre,Edad,Salario,Fecha_Ingreso
0,1,Ana,23,5000.0,2020-01-10
1,2,Luis,35,6000.0,2019-06-15
2,3,Carlos,30,55941.176471,2020-03-25
3,4,María,22,58000.0,
4,5,Elena,28,49000.0,2018-07-30
5,6,Jorge,25,62000.0,2017-09-14
6,7,Sofía,34,72000.0,2016-11-22
7,8,Andrés,45,61000.0,2021-04-01
8,9,Laura,29,52000.0,2015-05-19
9,10,Pedro,41,67000.0,2014-08-25


In [13]:
# Tukey-style outlier detection on 'Salario': a value is flagged when it lies
# more than 1.5 interquartile ranges below the first quartile or above the
# third quartile.
salarios = df['Salario']
Q1, Q3 = salarios.quantile(0.25), salarios.quantile(0.75)
IQR = Q3 - Q1

# Fences reused by the later filtering and replacement cells.
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Strict comparisons: values exactly on a fence are not flagged.
is_outlier = salarios.lt(lower_bound) | salarios.gt(upper_bound)
outliers = df.loc[is_outlier]
display(outliers)

Unnamed: 0,ID,Nombre,Edad,Salario,Fecha_Ingreso
0,1,Ana,23,5000.0,2020-01-10
1,2,Luis,35,6000.0,2019-06-15


In [14]:
# Keep only the rows whose salary lies inside the fences (both ends inclusive),
# i.e. the complement of the rows flagged as outliers.
within_fences = df['Salario'].between(lower_bound, upper_bound)
df_cleaned = df.loc[within_fences]
display(df_cleaned)

Unnamed: 0,ID,Nombre,Edad,Salario,Fecha_Ingreso
2,3,Carlos,30,55941.176471,2020-03-25
3,4,María,22,58000.0,
4,5,Elena,28,49000.0,2018-07-30
5,6,Jorge,25,62000.0,2017-09-14
6,7,Sofía,34,72000.0,2016-11-22
7,8,Andrés,45,61000.0,2021-04-01
8,9,Laura,29,52000.0,2015-05-19
9,10,Pedro,41,67000.0,2014-08-25
10,11,Paula,38,55000.0,2013-12-30
11,12,Diego,25,53000.0,2012-02-11


In [5]:
# Add the natural logarithm of each salary as a new column and show it next to
# the original values.
log_salario = np.log(df['Salario'])
df['Salario_Log'] = log_salario
display(df.loc[:, ['Salario', 'Salario_Log']])

Unnamed: 0,Salario,Salario_Log
0,5000.0,8.517193
1,6000.0,8.699515
2,55941.176471,10.932056
3,58000.0,10.968198
4,49000.0,10.799576
5,62000.0,11.03489
6,72000.0,11.184421
7,61000.0,11.018629
8,52000.0,10.858999
9,67000.0,11.112448


In [15]:
# Build 'Salario_Capped': salaries outside the fences are replaced by the
# median of the whole 'Salario' column; every other salary is copied unchanged.
# Despite the column name, values are substituted with the median rather than
# clipped to the nearest fence.
salarios = df['Salario']
outside_fences = salarios.lt(lower_bound) | salarios.gt(upper_bound)
df['Salario_Capped'] = salarios.mask(outside_fences, salarios.median())

display(df.loc[:, ['Salario', 'Salario_Capped']])

Unnamed: 0,Salario,Salario_Capped
0,5000.0,59500.0
1,6000.0,59500.0
2,55941.176471,55941.176471
3,58000.0,58000.0
4,49000.0,49000.0
5,62000.0,62000.0
6,72000.0,72000.0
7,61000.0,61000.0
8,52000.0,52000.0
9,67000.0,67000.0


In [16]:
# Discretise salaries into 5 equal-width intervals spanning the observed
# range and store the interval label of each row.
salary_bins = pd.cut(df['Salario'], bins=5)
df['Salario_Binned'] = salary_bins
display(df.loc[:, ['Salario', 'Salario_Binned']])

Unnamed: 0,Salario,Salario_Binned
0,5000.0,"(4932.0, 18600.0]"
1,6000.0,"(4932.0, 18600.0]"
2,55941.176471,"(45800.0, 59400.0]"
3,58000.0,"(45800.0, 59400.0]"
4,49000.0,"(45800.0, 59400.0]"
5,62000.0,"(59400.0, 73000.0]"
6,72000.0,"(59400.0, 73000.0]"
7,61000.0,"(59400.0, 73000.0]"
8,52000.0,"(45800.0, 59400.0]"
9,67000.0,"(59400.0, 73000.0]"


In [17]:
from sklearn.linear_model import HuberRegressor

In [18]:
# Regression inputs: a one-column feature frame (age) and the salary target.
feature_cols = ['Edad']
target_col = 'Salario'
X = df[feature_cols]
y = df[target_col]

In [19]:
# Fit a Huber regressor (default hyper-parameters) of salary on age, then
# print its in-sample predictions for the same rows it was trained on.
huber = HuberRegressor().fit(X, y)  # fit() returns the fitted estimator itself
y_pred = huber.predict(X)
print(y_pred)

[54472.32225399 61707.74301189 58692.98436277 53869.37052417
 57487.08090312 55678.22571364 61104.79128207 67737.26031014
 58090.03263294 65325.45339084 63516.59820137 55678.22571364
 60501.83955224 56884.12917329 59898.88782242 62310.69474172
 59898.88782242 64119.54993119 59295.93609259 64722.50166102]
