# Regresión múltiple

Ejemplo de pipeline de preprocesado y regresión.

Objetivo: predecir price que significa precio de vivienda.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [17]:
df = pd.read_csv('../../data/duke-forest-nulls.csv')
df.tail(1)

Unnamed: 0,address,price,bed,bath,area,type,year_built,heating,cooling,parking,lot,hoa,url
97,"2708 Circle Dr, Durham, NC 27705",674500,4,4.0,3766.0,Single Family,1955.0,"Forced air, Electric, Gas",other,0 spaces,0.73,,https://www.zillow.com/homedetails/2708-Circle...


## Valores faltantes con Scikit Learn

Técnicas de imputación de valores faltantes: sklearn.impute

### Ejemplo demo

In [55]:
from sklearn.experimental import enable_iterative_imputer
# CUIDADO: enable_iterative_imputer va antes de IterativeImputer para poder usarlo
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer

df = pd.DataFrame({
    'edad': [25, np.nan, 40, 35, 60, np.nan],
    'genero': ['masculino', 'femenino', np.nan, 'masculino', 'masculino', 'femenino'],
    'ingresos': [30000, 70000, np.nan, 80000, np.nan, 40000]
})

# Opción 1 SimpleImputer por media o mediana:
# imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
# df['edad'] = imp_mean.fit_transform(df[['edad']])

# Opción 2 KNNImputer: media pero de las filas más cercanas según el algoritmo de KNN
# imp_knn = KNNImputer(n_neighbors=2) # calcula la media de los n_neighbors más cercanos
# df['edad'] = imp_knn.fit_transform(df[['edad']])
# df['ingresos'] = imp_knn.fit_transform(df[['ingresos']])

# Opción 3 SimpleImputer constante: asignamos un valor fijo
# imp_constant = SimpleImputer(strategy='constant', fill_value='other')
# df['genero'] = imp_constant.fit_transform(df[['genero']])

# Opción 4 SimpleImputer con el valor más frecuente (moda) 
# imp_mode = SimpleImputer(strategy='most_frequent')
# df['genero'] = imp_mode.fit_transform(df[['genero']])

# Opción 5 IterativeImputer con una predicción utilizando una regresión
imp_iter = IterativeImputer(random_state=42)
df['ingresos'] = imp_iter.fit_transform(df[['ingresos']])

df.head()

Unnamed: 0,edad,genero,ingresos
0,25.0,masculino,30000.0
1,,femenino,70000.0
2,40.0,,55000.0
3,35.0,masculino,80000.0
4,60.0,masculino,55000.0


### Ejemplo duke forest

In [18]:
df = pd.read_csv('../../data/duke-forest-nulls.csv')

df.isnull().sum()

address        0
price          0
bed            0
bath           0
area           2
type           2
year_built     4
heating        1
cooling        1
parking        0
lot            1
hoa           97
url            0
dtype: int64

In [19]:
# Borrar hoa
df = df.drop(['hoa'], axis=1)

In [20]:
from sklearn.impute import SimpleImputer

# Estrategias para numéricos: mean, median
imp_mean = SimpleImputer(missing_values=np.nan, strategy='median')
df['lot'] = imp_mean.fit_transform(df[['lot']]) # fit_transform recibe una matriz 2D, un dataframe

In [21]:
df.isnull().sum()

address       0
price         0
bed           0
bath          0
area          2
type          2
year_built    4
heating       1
cooling       1
parking       0
lot           0
url           0
dtype: int64

In [22]:
# categóricos: most_frequent, constant
imp_most_freq = SimpleImputer(missing_values=np.nan, strategy='most_frequent') # moda
df['type'] = imp_most_freq.fit_transform(df[['type']])

In [23]:
df.isnull().sum()

address       0
price         0
bed           0
bath          0
area          2
type          0
year_built    4
heating       1
cooling       1
parking       0
lot           0
url           0
dtype: int64

## Regresión lineal simple