In [1]:
# Tratamiento de datos
# ==============================================================================
import numpy as np
import pandas as pd
import statsmodels.api as sm

# Gráficos
# ==============================================================================
import matplotlib.pyplot as plt
import matplotlib.font_manager
from matplotlib import style
style.use('ggplot') or plt.style.use('ggplot')

# Preprocesado y modelado
# ==============================================================================
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import scale
from sklearn.pipeline import Pipeline

# Configuración warnings
# ==============================================================================
import warnings
warnings.filterwarnings('ignore')

In [29]:
df_or = pd.read_csv("./data/train.csv")
df_or

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [30]:
# Eliminamos Id y SalePrice para trabajar el dataset ya que no será necesario
df = df_or.drop(columns=["Id","SalePrice"])
df

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,8,2007,WD,Normal
1456,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,MnPrv,,0,2,2010,WD,Normal
1457,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,GdPrv,Shed,2500,5,2010,WD,Normal
1458,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,4,2010,WD,Normal


In [31]:
# Analizamos el dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 79 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   MSZoning       1460 non-null   object 
 2   LotFrontage    1201 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   Street         1460 non-null   object 
 5   Alley          91 non-null     object 
 6   LotShape       1460 non-null   object 
 7   LandContour    1460 non-null   object 
 8   Utilities      1460 non-null   object 
 9   LotConfig      1460 non-null   object 
 10  LandSlope      1460 non-null   object 
 11  Neighborhood   1460 non-null   object 
 12  Condition1     1460 non-null   object 
 13  Condition2     1460 non-null   object 
 14  BldgType       1460 non-null   object 
 15  HouseStyle     1460 non-null   object 
 16  OverallQual    1460 non-null   int64  
 17  OverallCond    1460 non-null   int64  
 18  YearBuil

In [32]:
# Vemos cuales se pueden transformar y en que se deberian transformar
df.head(20)

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal
5,50,RL,85.0,14115,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,MnPrv,Shed,700,10,2009,WD,Normal
6,20,RL,75.0,10084,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,8,2007,WD,Normal
7,60,RL,,10382,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,Shed,350,11,2009,WD,Normal
8,50,RM,51.0,6120,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,4,2008,WD,Abnorml
9,190,RL,50.0,7420,Pave,,Reg,Lvl,AllPub,Corner,...,0,0,,,,0,1,2008,WD,Normal


In [33]:
# Determinamos variables que son categóricas
categorical_columns = df.select_dtypes(include=[object]).columns.to_list()
categorical_columns

['MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'SaleType',
 'SaleCondition']

In [34]:
# Hacemos un bucle con las columnas categoricas para cambiar Nans por "Unknown"
for column in categorical_columns:
    df[column] = df[column].fillna("Missing")

In [35]:
df

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,Missing,Reg,Lvl,AllPub,Inside,...,0,0,Missing,Missing,Missing,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,Missing,Reg,Lvl,AllPub,FR2,...,0,0,Missing,Missing,Missing,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,Missing,IR1,Lvl,AllPub,Inside,...,0,0,Missing,Missing,Missing,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,Missing,IR1,Lvl,AllPub,Corner,...,0,0,Missing,Missing,Missing,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,Missing,IR1,Lvl,AllPub,FR2,...,0,0,Missing,Missing,Missing,0,12,2008,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,RL,62.0,7917,Pave,Missing,Reg,Lvl,AllPub,Inside,...,0,0,Missing,Missing,Missing,0,8,2007,WD,Normal
1456,20,RL,85.0,13175,Pave,Missing,Reg,Lvl,AllPub,Inside,...,0,0,Missing,MnPrv,Missing,0,2,2010,WD,Normal
1457,70,RL,66.0,9042,Pave,Missing,Reg,Lvl,AllPub,Inside,...,0,0,Missing,GdPrv,Shed,2500,5,2010,WD,Normal
1458,20,RL,68.0,9717,Pave,Missing,Reg,Lvl,AllPub,Inside,...,0,0,Missing,Missing,Missing,0,4,2010,WD,Normal


In [36]:
# Hacemos un bucle para ver las variables categoricas y lo que incluyen para evaluar su potencial
for column in categorical_columns:
    print(column,":",df[column].unique())

MSZoning : ['RL' 'RM' 'C (all)' 'FV' 'RH']
Street : ['Pave' 'Grvl']
Alley : ['Missing' 'Grvl' 'Pave']
LotShape : ['Reg' 'IR1' 'IR2' 'IR3']
LandContour : ['Lvl' 'Bnk' 'Low' 'HLS']
Utilities : ['AllPub' 'NoSeWa']
LotConfig : ['Inside' 'FR2' 'Corner' 'CulDSac' 'FR3']
LandSlope : ['Gtl' 'Mod' 'Sev']
Neighborhood : ['CollgCr' 'Veenker' 'Crawfor' 'NoRidge' 'Mitchel' 'Somerst' 'NWAmes'
 'OldTown' 'BrkSide' 'Sawyer' 'NridgHt' 'NAmes' 'SawyerW' 'IDOTRR'
 'MeadowV' 'Edwards' 'Timber' 'Gilbert' 'StoneBr' 'ClearCr' 'NPkVill'
 'Blmngtn' 'BrDale' 'SWISU' 'Blueste']
Condition1 : ['Norm' 'Feedr' 'PosN' 'Artery' 'RRAe' 'RRNn' 'RRAn' 'PosA' 'RRNe']
Condition2 : ['Norm' 'Artery' 'RRNn' 'Feedr' 'PosN' 'PosA' 'RRAn' 'RRAe']
BldgType : ['1Fam' '2fmCon' 'Duplex' 'TwnhsE' 'Twnhs']
HouseStyle : ['2Story' '1Story' '1.5Fin' '1.5Unf' 'SFoyer' 'SLvl' '2.5Unf' '2.5Fin']
RoofStyle : ['Gable' 'Hip' 'Gambrel' 'Mansard' 'Flat' 'Shed']
RoofMatl : ['CompShg' 'WdShngl' 'Metal' 'WdShake' 'Membran' 'Tar&Grv' 'Roll'
 'ClyTil

In [40]:
# Lidiamos con variables categoricas cuyas categorias tienen un orden lógico
variables_con_orden = [
    'LotShape',        # ['Reg', 'IR1', 'IR2', 'IR3'] (Regularidad del terreno)
    'LandSlope',       # ['Gtl', 'Mod', 'Sev'] (Pendiente del terreno)
    'ExterQual',       # ['Gd', 'TA', 'Ex', 'Fa'] (Calidad externa)
    'ExterCond',       # ['TA', 'Gd', 'Fa', 'Po', 'Ex'] (Condición externa)
    'BsmtQual',        # ['Gd', 'TA', 'Ex', 'Missing', 'Fa'] (Calidad del sótano)
    'BsmtCond',        # ['TA', 'Gd', 'Missing', 'Fa', 'Po'] (Condición del sótano)
    'BsmtExposure',    # ['No', 'Gd', 'Mn', 'Av', 'Missing'] (Exposición del sótano)
    'BsmtFinType1',    # ['GLQ', 'ALQ', 'Unf', 'Rec', 'BLQ', 'Missing', 'LwQ']
    'BsmtFinType2',    # ['Unf', 'BLQ', 'Missing', 'ALQ', 'Rec', 'LwQ', 'GLQ']
    'HeatingQC',       # ['Ex', 'Gd', 'TA', 'Fa', 'Po'] (Calidad de calefacción)
    'KitchenQual',     # ['Gd', 'TA', 'Ex', 'Fa'] (Calidad de cocina)
    'Functional',      # ['Typ', 'Min1', 'Maj1', 'Min2', 'Mod', 'Maj2', 'Sev']
    'FireplaceQu',     # ['Missing', 'TA', 'Gd', 'Fa', 'Ex', 'Po'] (Calidad de chimenea)
    'GarageFinish',    # ['RFn', 'Unf', 'Fin', 'Missing'] (Acabado del garaje)
    'GarageQual',      # ['TA', 'Fa', 'Gd', 'Missing', 'Ex', 'Po'] (Calidad del garaje)
    'GarageCond',      # ['TA', 'Fa', 'Missing', 'Gd', 'Po', 'Ex'] (Condición del garaje)
    'PoolQC',          # ['Missing', 'Ex', 'Fa', 'Gd'] (Calidad de la piscina)
    'Fence'            # ['Missing', 'MnPrv', 'GdWo', 'GdPrv', 'MnWw'] (Calidad de la cerca)
]


In [38]:
# Tesla hizo el trabajo de crear un diccionario con los valores que deberia tener cada categoria para cada variable
# Con este diccionario podemos hacer un bucle que sustituya los valores.
diccionario_asignacion = {
    'LotShape': {'Reg': 0, 'IR1': 1, 'IR2': 2, 'IR3': 3},  # Regularidad del terreno
    'LandSlope': {'Gtl': 0, 'Mod': 1, 'Sev': 2},  # Pendiente del terreno
    'ExterQual': {'Ex': 3, 'Gd': 2, 'TA': 1, 'Fa': 0},  # Calidad externa
    'ExterCond': {'Ex': 3, 'Gd': 2, 'TA': 1, 'Fa': 0, 'Po': -1},  # Condición externa
    'BsmtQual': {'Ex': 3, 'Gd': 2, 'TA': 1, 'Fa': 0, 'Missing': -1},  # Calidad del sótano
    'BsmtCond': {'Ex': 3, 'Gd': 2, 'TA': 1, 'Fa': 0, 'Po': -1, 'Missing': -1},  # Condición del sótano
    'BsmtExposure': {'Gd': 3, 'Av': 2, 'Mn': 1, 'No': 0, 'Missing': -1},  # Exposición del sótano
    'BsmtFinType1': {'GLQ': 5, 'ALQ': 4, 'BLQ': 3, 'Rec': 2, 'LwQ': 1, 'Unf': 0, 'Missing': -1},  # Tipo de sótano
    'BsmtFinType2': {'GLQ': 5, 'ALQ': 4, 'BLQ': 3, 'Rec': 2, 'LwQ': 1, 'Unf': 0, 'Missing': -1},  # Segundo tipo de sótano
    'HeatingQC': {'Ex': 4, 'Gd': 3, 'TA': 2, 'Fa': 1, 'Po': 0},  # Calidad de calefacción
    'KitchenQual': {'Ex': 3, 'Gd': 2, 'TA': 1, 'Fa': 0},  # Calidad de cocina
    'Functional': {'Typ': 5, 'Min1': 4, 'Min2': 3, 'Mod': 2, 'Maj1': 1, 'Maj2': 0, 'Sev': -1},  # Funcionalidad
    'FireplaceQu': {'Ex': 4, 'Gd': 3, 'TA': 2, 'Fa': 1, 'Po': 0, 'Missing': -1},  # Calidad de chimenea
    'GarageFinish': {'Fin': 2, 'RFn': 1, 'Unf': 0, 'Missing': -1},  # Acabado del garaje
    'GarageQual': {'Ex': 4, 'Gd': 3, 'TA': 2, 'Fa': 1, 'Po': 0, 'Missing': -1},  # Calidad del garaje
    'GarageCond': {'Ex': 4, 'Gd': 3, 'TA': 2, 'Fa': 1, 'Po': 0, 'Missing': -1},  # Condición del garaje
    'PoolQC': {'Ex': 3, 'Gd': 2, 'Fa': 1, 'Missing': -1},  # Calidad de la piscina
    'Fence': {'GdPrv': 4, 'MnPrv': 3, 'GdWo': 2, 'MnWw': 1, 'Missing': -1}  # Calidad de la cerca
}

for columna, diccionario in diccionario_asignacion.items():
    if columna in df.columns:
        df[columna] = df[columna].map(diccionario)

In [42]:
# Revisamos que las variables con orden tengan valores
df[variables_con_orden].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 18 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   LotShape      1460 non-null   int64
 1   LandSlope     1460 non-null   int64
 2   ExterQual     1460 non-null   int64
 3   ExterCond     1460 non-null   int64
 4   BsmtQual      1460 non-null   int64
 5   BsmtCond      1460 non-null   int64
 6   BsmtExposure  1460 non-null   int64
 7   BsmtFinType1  1460 non-null   int64
 8   BsmtFinType2  1460 non-null   int64
 9   HeatingQC     1460 non-null   int64
 10  KitchenQual   1460 non-null   int64
 11  Functional    1460 non-null   int64
 12  FireplaceQu   1460 non-null   int64
 13  GarageFinish  1460 non-null   int64
 14  GarageQual    1460 non-null   int64
 15  GarageCond    1460 non-null   int64
 16  PoolQC        1460 non-null   int64
 17  Fence         1460 non-null   int64
dtypes: int64(18)
memory usage: 205.4 KB


In [43]:
# Variables sin orden a las que aplicaremos un hot_encoder
variables_sin_orden = [
    'MSZoning',        # ['RL', 'RM', 'C (all)', 'FV', 'RH']
    'Street',          # ['Pave', 'Grvl']
    'Alley',           # ['Missing', 'Grvl', 'Pave']
    'LandContour',     # ['Lvl', 'Bnk', 'Low', 'HLS']
    'Utilities',       # ['AllPub', 'NoSeWa']
    'LotConfig',       # ['Inside', 'FR2', 'Corner', 'CulDSac', 'FR3']
    'Neighborhood',    # ['CollgCr', 'Veenker', ...] (Muchos valores nominales)
    'Condition1',      # ['Norm', 'Feedr', ...]
    'Condition2',      # ['Norm', 'Artery', ...]
    'BldgType',        # ['1Fam', '2fmCon', 'Duplex', 'TwnhsE', 'Twnhs']
    'HouseStyle',      # ['2Story', '1Story', ...]
    'RoofStyle',       # ['Gable', 'Hip', ...]
    'RoofMatl',        # ['CompShg', 'WdShngl', ...]
    'Exterior1st',     # ['VinylSd', 'MetalSd', ...]
    'Exterior2nd',     # ['VinylSd', 'MetalSd', ...]
    'MasVnrType',      # ['BrkFace', 'Missing', 'Stone', 'BrkCmn']
    'Foundation',      # ['PConc', 'CBlock', ...]
    'Heating',         # ['GasA', 'GasW', 'Grav', ...]
    'CentralAir',      # ['Y', 'N']
    'Electrical',      # ['SBrkr', 'FuseF', ...]
    'GarageType',      # ['Attchd', 'Detchd', ...]
    'PavedDrive',      # ['Y', 'N', 'P']
    'MiscFeature',     # ['Missing', 'Shed', ...]
    'SaleType',        # ['WD', 'New', ...]
    'SaleCondition'    # ['Normal', 'Abnorml', ...]
]


In [47]:
# Importamos Hot_Encoder
from sklearn.preprocessing import OneHotEncoder

In [45]:
# Es momento de aplicar el hot_encoder para lidiar con aquellas variables que no tienen un orden.
df[variables_sin_orden]

Unnamed: 0,MSZoning,Street,Alley,LandContour,Utilities,LotConfig,Neighborhood,Condition1,Condition2,BldgType,...,MasVnrType,Foundation,Heating,CentralAir,Electrical,GarageType,PavedDrive,MiscFeature,SaleType,SaleCondition
0,RL,Pave,Missing,Lvl,AllPub,Inside,CollgCr,Norm,Norm,1Fam,...,BrkFace,PConc,GasA,Y,SBrkr,Attchd,Y,Missing,WD,Normal
1,RL,Pave,Missing,Lvl,AllPub,FR2,Veenker,Feedr,Norm,1Fam,...,Missing,CBlock,GasA,Y,SBrkr,Attchd,Y,Missing,WD,Normal
2,RL,Pave,Missing,Lvl,AllPub,Inside,CollgCr,Norm,Norm,1Fam,...,BrkFace,PConc,GasA,Y,SBrkr,Attchd,Y,Missing,WD,Normal
3,RL,Pave,Missing,Lvl,AllPub,Corner,Crawfor,Norm,Norm,1Fam,...,Missing,BrkTil,GasA,Y,SBrkr,Detchd,Y,Missing,WD,Abnorml
4,RL,Pave,Missing,Lvl,AllPub,FR2,NoRidge,Norm,Norm,1Fam,...,BrkFace,PConc,GasA,Y,SBrkr,Attchd,Y,Missing,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,RL,Pave,Missing,Lvl,AllPub,Inside,Gilbert,Norm,Norm,1Fam,...,Missing,PConc,GasA,Y,SBrkr,Attchd,Y,Missing,WD,Normal
1456,RL,Pave,Missing,Lvl,AllPub,Inside,NWAmes,Norm,Norm,1Fam,...,Stone,CBlock,GasA,Y,SBrkr,Attchd,Y,Missing,WD,Normal
1457,RL,Pave,Missing,Lvl,AllPub,Inside,Crawfor,Norm,Norm,1Fam,...,Missing,Stone,GasA,Y,SBrkr,Attchd,Y,Shed,WD,Normal
1458,RL,Pave,Missing,Lvl,AllPub,Inside,NAmes,Norm,Norm,1Fam,...,Missing,CBlock,GasA,Y,FuseA,Attchd,Y,Missing,WD,Normal


In [49]:
# Usar OneHotEncoder para lidiar con variables sin orden
df = pd.get_dummies(df,columns=variables_sin_orden,drop_first=True)

In [66]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Columns: 204 entries, MSSubClass to SaleCondition_Partial
dtypes: bool(150), float64(3), int64(51)
memory usage: 829.9 KB


In [128]:
# Identificamos columnas con al menos un NaN
columnas_con_nans = df.columns[df.isnull().any()].tolist()
print("Columnas con NaNs:", columnas_con_nans)


Columnas con NaNs: []


In [127]:
# Lidiamos con cada columna de acuerdo con sus caracteristicas
# EN el caso de lotes con poco frente probablemente es que no tienen 
df["LotFrontage"] = df["LotFrontage"].fillna(0)

# En el caso de area de revestimiento de mampostería en pies cuadrados 
df["MasVnrArea"] = df["MasVnrArea"].fillna(0)

# Para el caso de la garaje, creamos una función para que nos calcule la edad

df["GarageYrBlt_age"] = 2024 - df["GarageYrBlt"] 
df["GarageYrBlt_age"] = df["GarageYrBlt_age"].fillna(-1)
df = df.drop(columns="GarageYrBlt")

In [129]:
df.columns

Index(['MSSubClass', 'LotFrontage', 'LotArea', 'LotShape', 'LandSlope',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea',
       ...
       'SaleType_ConLw', 'SaleType_New', 'SaleType_Oth', 'SaleType_WD',
       'SaleCondition_AdjLand', 'SaleCondition_Alloca', 'SaleCondition_Family',
       'SaleCondition_Normal', 'SaleCondition_Partial', 'GarageYrBlt_age'],
      dtype='object', length=204)

In [144]:
# Ahora trataremos a los booleanos
c_booleanos = df.select_dtypes(bool).columns.tolist()
df[c_booleanos] = df[c_booleanos].astype(int)
df

Unnamed: 0,MSSubClass,LotFrontage,LotArea,LotShape,LandSlope,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,GarageYrBlt_age
0,60,65.0,8450,0,0,7,5,2003,2003,196.0,...,0,0,0,1,0,0,0,1,0,21.0
1,20,80.0,9600,0,0,6,8,1976,1976,0.0,...,0,0,0,1,0,0,0,1,0,48.0
2,60,68.0,11250,1,0,7,5,2001,2002,162.0,...,0,0,0,1,0,0,0,1,0,23.0
3,70,60.0,9550,1,0,7,5,1915,1970,0.0,...,0,0,0,1,0,0,0,0,0,26.0
4,60,84.0,14260,1,0,8,5,2000,2000,350.0,...,0,0,0,1,0,0,0,1,0,24.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,62.0,7917,0,0,6,5,1999,2000,0.0,...,0,0,0,1,0,0,0,1,0,25.0
1456,20,85.0,13175,0,0,6,6,1978,1988,119.0,...,0,0,0,1,0,0,0,1,0,46.0
1457,70,66.0,9042,0,0,7,9,1941,2006,0.0,...,0,0,0,1,0,0,0,1,0,83.0
1458,20,68.0,9717,0,0,5,6,1950,1996,0.0,...,0,0,0,1,0,0,0,1,0,74.0


In [None]:
# NO SE USÓ
# Crear imputador para columnas numéricas (media)
imputer_num = SimpleImputer(strategy='mean')  # Cambia a 'median' o 'most_frequent' según necesites

# Aplicar a las columnas numéricas
df_numéricas = df.select_dtypes(include=['number'])
df_numéricas_imputadas = imputer_num.fit_transform(df_numéricas)

NameError: name 'SimpleImputer' is not defined

In [149]:
len(df.columns)

204

In [150]:
# Se procede a aplicar el PCA
pca_pipe = Pipeline([("scaler",StandardScaler()),("pca", PCA(n_components=len(df.columns)))])
pca_pipe.fit(df)

# Para acceder a un elemento del Pipe podemos hacerlo asi. Aprovechamos e instanciamos el modelo
modelo_pca = pca_pipe['pca']

In [151]:
# Podemos observar los pesos de cada PC
modelo_pca.components_

array([[-1.20011537e-02,  4.56189073e-02,  4.33376073e-02, ...,
        -5.91177335e-02,  1.17550410e-01, -1.29721915e-01],
       [-1.39742346e-01, -5.54446487e-03,  1.25210823e-01, ...,
         7.49970754e-02, -1.06693556e-01,  1.08874744e-01],
       [ 2.44938915e-02,  6.72388952e-02,  1.38874714e-01, ...,
         3.42734778e-02, -5.70424233e-02,  1.29353825e-01],
       ...,
       [-2.42922783e-16, -1.47936262e-17, -1.09334431e-16, ...,
        -1.38777878e-16, -2.77555756e-17,  5.55111512e-17],
       [-0.00000000e+00,  1.91336441e-16,  5.87606836e-17, ...,
         1.38777878e-17, -5.20417043e-17,  1.31838984e-16],
       [-0.00000000e+00,  1.35530721e-16, -2.84577656e-17, ...,
        -1.38777878e-17,  3.05311332e-16, -7.63278329e-17]])

In [152]:
# Con esto se obtiene cuanto de la varianza explicada por cada PC. NOTA: Las primeras siempre explican mas. 
modelo_pca.explained_variance_ratio_

array([7.38263057e-02, 3.34554843e-02, 2.74193463e-02, 2.30528333e-02,
       1.98832839e-02, 1.92393236e-02, 1.72722506e-02, 1.56148366e-02,
       1.48273587e-02, 1.35315717e-02, 1.32310795e-02, 1.30834149e-02,
       1.24643200e-02, 1.18498537e-02, 1.16389505e-02, 1.12178262e-02,
       1.10256034e-02, 1.06765034e-02, 1.04047331e-02, 1.03550050e-02,
       9.94815707e-03, 9.88271268e-03, 9.74437163e-03, 9.39699573e-03,
       9.34356552e-03, 9.12188556e-03, 9.05788700e-03, 8.78431660e-03,
       8.56425197e-03, 8.42229061e-03, 8.32434654e-03, 8.25377842e-03,
       8.05442029e-03, 7.76257113e-03, 7.70876473e-03, 7.61458797e-03,
       7.43727570e-03, 7.40531559e-03, 7.23448201e-03, 7.10316997e-03,
       7.03866860e-03, 6.92819283e-03, 6.75686736e-03, 6.72214886e-03,
       6.68309150e-03, 6.59433596e-03, 6.53185661e-03, 6.43958903e-03,
       6.39540658e-03, 6.29610263e-03, 6.25827273e-03, 6.22539616e-03,
       6.03304795e-03, 6.01412008e-03, 5.86269293e-03, 5.78534214e-03,
      

In [None]:
# Vemos a partir de que numeros de variables obtenemos un resultado de aprox 85% (0.85113056)
modelo_pca.explained_variance_ratio_.cumsum()

array([0.07382631, 0.10728179, 0.13470114, 0.15775397, 0.17763725,
       0.19687658, 0.21414883, 0.22976366, 0.24459102, 0.25812259,
       0.27135367, 0.28443709, 0.29690141, 0.30875126, 0.32039021,
       0.33160804, 0.34263364, 0.35331015, 0.36371488, 0.37406988,
       0.38401804, 0.39390075, 0.40364513, 0.41304212, 0.42238569,
       0.43150757, 0.44056546, 0.44934978, 0.45791403, 0.46633632,
       0.47466067, 0.48291444, 0.49096886, 0.49873144, 0.5064402 ,
       0.51405479, 0.52149206, 0.52889738, 0.53613186, 0.54323503,
       0.5502737 , 0.55720189, 0.56395876, 0.57068091, 0.577364  ,
       0.58395834, 0.59049019, 0.59692978, 0.60332519, 0.60962129,
       0.61587956, 0.62210496, 0.62813801, 0.63415213, 0.64001482,
       0.64580016, 0.65154811, 0.65720793, 0.66280675, 0.66837427,
       0.67392955, 0.67938843, 0.68483275, 0.6901912 , 0.69552919,
       0.70076157, 0.70597473, 0.71117167, 0.71625408, 0.72131984,
       0.72637961, 0.73139536, 0.73634745, 0.74126295, 0.74605

In [None]:
# Buscamos el valor que se acerqué a 85% y determinamos usar 100 columnas
chorizo = modelo_pca.explained_variance_ratio_.cumsum().tolist()
chorizo.index(0.851130563199729)

99

In [161]:
# Se procede a aplicar el PCA nuevamente pero con 100
pca_pipe = Pipeline([("scaler",StandardScaler()),("pca", PCA(n_components=100))])
pca_pipe.fit(df)

# Para acceder a un elemento del Pipe podemos hacerlo asi. Aprovechamos e instanciamos el modelo
modelo_pca = pca_pipe['pca']

In [165]:
proyecciones = np.dot(modelo_pca.components_, scale(df).T)
proyecciones = pd.DataFrame(proyecciones)
proyecciones = proyecciones.transpose().set_index(df.index)
proyecciones.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,3.246542,-1.592695,-0.830636,2.297805,1.633338,-0.842666,0.803569,-0.117186,1.206722,1.807368,...,-0.098993,-0.549076,-0.082358,0.745573,-0.457661,-0.30539,-0.682315,-0.148333,0.038672,-0.004168
1,-0.286539,2.92135,-0.536414,0.195674,-0.960511,0.448599,0.644978,-0.449636,-1.773699,1.434375,...,0.911819,1.466365,1.649932,0.341579,1.551343,-0.609995,-1.255288,0.766118,-0.160654,-1.127396
2,3.80116,-1.153988,-0.393338,1.965547,1.330308,-0.666818,1.335075,-0.05807,1.335892,1.281295,...,-0.036467,-0.341769,0.026748,-0.132277,-0.629282,-0.272687,-0.65564,0.544629,-0.082319,-0.135878
3,-0.926738,-0.729809,2.385649,0.460244,-1.882852,0.130015,1.406168,-0.553441,0.668019,-0.914306,...,-0.92427,0.122631,0.241845,0.402908,0.207929,0.036921,-1.549798,-0.973349,0.19976,0.165167
4,5.66465,-0.710225,2.139966,2.216816,1.788534,-0.487662,0.524954,-0.676847,1.549722,2.388146,...,-0.354551,1.166106,-0.348956,-0.61654,-0.221772,-0.322551,0.952489,0.485753,0.388614,0.822162


In [None]:
reconstruccion = pca_pipe.inverse_transform(proyecciones)
reconstruccion = pd.DataFrame(
    reconstruccion,
    columns = df.columns,
    index = df.index
)

print('------------------')
print('Valores reconstruidos')
print('------------------')
display(reconstruccion.head())

print('---------------------')
print('Valores originales')
print('---------------------')
display(df.head())

------------------
Valores reconstruidos
------------------


Unnamed: 0,MSSubClass,LotFrontage,LotArea,LotShape,LandSlope,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,GarageYrBlt_age
0,63.703654,67.923547,6266.699775,0.133842,0.016896,6.912789,5.28628,2002.273202,2003.859716,214.694054,...,0.004659,-0.007263,-0.003161,1.016774,0.008469,0.016289,-0.016378,0.977738,-0.007872,22.575303
1,44.448711,70.855028,14273.203972,0.638447,-0.075565,5.960475,7.606648,1964.767939,1987.095825,4.741662,...,0.030387,-0.076598,-0.005116,0.974955,-0.003961,0.006499,0.032748,1.081765,-0.060215,52.410437
2,62.51425,61.147217,10028.356096,0.502911,0.025972,7.225365,5.12432,2002.723045,2002.263375,208.807023,...,0.00765,-0.006688,0.004695,1.032269,0.005776,-0.013908,-0.020006,1.025019,-0.009388,25.240521
3,63.219085,53.158905,10276.717676,0.515758,0.127685,6.532576,5.77778,1939.767994,1973.658636,59.987637,...,-0.011677,0.106042,-0.001482,0.844855,-0.005893,-0.004863,0.06116,0.667607,0.112059,73.507562
4,57.243542,75.014806,14785.440274,0.662444,0.026057,7.562934,4.994461,1994.964298,1996.68153,445.812233,...,-0.00851,-0.016299,0.004758,1.036629,0.013914,-0.020543,-0.012955,0.957414,-0.008962,27.587084


---------------------
Valores originales
---------------------


Unnamed: 0,MSSubClass,LotFrontage,LotArea,LotShape,LandSlope,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,GarageYrBlt_age
0,60,65.0,8450,0,0,7,5,2003,2003,196.0,...,0,0,0,1,0,0,0,1,0,21.0
1,20,80.0,9600,0,0,6,8,1976,1976,0.0,...,0,0,0,1,0,0,0,1,0,48.0
2,60,68.0,11250,1,0,7,5,2001,2002,162.0,...,0,0,0,1,0,0,0,1,0,23.0
3,70,60.0,9550,1,0,7,5,1915,1970,0.0,...,0,0,0,1,0,0,0,0,0,26.0
4,60,84.0,14260,1,0,8,5,2000,2000,350.0,...,0,0,0,1,0,0,0,1,0,24.0
