# House Prices - Advanced Regression Techniques
(Preços de casas - Técnicas Avançadas de Regressão)

<img src="https://storage.googleapis.com/kaggle-media/competitions/House%20Prices/kaggle_5407_media_housesbanner.png" width=800>


- Vamos utilizar o [dataset disponível no Kaggle](https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques/overview)
    - É um dataset de **competição**
    - Temos uma base com a **descrição de cada uma das colunas (data_description.txt)**
- **Etapas do Projeto:**
    - O projeto será realizado aplicando submissões no kaggle a medida que o tratamento dos dados forem tomando forma, isso possibilita verificar o desempenho dos modelos preditivos a modificações específicas neste tratamento.

In [50]:
# Importando o pandas
import pandas as pd

In [51]:
# Importando os datasets
base_train = pd.read_csv('train_2.csv')
base_test = pd.read_csv('test_2.csv')

## Tratar outliers

In [52]:
# Filtrar apenas colunas numéricas
base_train_num = base_train.select_dtypes(exclude=object)
base_train_num = base_train_num.drop('SalePrice',axis=1)
base_train_num

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,548,0,61,0,0,0,0,0,2,2008
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,460,298,0,0,0,0,0,0,5,2007
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,608,0,42,0,0,0,0,0,9,2008
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,642,0,35,272,0,0,0,0,2,2006
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,836,192,84,0,0,0,0,0,12,2008
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,62.0,7917,6,5,1999,2000,0.0,0,...,460,0,40,0,0,0,0,0,8,2007
1456,1457,20,85.0,13175,6,6,1978,1988,119.0,790,...,500,349,0,0,0,0,0,0,2,2010
1457,1458,70,66.0,9042,7,9,1941,2006,0.0,275,...,252,0,60,0,0,0,0,2500,5,2010
1458,1459,20,68.0,9717,5,6,1950,1996,0.0,49,...,240,366,0,112,0,0,0,0,4,2010


### Tratar outliers
**Como a base foi tratada, sabemos que os outliers restantes não são erros da base, devem ser considerados até certo ponto**

In [53]:
# Função para extrair colunas com outliers
# Calcular Q1 (25º percentil) e Q3 (75º percentil)
Q1 = base_train_num.quantile(0.25)
Q3 = base_train_num.quantile(0.75)

# Calcular o interquartil
IQR = Q3 - Q1

# Definir limites inferior e superior
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR


def outlier_columns(df):

    # Identificar colunas com outliers
    outlier_columns = []
    for col in df.columns:
        is_outlier = ((df[col] < lower_bound[col]) | (df[col] > upper_bound[col]))
        if is_outlier.any():
            outlier_columns.append(col)
    return outlier_columns

outlier_cols = outlier_columns(base_train_num)

print("Colunas com outliers:", outlier_cols)

Colunas com outliers: ['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal']


In [54]:
base_train[outlier_cols].describe()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,...,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal
count,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,56.89726,69.863699,10516.828082,6.099315,5.575342,1971.267808,103.117123,443.639726,46.549315,567.240411,...,1868.684247,1.767123,472.980137,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041
std,42.300571,22.027677,9981.264932,1.382997,1.112799,30.202904,180.731373,456.098091,161.319273,441.866955,...,453.925967,0.747315,213.804841,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024
min,20.0,21.0,1300.0,1.0,1.0,1872.0,0.0,0.0,0.0,0.0,...,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,20.0,60.0,7553.5,5.0,5.0,1954.0,0.0,0.0,0.0,223.0,...,1958.0,1.0,334.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,50.0,69.0,9478.5,6.0,5.0,1973.0,0.0,383.5,0.0,477.5,...,1977.0,2.0,480.0,0.0,25.0,0.0,0.0,0.0,0.0,0.0
75%,70.0,79.0,11601.5,7.0,6.0,2000.0,164.25,712.25,0.0,808.0,...,2001.0,2.0,576.0,168.0,68.0,0.0,0.0,0.0,0.0,0.0
max,190.0,313.0,215245.0,10.0,9.0,2010.0,1600.0,5644.0,1474.0,2336.0,...,2010.0,4.0,1418.0,857.0,547.0,552.0,508.0,480.0,738.0,15500.0


**Dessas colunas ainda serão filtradas as que possuem muitos valores repetidos, ex: percentil 50 = 0**

In [55]:
outlier_colsZ = base_train_num[base_train[outlier_cols].columns[base_train[outlier_cols].quantile(0.5)==0]].columns
outlier_cols2 = base_train[outlier_cols].columns.drop(outlier_colsZ)
print(outlier_colsZ)
print(outlier_cols2)

Index(['MasVnrArea', 'BsmtFinSF2', '2ndFlrSF', 'LowQualFinSF', 'BsmtFullBath',
       'BsmtHalfBath', 'WoodDeckSF', 'EnclosedPorch', '3SsnPorch',
       'ScreenPorch', 'PoolArea', 'MiscVal'],
      dtype='object')
Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'BsmtFinSF1', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF',
       'GrLivArea', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
       'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'OpenPorchSF'],
      dtype='object')


In [56]:
# Criar coluna target para cada coluna de outlier
def target_outlier_columns(df):
    # Adicionar colunas de outliers
    for col in df.columns:
        if col in outlier_cols2:
            outlier_column_name = f"target_{col}"
            df.loc[:, outlier_column_name] = df[col].apply(lambda x: 1 if ((x < lower_bound[col]) | (x > upper_bound[col])) else 0)
        elif col in outlier_colsZ:
            outlier_column_name = f"target_{col}"
            s_zeros = df[col][df[col] > 0]
            df.loc[:, outlier_column_name] = df[col].apply(lambda x: 1 if ((x < (s_zeros.quantile(0.25) - 1.5 * (s_zeros.quantile(0.75) - s_zeros.quantile(0.25)))) | (x > (s_zeros.quantile(0.75) + 1.5 * (s_zeros.quantile(0.75) - s_zeros.quantile(0.25))))) else 0)
        else:
            pass
    return df

# Aplicar a função e criar as colunas target
base_train_target = target_outlier_columns(base_train.loc[:, outlier_cols])

base_train_target

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,...,target_GarageYrBlt,target_GarageCars,target_GarageArea,target_WoodDeckSF,target_OpenPorchSF,target_EnclosedPorch,target_3SsnPorch,target_ScreenPorch,target_PoolArea,target_MiscVal
0,60,65.0,8450,7,5,2003,196.0,706,0,150,...,0,0,0,0,0,0,1,1,1,0
1,20,80.0,9600,6,8,1976,0.0,978,0,284,...,0,0,0,0,0,0,1,1,1,0
2,60,68.0,11250,7,5,2001,162.0,486,0,434,...,0,0,0,0,0,0,1,1,1,0
3,70,60.0,9550,7,5,1915,0.0,216,0,540,...,0,0,0,0,0,0,1,1,1,0
4,60,84.0,14260,8,5,2000,350.0,655,0,490,...,0,0,0,0,0,0,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,62.0,7917,6,5,1999,0.0,0,0,953,...,0,0,0,0,0,0,1,1,1,0
1456,20,85.0,13175,6,6,1978,119.0,790,163,589,...,0,0,0,0,0,0,1,1,1,0
1457,70,66.0,9042,7,9,1941,0.0,275,0,877,...,0,0,0,0,0,0,1,1,1,1
1458,20,68.0,9717,5,6,1950,0.0,49,1029,0,...,0,0,0,0,0,0,1,1,1,0


In [57]:
# Colunas adicionadas
df_with_outliers.columns.drop(outlier_cols)

KeyError: "['MSSubClass', 'OverallQual', 'YearBuilt', 'BsmtUnfSF', '2ndFlrSF', 'BsmtFullBath', 'Fireplaces', 'GarageCars'] not found in axis"

In [None]:
len(df_with_outliers.columns.drop(outlier_cols))

## Realizar mesmo procedimento na base de teste

In [58]:
# Utilizar mesmas colunas que foram criadas as tagets
base_test = base_test[outlier_cols]

In [59]:
# Criar coluna target para cada coluna de outlier
def target_outlier_columns(df):
    # Adicionar colunas de outliers
    for col in df.columns:
        if col in outlier_cols2:
            outlier_column_name = f"target_{col}"
            df.loc[:, outlier_column_name] = df[col].apply(lambda x: 1 if ((x < lower_bound[col]) | (x > upper_bound[col])) else 0)
        elif col in outlier_colsZ:
            outlier_column_name = f"target_{col}"
            s_zeros = df[col][df[col] > 0]
            df.loc[:, outlier_column_name] = df[col].apply(lambda x: 1 if ((x < (s_zeros.quantile(0.25) - 3 * (s_zeros.quantile(0.75) - s_zeros.quantile(0.25)))) | (x > (s_zeros.quantile(0.75) + 3 * (s_zeros.quantile(0.75) - s_zeros.quantile(0.25))))) else 0)
        else:
            pass
    return df

# Aplicar a função e criar as colunas target
base_test_target = target_outlier_columns(base_test.loc[:, outlier_cols])

base_test_target

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,...,target_GarageYrBlt,target_GarageCars,target_GarageArea,target_WoodDeckSF,target_OpenPorchSF,target_EnclosedPorch,target_3SsnPorch,target_ScreenPorch,target_PoolArea,target_MiscVal
0,20,80.0,11622,5,6,1961,-1.0,468.0,144.0,270.0,...,0,0,0,0,0,0,0,0,0,0
1,20,81.0,14267,6,6,1958,108.0,923.0,0.0,406.0,...,0,0,0,0,0,0,0,0,0,1
2,60,74.0,13830,5,5,1997,-1.0,791.0,0.0,137.0,...,0,0,0,0,0,0,0,0,0,0
3,60,78.0,9978,6,6,1998,20.0,602.0,0.0,324.0,...,0,0,0,0,0,0,0,0,0,0
4,120,43.0,5005,8,5,1992,-1.0,263.0,0.0,1017.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,160,21.0,1936,4,7,1970,-1.0,0.0,0.0,546.0,...,1,0,0,0,0,0,0,0,0,0
1455,160,21.0,1894,4,5,1970,-1.0,252.0,0.0,294.0,...,0,0,0,0,0,0,0,0,0,0
1456,20,160.0,20000,5,7,1960,-1.0,1224.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1457,85,62.0,10441,5,5,1992,-1.0,337.0,0.0,575.0,...,1,0,0,0,0,0,0,0,0,0


### Exportar 

In [60]:
# Adicionar novamente a coluna target
base_train_target['SalePrice'] = base_train['SalePrice']

In [61]:
# Exportando a base de treino e teste
base_train_target.to_csv('train_3.csv',index=False)
base_test_target.to_csv('test_3.csv',index=False)