In [28]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.inspection import permutation_importance

In [19]:
# Location of the raw housing CSV, relative to the notebook's working directory.
file = "../data/preparation/ML_Houses_dataset.csv"

# Read the whole file into a DataFrame and preview the first rows.
data = pd.read_csv(filepath_or_buffer=file)
data.head(n=5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [20]:
# List every column available in the raw frame before choosing a subset.
data.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'Pesos', 'WallMat', 'RoofSurface',
       'ChimneyStyle', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'G

In [21]:
# Columns retained for the experiment, grouped by how they are handled in
# this notebook: numeric columns (including the SalePrice target source)
# first, then the two string-valued columns that are one-hot encoded.
numeric_cols = [
    'GrLivArea',
    'BedroomAbvGr',
    'KitchenAbvGr',
    'OverallCond',
    'SalePrice',
]
categorical_cols = ['Street', 'Alley']
cols_keep = numeric_cols + categorical_cols

# Restrict the frame to the selected columns (rebinds `data`, so this cell
# expects the freshly loaded frame as input).
data = data[cols_keep]

In [22]:
# Preview the frame after restricting it to the columns in cols_keep.
data.head()

Unnamed: 0,GrLivArea,BedroomAbvGr,KitchenAbvGr,OverallCond,SalePrice,Street,Alley
0,1710,3,1,5,208500,Pave,
1,1262,3,1,8,181500,Pave,
2,1786,3,1,5,223500,Pave,
3,1717,3,1,5,140000,Pave,
4,2198,4,1,5,250000,Pave,


In [23]:
# --- Cleaning -------------------------------------------------------------
# Remove exact duplicate rows, then give missing Alley values an explicit
# category so the encoder sees a regular label instead of NaN.
data = data.drop_duplicates()
data["Alley"] = data["Alley"].replace(np.nan, "NoAlley")

# --- Numeric scaling --------------------------------------------------------
# NOTE(review): the scaler is fit on every row here, before the
# cross-validation cells split the data into folds; consider moving the
# preprocessing into a Pipeline if strict train/validation isolation matters.
rb_scaler = RobustScaler()
data["GrLivArea"] = rb_scaler.fit_transform(data[["GrLivArea"]])

# --- Categorical encoding ---------------------------------------------------
# Dense output is fine here; sparse_output=True is the better choice when
# there are many categories.
ohe = OneHotEncoder(sparse_output=False)
alley_encoded = ohe.fit_transform(data[["Alley"]])
data[ohe.get_feature_names_out()] = alley_encoded

# drop="if_binary" keeps a single indicator column for a two-valued feature
# instead of two redundant ones.
ohe_binary = OneHotEncoder(sparse_output=False, drop="if_binary")
street_encoded = ohe_binary.fit_transform(data[["Street"]])
data[ohe_binary.get_feature_names_out()] = street_encoded

# The original string columns are no longer needed once encoded.
data = data.drop(columns=["Alley", "Street"])

# --- Binary target ----------------------------------------------------------
# Split prices at the mean: (min-1, mean] -> 'cheap', (mean, max+1] ->
# 'expensive'. The +/-1 padding keeps the extreme prices inside the bins.
sale_price = data["SalePrice"]
price_bins = [sale_price.min() - 1, sale_price.mean(), sale_price.max() + 1]
data["SalePriceBinary"] = pd.cut(
    x=sale_price,
    bins=price_bins,
    labels=['cheap', 'expensive'],
)

data.head()

Unnamed: 0,GrLivArea,BedroomAbvGr,KitchenAbvGr,OverallCond,SalePrice,Alley_Grvl,Alley_NoAlley,Alley_Pave,Street_Pave,SalePriceBinary
0,0.379045,3,1,5,208500,0.0,1.0,0.0,1.0,expensive
1,-0.311248,3,1,8,181500,0.0,1.0,0.0,1.0,expensive
2,0.496148,3,1,5,223500,0.0,1.0,0.0,1.0,expensive
3,0.389831,3,1,5,140000,0.0,1.0,0.0,1.0,cheap
4,1.130971,4,1,5,250000,0.0,1.0,0.0,1.0,expensive


In [24]:
# Feature matrix: every column except the raw price and the label derived
# from it.
X = data.drop(columns=['SalePrice', 'SalePriceBinary'])

# Target vector: turn the 'cheap' / 'expensive' labels into integer codes.
target_encoder = LabelEncoder()
y = target_encoder.fit_transform(data['SalePriceBinary'])

# Rescale the remaining integer-valued features with a min-max scaler.
count_cols = ["BedroomAbvGr", "KitchenAbvGr", "OverallCond"]
minmax_scaler = MinMaxScaler()
X[count_cols] = minmax_scaler.fit_transform(X[count_cols])


In [25]:
# Logistic regression allowed up to 1000 solver iterations.
log_reg = LogisticRegression(max_iter=1000)

# 10-fold cross-validation on all features; display the average fold score.
scores = cross_val_score(estimator=log_reg, X=X, y=y, cv=10)
np.mean(scores)

np.float64(0.8297354747283892)

Feature permutation

In [None]:
# Without permutation: reference score of a default logistic regression on
# all features, averaged over 5 cross-validation folds.
log_model = LogisticRegression()

baseline_scores = cross_val_score(log_model, X, y, cv=5)
baseline_scores.mean()

np.float64(0.8290778138680978)

In [29]:
# Permutation importance needs an already-fitted estimator.
# NOTE(review): the model is fitted and evaluated on the same rows, so the
# scores below describe what the fitted model relies on, not held-out
# generalisation.
log_model = LogisticRegression().fit(X, y)

# Shuffle each feature 10 times and record how much the model's score drops.
# A fixed random_state makes the shuffles, and therefore the ranking,
# reproducible across kernel restarts.
permutation_score = permutation_importance(
    log_model, X, y, n_repeats=10, random_state=42
)

# Build the table from a dict so 'score decrease' keeps a float dtype
# (stacking the names and the scores into one array would coerce both columns
# to object).
importance_df = pd.DataFrame({
    'feature': X.columns,
    'score decrease': permutation_score.importances_mean,
})

# Show the important features, largest score drop first.
importance_df.sort_values(by="score decrease", ascending=False)

Unnamed: 0,feature,score decrease
0,GrLivArea,0.300343
1,BedroomAbvGr,0.02464
2,KitchenAbvGr,0.013795
5,Alley_NoAlley,0.009403
4,Alley_Grvl,0.004804
3,OverallCond,0.004187
7,Street_Pave,0.000686
6,Alley_Pave,-0.00048


In [35]:
# Keep only the three features with the largest permutation-importance scores.
top_feature_names = ["GrLivArea", "BedroomAbvGr", "KitchenAbvGr"]
strongest_features = X[top_feature_names]

# Fresh, unfitted logistic regression with default settings.
log_reg = LogisticRegression()

# Mean score across 10 cross-validation folds on the reduced feature set.
reduced_scores = cross_val_score(log_reg, strongest_features, y, cv=10)
reduced_scores.mean()

np.float64(0.8249173358526216)