In [48]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

data_path = 'input/home-data-for-ml-course/train.csv'
home_data = pd.read_csv(data_path)
home_data = home_data.loc[:, home_data.columns != 'Id']
home_data = pd.get_dummies(home_data)

original_features = home_data.loc[:, home_data.columns != 'SalePrice']
print(f"Original features shape: {home_data.shape}")
print(f"Number of features: {len(original_features)}")
print(home_data.head())

Original features shape: (1460, 288)
Number of features: 1460
   MSSubClass  LotFrontage  LotArea  OverallQual  OverallCond  YearBuilt  \
0          60         65.0     8450            7            5       2003   
1          20         80.0     9600            6            8       1976   
2          60         68.0    11250            7            5       2001   
3          70         60.0     9550            7            5       1915   
4          60         84.0    14260            8            5       2000   

   YearRemodAdd  MasVnrArea  BsmtFinSF1  BsmtFinSF2  ...  SaleType_ConLw  \
0          2003       196.0         706           0  ...           False   
1          1976         0.0         978           0  ...           False   
2          2002       162.0         486           0  ...           False   
3          1970         0.0         216           0  ...           False   
4          2000       350.0         655           0  ...           False   

   SaleType_New  SaleTyp

In [52]:
selected_features = []
y = home_data.SalePrice # Sale prices
previous_mae = 0

def feature_selection(features, feature_train, feature_val, output_train, output_val):
    model = RandomForestRegressor(random_state=1) # Current ML model
    model.fit(feature_train, output_train) # Train the model with the training data

    predictions = model.predict(feature_val) # Make the predictions
    mae = mean_absolute_error(output_val, predictions) # Calculate the Mean absolute error

    print(f"MAE for {feature}: {mae}") # Check current feature tested

    return mae

for feature in original_features:
    if feature not in selected_features: # We append a new feature to a list to test the new MAE
        selected_features.append(feature)

        X = home_data[selected_features]
        train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1) # Create training data and validation data

        mae = feature_selection(selected_features, train_X, val_X, train_y, val_y)
        if not previous_mae: previous_mae = mae

        if min(mae, previous_mae) == mae:
            print(f"New MAE: {mae}")
            previous_mae = mae
        else:
            selected_features.remove(feature)

# Write the important features to a file:
home_data_1 = home_data[selected_features].dropna()
home_data_1.to_csv("output/one-to-one.csv", index=False)
with open("output/Best_MAE_1.txt", "w") as file:
    file.write(f"The current best MAE is : {str(previous_mae)}")

MAE for MSSubClass: 48346.43988419981
New MAE: 48346.43988419981
MAE for LotFrontage: 47393.61977376279
New MAE: 47393.61977376279
MAE for LotArea: 44334.28285513973
New MAE: 44334.28285513973
MAE for OverallQual: 27358.171082362383
New MAE: 27358.171082362383
MAE for OverallCond: 26901.762810672284
New MAE: 26901.762810672284
MAE for YearBuilt: 24978.787902804954
New MAE: 24978.787902804954
MAE for YearRemodAdd: 24657.612986083932
New MAE: 24657.612986083932
MAE for MasVnrArea: 23422.985160273965
New MAE: 23422.985160273965
MAE for BsmtFinSF1: 21731.020533789953
New MAE: 21731.020533789953
MAE for BsmtFinSF2: 21530.836876255707
New MAE: 21530.836876255707
MAE for BsmtUnfSF: 21313.763767123288
New MAE: 21313.763767123288
MAE for TotalBsmtSF: 20641.59981917808
New MAE: 20641.59981917808
MAE for 1stFlrSF: 19725.411223744293
New MAE: 19725.411223744293
MAE for 2ndFlrSF: 18117.665178082192
New MAE: 18117.665178082192
MAE for LowQualFinSF: 18260.917817351597
MAE for GrLivArea: 17092.9071872