In [14]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Read the training CSV and one-hot encode it in one step; pd.get_dummies
# leaves numeric columns alone and expands the categorical (string-typed)
# ones into indicator columns.
data_path = 'input/home-data-for-ml-course/train.csv'
home_data = pd.get_dummies(pd.read_csv(data_path))

# The encoded column index is kept as the pool of columns that the
# feature-selection loop iterates over.
original_features = home_data.columns

# Sanity report: frame dimensions, column count, and the first few rows.
print(f"Original features shape: {home_data.shape}")
print(f"Number of features: {len(original_features)}")
print(home_data.head())

Original features shape: (1460, 289)
Number of features: 289
   Id  MSSubClass  LotFrontage  LotArea  OverallQual  OverallCond  YearBuilt  \
0   1          60         65.0     8450            7            5       2003   
1   2          20         80.0     9600            6            8       1976   
2   3          60         68.0    11250            7            5       2001   
3   4          70         60.0     9550            7            5       1915   
4   5          60         84.0    14260            8            5       2000   

   YearRemodAdd  MasVnrArea  BsmtFinSF1  ...  SaleType_ConLw  SaleType_New  \
0          2003       196.0         706  ...           False         False   
1          1976         0.0         978  ...           False         False   
2          2002       162.0         486  ...           False         False   
3          1970         0.0         216  ...           False         False   
4          2000       350.0         655  ...           False        

In [13]:

# Greedy forward feature selection scored by validation MAE.
#
# Candidates are visited once, in column order. Each candidate is added to
# the current feature set on trial, the forest is refit on the training
# split, and the candidate is kept only if validation MAE falls by more than
# MIN_RELATIVE_GAIN relative to the best MAE seen so far. The search stops
# early once MAE reaches TARGET_MAE.
#
# Results are left in:
#   selected_features - list of accepted column names
#   previous_mae      - best validation MAE reached (baseline if none accepted)
#   home_model        - forest fitted on `selected_features` (when non-empty)

# Columns that must never be offered as predictors: the target itself
# (leakage) and `Id`, which looks like a plain row identifier in the
# head() output - NOTE(review): confirm `Id` carries no signal.
EXCLUDED_COLUMNS = {'SalePrice', 'Id'}
MIN_RELATIVE_GAIN = 0.02  # a feature must cut MAE by more than 2% to be kept
TARGET_MAE = 10000        # stop searching once validation MAE is this low

selected_features = []
y = home_data.SalePrice # Sale prices
home_model = RandomForestRegressor(random_state=1) # Current ML model

# The split depends only on the row count and random_state, so compute it
# once and reuse the resulting row labels for every candidate.
train_y, val_y = train_test_split(y, random_state=1)

# Baseline to beat: always predicting the mean training price. Starting from
# a real error value (rather than an arbitrary constant) keeps the relative
# improvement test meaningful for the very first candidate.
previous_mae = mean_absolute_error(val_y, np.full(len(val_y), train_y.mean()))
print(f"Baseline MAE: {previous_mae}")

candidate_features = [col for col in original_features if col not in EXCLUDED_COLUMNS]
for feature in candidate_features:
    # Build the trial set without mutating selected_features, so a rejected
    # candidate needs no clean-up.
    trial_features = selected_features + [feature]
    home_model.fit(home_data.loc[train_y.index, trial_features], train_y)

    predictions = home_model.predict(home_data.loc[val_y.index, trial_features])
    mae = mean_absolute_error(val_y, predictions)
    print(f"MAE: {mae}")

    # Reject anything that does not *reduce* MAE by more than the threshold;
    # an increase in error is never an improvement.
    if mae >= previous_mae * (1 - MIN_RELATIVE_GAIN):
        continue

    selected_features = trial_features
    previous_mae = mae
    print(f"New MAE: {mae}")
    if mae <= TARGET_MAE:
        break
else:
    # Loop ran to completion (no early stop): the last fit may have been on a
    # rejected candidate, so refit to leave home_model matching the selection.
    if selected_features:
        home_model.fit(home_data.loc[train_y.index, selected_features], train_y)

KeyboardInterrupt: 