In [1]:
#@title Default title text
TRAIN_URL = 'https://raw.githubusercontent.com/JamesBarciz/ML_algorithm_exploration/master/linear_regression/cleaned_datasets/train_cleaned.csv'
TEST_URL = 'https://raw.githubusercontent.com/JamesBarciz/ML_algorithm_exploration/master/linear_regression/test.csv'

import pandas as pd
from sklearn.model_selection import train_test_split


class Train:
    '''
    Class Train contains methods to perform data cleaning of a particular
    DataFrame for the Ames Housing data set.
    '''
    def __init__(self, df: pd.DataFrame):
        self.df = df
        self.cleaned = self.df.copy()

    def show_stats(self, series: str):
        """Print stats from a Pandas Series"""

            # Describe
        descriptive_stats = self.cleaned[series].describe()
        print('Describe Method:')
        print(descriptive_stats)
        print('---------------------------------------------------')
        
        # Numeric Columns only
        if self.cleaned[series].dtype == 'int64':
            mean = self.cleaned[series].mean()
            maximum = max(self.cleaned[series])
            minimum = min(self.cleaned[series])
            print(f'''
The Min/Max of column {self.cleaned[series].name}: ({minimum}, {maximum})
        
The maximum is {maximum - mean} away from the mean
The minimum is {mean - minimum} away from the mean
            ''')
            print('---------------------------------------------------')
        elif self.cleaned[series].dtype == 'O':
            mode = self.cleaned[series].mode()
            print(f'The most freqent class is: {mode[0]}')
            print('---------------------------------------------------')
            
            print(f'Value Counts for column: {self.cleaned[series].name}')
            print(self.cleaned[series].value_counts())
            print('---------------------------------------------------')
        
        # Number of NaN values
        nans = self.cleaned[series].isna().sum()
        nans_to_percent = nans/len(self.cleaned[series] * 100)
        print(f'Number of NaNs: {nans}')
        print(f'Percent of Null Values for the column: {nans_to_percent}%')
        print('---------------------------------------------------')
        
        # Number of Unique Values - Only display if there are less than 20 unique
        unique = self.cleaned[series].unique()
        print(f'There are {len(unique)} values')
        
        if len(unique) <= 20:
            print('Unique Values:')
            print(unique)
            print('---------------------------------------------------')
        else:
            print('Warning: High Cardinality')
            print('---------------------------------------------------')

    def train_val_test_split(self, X, y, random_state, test_size=0.20, val_size=0.25):
        '''
        Performs two train_test_splits on the dataset returning X and y for
        train, validation and test sets.
        Parameters:
            - X: Feature data as a Pandas.DataFrame object
            - y: Target column data as a Pandas.Series object
            - random_state: Int value applied to the random_state parameter
                            of both train_test_split calls.
            - test_size (default=0.20): Percentage (0-1) representing a
                        proportion of the (X, y) data PRIOR to first split.
            - val_size (default=0.25): Percentage (0-1) representing a
                        proportion of the (X, y) data POST first split.
        
        Returns (in order):
            - X_train, y_train, X_val, y_val, X_test, y_test
        '''

        X_remain, X_test, y_remain, y_test = train_test_split(X, y, test_size=test_size,
                                                              random_state=random_state)

        X_train, X_val, y_train, y_val = train_test_split(X_remain, y_remain, test_size=val_size,
                                                          random_state=random_state)

        return X_train, y_train, X_val, y_val, X_test, y_test

    def make_conditions(self):

        # 1. LotArea > 95-percentile (17401.15 ftsq)
        condition_LotArea = (self.cleaned['LotArea'] <= 17401.15)

        # 2. TotalBsmtSF > 1753
        condition_TotalBsmtSF = (self.cleaned['TotalBsmtSF'] <= 1753)

        # 3. 1stFlrSF > 1831.25
        condition_1stFlrSF = (self.cleaned['1stFlrSF'] <= 1831.25)

        # 4. GrLivArea > 2466.1
        condition_GrLivArea = (self.cleaned['GrLivArea'] <= 2466.1)

        # 5. SalePrice > 326100
        condition_SalePrice = (self.cleaned['SalePrice'] <= 326100)

        return [condition_LotArea, condition_TotalBsmtSF, condition_1stFlrSF, condition_GrLivArea, condition_SalePrice]

    def clean_df(self, condition_LotArea=None, condition_TotalBsmtSF=None, condition_1stFlrSF=None, condition_GrLivArea=None, condition_SalePrice=None):
        '''
        This cleans up the dataframe, makes a few features and ordinally encodes some categorical features.
        
        Returns a DataFrame for Train or Test set depending on whether conditionals were given - default is for Test
        '''

        # First, remove all NaN values
        condition0 = (len(self.cleaned) * 0.1)
        cols = self.cleaned.columns

        for col in cols:
            n_NaN = (self.cleaned[col].isna().sum())
            if n_NaN > condition0:
                self.cleaned.drop(columns=col, inplace=True)
            elif 0 < n_NaN < condition0:
                if self.cleaned[col].dtype != 'O':
                    if len(self.cleaned[col].unique()) > 20:
                        self.cleaned[col].fillna(value=self.cleaned[col].mean(), inplace=True)
                    else:
                        self.cleaned[col].fillna(value=self.cleaned[col].mode()[0], inplace=True)
                else:
                    self.cleaned[col].fillna(value=self.cleaned[col].mode()[0], inplace=True)

        # 1. LotShape - (Combine Irregular)
        self.cleaned.loc[self.cleaned['LotShape'].str.startswith('IR'), 'RegularLotShape'] = 0
        self.cleaned.loc[self.cleaned['LotShape'].str.startswith('Reg'), 'RegularLotShape'] = 1

        # 2. LandContour - (Combine non Lvl values)
        self.cleaned.loc[(self.cleaned['LandContour'] == 'Bnk') | (self.cleaned['LandContour'] == 'HLS') | (self.cleaned['LandContour'] == 'Low'), 'LandIsLvl'] = 0
        self.cleaned.loc[self.cleaned['LandContour'] == 'Lvl', 'LandIsLvl'] = 1

        # 3. LotConfig - (FR2, FR3 essentially the same)
        # Ordinality - {'Inside': 0, 'Corner': 1, 'CulDSac': 2, 'FR': 3}
        self.cleaned.loc[self.cleaned['LotConfig'] == 'Inside', 'LotConfigCL'] = 0
        self.cleaned.loc[self.cleaned['LotConfig'] == 'Corner', 'LotConfigCL'] = 1
        self.cleaned.loc[self.cleaned['LotConfig'] == 'CulDSac', 'LotConfigCL'] = 2
        self.cleaned.loc[self.cleaned['LotConfig'].str.startswith('FR'), 'LotConfigCL'] = 3

        # 4. Condition1 - (Combine adjacency types)
        # Ordinality - {'Norm': 0, 'Feedr/Artery': 1, 'RRA/N': 2, 'PosFeat': 3}
        self.cleaned.loc[self.cleaned['Condition1'] == 'Norm', 'LotAdjacencyType'] = 0
        self.cleaned.loc[(self.cleaned['Condition1'] == 'Feedr') | (self.cleaned['Condition1'] == 'Artery'), 'LotAdjacencyType'] = 1
        self.cleaned.loc[self.cleaned['Condition1'].str.startswith('RR'), 'LotAdjacencyType'] = 2
        self.cleaned.loc[self.cleaned['Condition1'].str.startswith('Pos'), 'LotAdjacencyType'] = 3

        # 5. OverallQual - (Combine extremes)
        # Ordinality - {'below_4': 0, 'Average(4,5,6)': 1, 'above_6': 2}
        self.cleaned.loc[self.cleaned['OverallQual'] < 4, 'HouseCondition'] = 0
        self.cleaned.loc[self.cleaned['OverallQual'] <= 6, 'HouseCondition'] = 1
        self.cleaned.loc[self.cleaned['OverallQual'] >= 7, 'HouseCondition'] = 2

        # 6. YearBuilt - Split {MadeBefore1946: 0, MadeAfter1946: 1}
        self.cleaned.loc[self.cleaned['YearBuilt'] < 1946, 'YrBuilt'] = 0
        self.cleaned.loc[self.cleaned['YearBuilt'] >= 1946, 'YrBuilt'] = 1

        # 7. YearRemodAdd - NEW COLUMN - WasRemodeled
        # Process - If the years for YearBuilt and YearRemodAdd are the same, there was no remodel
        self.cleaned.loc[self.cleaned['YearBuilt'] == self.cleaned['YearRemodAdd'], 'WasRemodeled'] = 0
        self.cleaned.loc[self.cleaned['YearBuilt'] != self.cleaned['YearRemodAdd'], 'WasRemodeled'] = 1

        # 8. MasVnrType - (Combine brick-types)
        # Ordinality - {'None': 0, 'Brick': 1, 'Stone': 3}
        self.cleaned.loc[self.cleaned['MasVnrType'] == 'None', 'VeneerType'] = 0
        self.cleaned.loc[self.cleaned['MasVnrType'].str.startswith('Brk'), 'VeneerType'] = 1
        self.cleaned.loc[self.cleaned['MasVnrType'] == 'Stone', 'VeneerType'] = 2

        # 9. HeatingQC - (Combine Fair and Poor - heating is important!)
        # Ordinality - {'Excellent': 0, 'Average': 1, 'Good': 2, 'Poor': 3}
        self.cleaned.loc[self.cleaned['HeatingQC'] == 'Ex', 'HeatingQuality'] = 0
        self.cleaned.loc[self.cleaned['HeatingQC'] == 'TA', 'HeatingQuality'] = 1
        self.cleaned.loc[self.cleaned['HeatingQC'] == 'Gd', 'HeatingQuality'] = 2
        self.cleaned.loc[(self.cleaned['HeatingQC'] == 'Fa') | (self.cleaned['HeatingQC'] == 'Po'), 'HeatingQuality'] = 3

        # 10. Electrical - (Combine all Fuse types)
        # Binary - {'Breaker': 0, 'Fuse': 1}
        self.cleaned.loc[self.cleaned['Electrical'] == 'SBrkr', 'EleSystem'] = 0
        self.cleaned.loc[(self.cleaned['Electrical'].str.startswith('Fuse')) | (self.cleaned['Electrical'] == 'Mix'), 'EleSystem'] = 1

        # 11. BsmtFull/HalfBath - NEW COLUMN - BsmtHasBath
        self.cleaned.loc[(self.cleaned['BsmtFullBath'] == 0) | (self.cleaned['BsmtHalfBath'] == 0), 'BsmtHasBath'] = 0
        self.cleaned.loc[(self.cleaned['BsmtFullBath'] > 0) | (self.cleaned['BsmtHalfBath'] > 0), 'BsmtHasBath'] = 1

        # 12. HalfBath - (Combine 1 and 2 to make binary) - HasHalfBath
        self.cleaned.loc[self.cleaned['HalfBath'] == 0, 'HasHalfBath'] = 0
        self.cleaned.loc[self.cleaned['HalfBath'] > 0, 'HasHalfBath'] = 1

        # 13. BedroomAbvGr - (0-1, 2, 3, 4+)
        # Ordinality - {'less_than_2': 0, '2': 1, '3': 2, '4+': 3}
        self.cleaned.loc[self.cleaned['BedroomAbvGr'] < 2, 'Bedrooms'] = 0
        self.cleaned.loc[self.cleaned['BedroomAbvGr'] == 2, 'Bedrooms'] = 1
        self.cleaned.loc[self.cleaned['BedroomAbvGr'] == 3, 'Bedrooms'] = 2
        self.cleaned.loc[self.cleaned['BedroomAbvGr'] > 3, 'Bedrooms'] = 3

        # 14. TotRmsAvbGrd - NEW COLUMN - AdditionalRooms
        # Make a new column called RemainingRooms that is the difference between Total Rooms and Bedrooms
        # Ordinality - {'less_than_3': 0, '3': 1, '4': 2, '5': 3, 'more_than_5': 4}
        self.cleaned['RemainingRooms'] = self.cleaned['TotRmsAbvGrd'] - self.cleaned['BedroomAbvGr']
        self.cleaned.loc[self.cleaned['RemainingRooms'] < 3, 'AdditionalRooms'] = 0
        self.cleaned.loc[self.cleaned['RemainingRooms'] == 3, 'AdditionalRooms'] = 1
        self.cleaned.loc[self.cleaned['RemainingRooms'] == 4, 'AdditionalRooms'] = 2
        self.cleaned.loc[self.cleaned['RemainingRooms'] == 5, 'AdditionalRooms'] = 3
        self.cleaned.loc[self.cleaned['RemainingRooms'] > 5, 'AdditionalRooms'] = 4

        # 15. Fireplaces - (Combine 2 and 3)
        # Ordinality - {'None': 0, '1': 1, '2+': 2}
        self.cleaned.loc[self.cleaned['Fireplaces'] == 0, 'NumFireplaces'] = 0
        self.cleaned.loc[self.cleaned['Fireplaces'] == 1, 'NumFireplaces'] = 1
        self.cleaned.loc[self.cleaned['Fireplaces'] > 1, 'NumFireplaces'] = 2

        # 16. GarageCars - (Combine 3 and 4)
        # Ordinality - {'0': 0, '1': 1, '2': 2, '3+': 3}
        self.cleaned.loc[self.cleaned['GarageCars'] == 0, 'GarageAreaByCar'] = 0
        self.cleaned.loc[self.cleaned['GarageCars'] == 1, 'GarageAreaByCar'] = 1
        self.cleaned.loc[self.cleaned['GarageCars'] == 2, 'GarageAreaByCar'] = 2
        self.cleaned.loc[self.cleaned['GarageCars'] > 2, 'GarageAreaByCar'] = 3

        # 17. WoodDeckSF - NEW COLUMN - HasDeck
        self.cleaned.loc[self.cleaned['WoodDeckSF'] == 0, 'HasDeck'] = 0
        self.cleaned.loc[self.cleaned['WoodDeckSF'] > 0, 'HasDeck'] = 1

        # 18. PoolArea - NEW COLUMN - HasPool
        self.cleaned.loc[self.cleaned['PoolArea'] == 0, 'HasPool'] = 0
        self.cleaned.loc[self.cleaned['PoolArea'] > 0, 'HasPool'] = 1

        # 19. MoSold - Subtract all items by 1
        # Ordinality - {'Jan': 0 ... 'Dec': 11}
        self.cleaned['MonthSold'] = self.cleaned['MoSold'] - 1

        # 20. YrSold - Convert years to 0-4 - 2010 might not have concluded at creation of dataset
        # Ordinality - {'2006': 0, '2007': 1, '2008': 2, '2009': 3, '2010': 4}
        self.cleaned.loc[self.cleaned['YrSold'] <= 2006, 'YearSold'] = 0
        self.cleaned.loc[self.cleaned['YrSold'] == 2007, 'YearSold'] = 1
        self.cleaned.loc[self.cleaned['YrSold'] == 2008, 'YearSold'] = 2
        self.cleaned.loc[self.cleaned['YrSold'] == 2009, 'YearSold'] = 3
        self.cleaned.loc[self.cleaned['YrSold'] == 2010, 'YearSold'] = 4
        
        # =====================================================================================

        new_columns = list(self.cleaned.columns[-21:])

        features_encoded = self.cleaned[new_columns].astype('int64')

        if condition_LotArea != None:

            df_inter = pd.concat([self.cleaned[['LotArea', 'TotalBsmtSF', '1stFlrSF', 'GrLivArea', 'SalePrice']], features_encoded], axis=1)

            # Final data set should be shape (1253, 26)

            final_clean_filtered = df_inter[condition_LotArea & condition_TotalBsmtSF & condition_1stFlrSF & condition_GrLivArea & condition_SalePrice]

            return final_clean_filtered  # Specifically for Train data set
        
        else:
            
            df_inter = pd.concat([self.cleaned[['Id', 'LotArea', 'TotalBsmtSF', '1stFlrSF', 'GrLivArea']], features_encoded], axis=1)

            return df_inter  # This is specifically for the Test data set (does not have SalePrice)

In [2]:
import numpy as np

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import make_scorer, mean_squared_log_error
from sklearn.model_selection import cross_val_score

def rmsle(pred, true):
  return mean_squared_log_error(pred, true) ** 0.5


scorer = make_scorer(rmsle, greater_is_better=False)

In [3]:
train = Train(pd.read_csv(TRAIN_URL))
test = Train(pd.read_csv(TEST_URL))

# Split the data
X = train.cleaned.drop(columns='SalePrice')
y = train.cleaned['SalePrice']

X_train, y_train, X_val, y_val, X_test, y_test = train.train_val_test_split(
    X=X, y=y, random_state=42
)

print(X_train.shape, y_train.shape, X_val.shape,
      y_val.shape, X_test.shape, y_test.shape)

(751, 25) (751,) (251, 25) (251,) (251, 25) (251,)


In [4]:
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV


param_grid = {
    'n_estimators': [_ for _ in range(50, 501, 50)],
    'max_depth': [_ for _ in range(1, 12)],
    'learning_rate': [1, 0.1, 0.01, 0.001, 0.0001],
    'booster': ['gbtree', 'gblinear', 'dart'],
    'tree_method': ['auto', 'exact', 'approx', 'hist', 'gpu_hist'],
    'gamma': [1, 10, 100, 1000, 1e4, 1e5],
    'min_child_weight': [_ for _ in range(1, 11)],
    'min_delta_step': [_ for _ in range(11)],
    'subsample': [0.25, 0.5, 0.75, 1],
    'reg_alpha': [0, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10, 15, 20, 30, 40, 50, 100],
    'reg_lambda': [0.001, 0.01, 0.1, 1, 5, 10, 15, 20, 30, 40, 50, 100]
}

In [5]:
xgbr = XGBRegressor()

rs = RandomizedSearchCV(xgbr, param_distributions=param_grid, n_iter=100,
                        n_jobs=-1, cv=10, verbose=10)
rs

RandomizedSearchCV(cv=10, error_score=nan,
                   estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                          colsample_bylevel=1,
                                          colsample_bynode=1,
                                          colsample_bytree=1, gamma=0,
                                          importance_type='gain',
                                          learning_rate=0.1, max_delta_step=0,
                                          max_depth=3, min_child_weight=1,
                                          missing=None, n_estimators=100,
                                          n_jobs=1, nthread=None,
                                          objective='reg:linear',
                                          random_state=0, reg_alpha...
                                        'n_estimators': [50, 100, 150, 200, 250,
                                                         300, 350, 400, 450,
                              

In [6]:
rs.fit(X_train, y_train)

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:    6.6s
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:    7.7s
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    8.7s
[Parallel(n_jobs=-1)]: Done  57 tasks      | elapsed:   11.4s
[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed:   12.1s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1807s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Batch computation too slow (2.0695s.) Setting batch_size=1.
[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:   18.1s
[Parallel(n_jobs=-1)]: Done 104 tasks      | elapsed:   20.0s
[Parallel(n_jo



RandomizedSearchCV(cv=10, error_score=nan,
                   estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                          colsample_bylevel=1,
                                          colsample_bynode=1,
                                          colsample_bytree=1, gamma=0,
                                          importance_type='gain',
                                          learning_rate=0.1, max_delta_step=0,
                                          max_depth=3, min_child_weight=1,
                                          missing=None, n_estimators=100,
                                          n_jobs=1, nthread=None,
                                          objective='reg:linear',
                                          random_state=0, reg_alpha...
                                        'n_estimators': [50, 100, 150, 200, 250,
                                                         300, 350, 400, 450,
                              

In [7]:
rs.best_params_

{'booster': 'gbtree',
 'gamma': 1000,
 'learning_rate': 0.1,
 'max_depth': 8,
 'min_child_weight': 9,
 'min_delta_step': 0,
 'n_estimators': 350,
 'reg_alpha': 5,
 'reg_lambda': 100,
 'subsample': 0.5,
 'tree_method': 'approx'}

In [8]:
y_pred_test = rs.predict(X_test)
y_pred_val = rs.predict(X_val)

rmsle_test = mean_squared_log_error(y_test, y_pred_test) ** 0.5
rmsle_val = mean_squared_log_error(y_val, y_pred_val) ** 0.5
print(f'Test RMSLE: {rmsle_test}')
print(f'Val RMSLE: {rmsle_val}')

Test RMSLE: 0.14625599544544843
Val RMSLE: 0.15420556549878434


In [9]:
cleaned_df = test.clean_df()

IDs = cleaned_df['Id']
feat = cleaned_df[cleaned_df.columns[1:]]

y_pred = rs.predict(feat)
submission_rs_xgbr = pd.DataFrame({'Id': IDs, 'SalePrice': y_pred})
submission_rs_xgbr.head()

Unnamed: 0,Id,SalePrice
0,1461,109771.03125
1,1462,150588.5625
2,1463,182100.25
3,1464,176627.6875
4,1465,185286.390625


In [10]:
submission_rs_xgbr.to_csv('submission_rs_xgbr0.csv', index=False)