In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from ipywidgets import interact, fixed  
from itertools import product
import scipy.stats as ss
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv("housing_dataset.csv")
numerical_feature_df= pd.read_csv("Selected_numerical_features.csv")
categorical_feature_df = pd.read_csv("Selected_categorical_features.csv")
categorical_feature_df= categorical_feature_df[["Feature","Frequence"]]

In [3]:
numerical_feature_list = numerical_feature_df[numerical_feature_df["Frequence"]>1].Feature.values
categorical_feature_list = categorical_feature_df[categorical_feature_df["Frequence"]>4].Feature.values
categorical_feature_list

array(['Neighborhood', 'ExterQual', 'BsmtQual', 'KitchenQual',
       'FireplaceQu', 'GarageFinish'], dtype=object)

In [4]:
final_feature = list(numerical_feature_list)+list(categorical_feature_list )
len(final_feature)

26

In [5]:
df = df[final_feature+["SalePrice"]]

In [6]:
#@interact(df_arg=fixed(numerical_df),k=(3,0,-0.5))
def remove_outliers_iqr(df_arg,k = 1.5):
    df_iqr = df_arg.copy()
    #columns = [col for col in df_arg.columns]
    for col in df_arg.columns:
        if df_arg[col].dtypes!="object":
            q25, q75 = q25, q75 = df_iqr[col].quantile(.25), df_iqr[col].quantile(.75)
            ecart_iqr = q75- q25
            cut_off = ecart_iqr*k
            lower = q25-cut_off
            upper = q75+cut_off
            df_iqr[col] =np.where(((df_iqr[col]< lower)|(df_iqr[col]> upper)),df_iqr[col].median(),df_iqr[col])
    return df_iqr  

In [7]:
df_work = remove_outliers_iqr(df,3)
df_work.columns

Index(['OverallQual', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
       '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath',
       'BsmtHalfBath', 'FullBath', 'BedroomAbvGr', 'KitchenAbvGr',
       'GarageCars', 'LotArea', 'YearBuilt', 'YearRemodAdd', 'GarageArea',
       'TotRmsAbvGrd', 'Neighborhood', 'ExterQual', 'BsmtQual', 'KitchenQual',
       'FireplaceQu', 'GarageFinish', 'SalePrice'],
      dtype='object')

In [8]:
df_work.isnull().sum().sum()

808

 ## Model Training

In [21]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder 
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_selector as selector


def preprocess_data(df_arg,features):
    categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")
    numerical_preprocessor = MinMaxScaler()
    x_inputs = df_arg.copy().drop(columns=["SalePrice"])[features]
    numerical_columns_selector = selector(dtype_exclude=object)
    categorical_columns_selector = selector(dtype_include=object)
    numerical_columns = numerical_columns_selector(x_inputs)
    categorical_columns = categorical_columns_selector(x_inputs)
    preprocessor = ColumnTransformer(transformers=[
    ('one-hot-encoder', categorical_preprocessor,categorical_columns),
    ('min_max_scaler', numerical_preprocessor, numerical_columns)])
    X_processed= preprocessor.fit_transform(x_inputs)
    y_processed = (df_arg.copy()["SalePrice"]-df_arg.copy()["SalePrice"].min())/((df_arg.copy()["SalePrice"].max()-df_arg.copy()["SalePrice"].min()))
    trainX, testX, trainY, testY= train_test_split(X_processed, y_processed, train_size=0.8)
    trainX.shape, testY.shape
    return trainX, testX, trainY, testY


In [None]:
def get_metrics(y_true, y_pred):
    metrics_dict  = {"MSE": 0,"MAE":0, "RMSE":0,"MAPE":0}
    mse = np.mean(np.power(y_true - y_pred,2))
    metrics_dict["MSE"] = np.round(mse,5)
    mae = np.mean(np.abs(y_true - y_pred,2))
    metrics_dict["MAE"] = np.round(mae,5)
    rmse = np.sqrt(np.mean(np.power(y_true - y_pred,2)))
    metrics_dict["RMSE"] = np.round(rmse,5)
    mape = 100*((y_true - y_pred,2) / np.abs(y_true)).sum() / len(y_true)
    metrics_dict["MAPE"] = np.round(mape,5)
    return metrics_dict

def print_metrics(metrics_dict):
    for k in metrics_dict.keys():
        print(k+" : %f"%(np.round(metrics_dict[k],3)))
    

In [25]:
trainX, testX, trainY, testY = preprocess_data(df_work,final_feature)
trainX.shape, testY.shape

((1168, 68), (292,))

In [28]:
rf = RandomForestRegressor()
y_pred = rf.fit(trainX, trainY).predict(testX)

(292,)

In [29]:
y_pred

array([0.16116824, 0.5073654 , 0.20727993, 0.33877005, 0.26000784,
       0.41473999, 0.18424869, 0.29218944, 0.24921425, 0.28349751,
       0.10013469, 0.20584515, 0.30593815, 0.33907475, 0.30961172,
       0.28034294, 0.05123557, 0.37787696, 0.25789282, 0.48349393,
       0.27187854, 0.47086705, 0.14212751, 0.40901279, 0.65117587,
       0.25237619, 0.40253536, 0.2394313 , 0.50848147, 0.17547431,
       0.17596647, 0.20760995, 0.17793313, 0.31403743, 0.21811207,
       0.55043257, 0.6465365 , 0.18020181, 0.54540718, 0.11184401,
       0.2194292 , 0.41061321, 0.27607059, 0.27216061, 0.53410881,
       0.3406903 , 0.3147482 , 0.19438349, 0.23739014, 0.25955889,
       0.38361079, 0.31780251, 0.21387236, 0.15698028, 0.41096257,
       0.29822948, 0.39541374, 0.36159928, 0.43372399, 0.25470293,
       0.28179149, 0.47057938, 0.41761665, 0.27038163, 0.56764111,
       0.65499549, 0.1483093 , 0.29822969, 0.16006199, 0.19345606,
       0.25853174, 0.25548103, 0.69035936, 0.25598826, 0.58013

In [None]:
import lazypredict
from lazypredict.Supervised import LazyRegressor
reg = LazyRegressor(verbose=0,ignore_warnings=True, custom_metric=None)
models,predictions = reg.fit(trainX,testX, trainY,testY)
print(models)