In [155]:
import pandas as pd
import numpy as np 
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.linear_model import LinearRegression
from EvaluationFunction import *
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn import set_config 
set_config(transform_output='pandas')



In [156]:
# Core libraries
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn import set_config 
set_config(transform_output='pandas')
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error



def classification_metrics(y_true, y_pred, label='', output_dict=False,
                           figsize=(8, 4), normalize='true', cmap='Blues', colorbar=False):
    # Print header and classification report
    header = "-" * 70
    print(header, f" Classification Metrics: {label} ", header, sep='\n')
    print(classification_report(y_true, y_pred, digits=3))
    # Plot confusion matrices
    fig, axes = plt.subplots(ncols=2, figsize=figsize)
    # Raw counts
    ConfusionMatrixDisplay.from_predictions(
        y_true, y_pred, normalize=None, cmap='gist_gray', colorbar=colorbar, ax=axes[0]
    )
    axes[0].set_title("Raw Counts")
    # Normalized
    ConfusionMatrixDisplay.from_predictions(
        y_true, y_pred, normalize=normalize, cmap=cmap, colorbar=colorbar, ax=axes[1]
    )
    axes[1].set_title("Normalized Confusion Matrix")
    fig.tight_layout()
    plt.show()
    # Optional: return dictionary of metrics
    if output_dict:
        return classification_report(y_true, y_pred, output_dict=True)

def evaluate_classification(model, X_train, y_train, X_test, y_test,
                         figsize=(6,4), normalize='true', output_dict = False,
                            cmap_train='Blues', cmap_test="Reds",colorbar=False):
  # Get predictions for training data
  y_train_pred = model.predict(X_train)
  # Call the helper function to obtain regression metrics for training data
  results_train = classification_metrics(y_train, y_train_pred, #verbose = verbose,
                                     output_dict=True, figsize=figsize,
                                         colorbar=colorbar, cmap=cmap_train,
                                     label='Training Data')
  print()
  # Get predictions for test data
  y_test_pred = model.predict(X_test)
  # Call the helper function to obtain regression metrics for test data
  results_test = classification_metrics(y_test, y_test_pred, #verbose = verbose,
                                  output_dict=True,figsize=figsize,
                                         colorbar=colorbar, cmap=cmap_test,
                                    label='Test Data' )
  if output_dict == True:
    # Store results in a dataframe if ouput_frame is True
    results_dict = {'train':results_train,
                    'test': results_test}
    return results_dict

def regression_metrics(y_true, y_pred, label = '', verbose= True, output_dict = False):
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    # rmse = mean_squared_error(y_true, y_pred, squared = False)
    rmse = np.sqrt(mse)
    r_squared = r2_score(y_true, y_pred)
    if verbose == True:
        header = "-"*60
        print(header, f"Regression Metrics: {label}", header, sep = '\n')
        print(f"- MAE = {mae:,.3f}")
        print(f"- MSE = {mse:,.3f}")
        print(f"- RMSE = {rmse:,.3f}")
        print(f"- R² = {r_squared:,.3f}")
    if output_dict == True:
        metrics = {'label':label, 'MAE': mae, 'MSE': mse, 'RMSE': rmse, 'R sqaured': r_squared}
        return metrics
def evaluate_regression(reg, X_train, y_train, X_test, y_test, verbose = True, output_frame = False):
    y_train_pred = reg.predict(X_train)
    results_train = regression_metrics(y_train, y_train_pred, verbose = verbose, output_dict = output_frame, label = 'Training Data')
    
    print()
    
    y_test_pred = reg.predict(X_test)
    results_test = regression_metrics(y_test, y_test_pred, verbose = verbose, output_dict= output_frame, label = 'Test Data')
    if output_frame: 
        results_df = pd.DataFrame([results_train, results_test])
        # results_df = results_df.set_index('label')
        return res



In [106]:
df = pd.read_csv('ames-housing-dojo.csv')

# df.head()

In [107]:
df['Gr Liv Area'] = df['Gr Liv Area'].str.replace("sqft",'')
# df[['Gr Liv Area']]

In [108]:
df  = df.drop(columns=['Unnamed: 0'])
# df.head()

In [109]:
rename_dict = {"Year Remod/Add": "Year Remodeled",
"Bsmt Unf SF": "Bsmt Unf Sqft",
"Total Bsmt SF": "Total Bsmnt Sqft",
"TotRms AbvGrd": "Total Rooms",
"Gr Liv Area": "Living Area Sqft"}
# rename_dict
df = df.rename(rename_dict, axis  = 1)

In [110]:
df.columns

Index(['PID', 'MS Zoning', 'Lot Frontage', 'Lot Area', 'Street', 'Alley',
       'Utilities', 'Neighborhood', 'Bldg Type', 'House Style', 'Overall Qual',
       'Overall Cond', 'Year Built', 'Year Remodeled', 'Exter Qual',
       'Exter Cond', 'Bsmt Unf Sqft', 'Total Bsmnt Sqft', 'Central Air',
       'Living Area Sqft', 'Bsmt Full Bath', 'Bsmt Half Bath', 'Full Bath',
       'Half Bath', 'Bedroom', 'Kitchen', 'Total Rooms', 'Garage Type',
       'Garage Yr Blt', 'Garage Cars', 'Garage Area', 'Garage Qual',
       'Garage Cond', 'Paved Drive', 'Fence', 'Date Sold', 'SalePrice'],
      dtype='object')

In [111]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2959 entries, 0 to 2958
Data columns (total 37 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   PID               2959 non-null   int64  
 1   MS Zoning         2959 non-null   object 
 2   Lot Frontage      2959 non-null   int64  
 3   Lot Area          2959 non-null   int64  
 4   Street            2959 non-null   object 
 5   Alley             201 non-null    object 
 6   Utilities         2959 non-null   object 
 7   Neighborhood      2959 non-null   object 
 8   Bldg Type         2959 non-null   object 
 9   House Style       2959 non-null   object 
 10  Overall Qual      2959 non-null   int64  
 11  Overall Cond      2959 non-null   int64  
 12  Year Built        2959 non-null   int64  
 13  Year Remodeled    2959 non-null   int64  
 14  Exter Qual        2959 non-null   object 
 15  Exter Cond        2959 non-null   object 
 16  Bsmt Unf Sqft     2958 non-null   float64


In [112]:
duplicated_rows = df.duplicated()
df[duplicated_rows]

Unnamed: 0,PID,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Utilities,Neighborhood,Bldg Type,House Style,...,Garage Type,Garage Yr Blt,Garage Cars,Garage Area,Garage Qual,Garage Cond,Paved Drive,Fence,Date Sold,SalePrice
869,535153150,RL,76,9120,Pave,,AllPub,NAmes,1Fam,1Story,...,Attchd,1958.0,2.0,433.0,TA,TA,Y,,11-2008,163000.0
1019,921205030,RL,88,11443,Pave,,AllPub,Timber,1Fam,1Story,...,Attchd,2005.0,3.0,880.0,TA,TA,Y,,03-2006,369900.0
1867,908103280,RL,65,6500,Pave,,AllPub,Edwards,1Fam,1Story,...,Detchd,1991.0,2.0,480.0,TA,TA,Y,,05-2008,135000.0
2029,526351010,RL,81,14267,Pave,,AllPub,NAmes,1Fam,1Story,...,Attchd,1958.0,1.0,312.0,TA,TA,Y,,06-2010,172000.0
2203,923230040,RL,63,9297,Pave,,AllPub,Mitchel,Duplex,1Story,...,Detchd,1976.0,2.0,560.0,TA,TA,Y,,07-2006,188000.0
2306,907262070,RL,72,7226,Pave,,AllPub,CollgCr,1Fam,2Story,...,Attchd,2003.0,2.0,595.0,TA,TA,Y,,06-2008,183000.0
2552,528174020,RL,34,3901,Pave,,AllPub,NridgHt,Twnhs,1Story,...,Attchd,2005.0,2.0,631.0,TA,TA,Y,,08-2007,204000.0


In [113]:
df[duplicated_rows]
df = df.drop_duplicates()

In [114]:
df.duplicated().sum()

np.int64(0)

In [115]:
df.isna().sum()

PID                    0
MS Zoning              0
Lot Frontage           0
Lot Area               0
Street                 0
Alley               2751
Utilities              0
Neighborhood           0
Bldg Type              0
House Style            0
Overall Qual           0
Overall Cond           0
Year Built             0
Year Remodeled         0
Exter Qual             0
Exter Cond             0
Bsmt Unf Sqft          1
Total Bsmnt Sqft       1
Central Air            0
Living Area Sqft       0
Bsmt Full Bath         2
Bsmt Half Bath         2
Full Bath              0
Half Bath              0
Bedroom                0
Kitchen                0
Total Rooms            0
Garage Type          157
Garage Yr Blt        159
Garage Cars            1
Garage Area            1
Garage Qual          159
Garage Cond          159
Paved Drive            0
Fence               2378
Date Sold              0
SalePrice             22
dtype: int64

In [116]:
cols_lables = df.select_dtypes("object").columns

for col in cols_lables:
    print(f"Value counts for {col}")
    print(df[col].value_counts(dropna= False))    
    print('\n')

    

Value counts for MS Zoning
MS Zoning
RL         2287
RM          467
FV          142
RH           27
C (all)      25
I (all)       2
A (agr)       2
Name: count, dtype: int64


Value counts for Street
Street
Pave    2940
Grvl      12
Name: count, dtype: int64


Value counts for Alley
Alley
NaN     2751
Grvl     121
Pave      80
Name: count, dtype: int64


Value counts for Utilities
Utilities
AllPub    2949
NoSewr       2
NoSeWa       1
Name: count, dtype: int64


Value counts for Neighborhood
Neighborhood
NAmes      446
CollgCr    270
OldTown    242
Edwards    195
Somerst    186
NridgHt    167
Gilbert    166
Sawyer     151
NWAmes     132
SawyerW    126
Mitchel    114
BrkSide    109
Crawfor    104
IDOTRR      93
Timber      72
NoRidge     71
StoneBr     51
SWISU       49
ClearCr     44
MeadowV     38
BrDale      30
Blmngtn     28
Veenker     24
NPkVill     23
Blueste     10
Greens       8
GrnHill      2
Landmrk      1
Name: count, dtype: int64


Value counts for Bldg Type
Bldg Type
1Fam

In [117]:
df['Central Air'] = df['Central Air'].replace({'yes' : "Y", 'no':'N'})

In [118]:
df['Half Bath'] = df['Half Bath'].replace({'?': np.nan})
df['Living Area Sqft'] =  df['Living Area Sqft'].astype(float)
df['Half Bath'] = df['Half Bath'].astype(float)

In [119]:
df['Date Sold'].head()

0    03-2006
1    03-2006
2    04-2007
3    06-2008
4    02-2007
Name: Date Sold, dtype: object

In [120]:
df[['Month', 'Year']] = df['Date Sold'].str.split('-', expand= True)
# df.head()

In [121]:
df = df.drop(columns=['Date Sold'])

In [122]:
Excluded_features = [ 'MS Zoning', 'Lot Frontage', 'Lot Area', 'Street', 'Alley',
       'Utilities', 'Neighborhood', 'Bldg Type', 'House Style', 'Year Built', 
        'Bedroom', 'Kitchen', 'Total Rooms', 'Garage Type',
       'Garage Yr Blt', 'Garage Area', 'Month', 'Year']
df = df.drop(columns = Excluded_features)

In [123]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2952 entries, 0 to 2958
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   PID               2952 non-null   int64  
 1   Overall Qual      2952 non-null   int64  
 2   Overall Cond      2952 non-null   int64  
 3   Year Remodeled    2952 non-null   int64  
 4   Exter Qual        2952 non-null   object 
 5   Exter Cond        2952 non-null   object 
 6   Bsmt Unf Sqft     2951 non-null   float64
 7   Total Bsmnt Sqft  2951 non-null   float64
 8   Central Air       2952 non-null   object 
 9   Living Area Sqft  2952 non-null   float64
 10  Bsmt Full Bath    2950 non-null   float64
 11  Bsmt Half Bath    2950 non-null   float64
 12  Full Bath         2952 non-null   int64  
 13  Half Bath         2949 non-null   float64
 14  Garage Cars       2951 non-null   float64
 15  Garage Qual       2793 non-null   object 
 16  Garage Cond       2793 non-null   object 
 17  

In [124]:
columns  = df.columns 
columns

cat_cols  = df.select_dtypes('object').columns


In [125]:
drop_target_nulls = df.dropna(subset=['SalePrice'])
df = drop_target_nulls

df['SalePrice'].isna().sum()

np.int64(0)

In [126]:
for column in columns:
    print(f"Vlaues for {column} are:")
    print(df[column].value_counts(dropna = False))
    print("\n")

Vlaues for PID are:
PID
902201120    1
907227090    1
527108010    1
534275170    1
528104050    1
            ..
908225310    1
904101170    1
535377070    1
528102100    1
528456240    1
Name: count, Length: 2930, dtype: int64


Vlaues for Overall Qual are:
Overall Qual
5     825
6     732
7     602
8     350
4     226
9     107
3      40
10     31
2      13
1       4
Name: count, dtype: int64


Vlaues for Overall Cond are:
Overall Cond
5    1654
6     533
7     390
8     144
4     101
3      50
9      41
2      10
1       7
Name: count, dtype: int64


Vlaues for Year Remodeled are:
Year Remodeled
1950    361
2006    202
2007    164
2005    141
2004    111
       ... 
1986     13
1981     13
2010     13
1983     11
1982      9
Name: count, Length: 61, dtype: int64


Vlaues for Exter Qual are:
Exter Qual
TA    1799
Gd     989
Ex     107
Fa      35
Name: count, dtype: int64


Vlaues for Exter Cond are:
Exter Cond
TA    2549
Gd     299
Fa      67
Ex      12
Po       3
Name: count, dtype

In [127]:
X = df.drop(columns = 'SalePrice')
y = df['SalePrice']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state= 42)

In [128]:
# Numerical features 
numeric_features = X_train.select_dtypes('number').columns

median_impute = SimpleImputer(strategy='median')

numeric_scaler = StandardScaler()

numeric_pipe = make_pipeline(median_impute, numeric_scaler)

# Ordinal features 
ordinal_features = X_train.select_dtypes('object').drop(columns = 'Central Air').columns
ordinal_imputer  = SimpleImputer(strategy='constant', fill_value = 'NA')

exterior_order = ["Po", "Fa", "TA", "Gd", "Ex"]
garage_order = ["NA", "Po", "Fa", "TA", "Gd", "Ex"]  # include NA(NAN) as lowest
paved_drive_order = ["N", "P", "Y"]
fence_order = ["NA","MnWw", "MnPrv", "GdWo", "GdPrv"]

ordinal_categories = [
    exterior_order,   # Exter Qual
    exterior_order,   # Exter Cond
    garage_order,     # Garage Qual
    garage_order,     # Garage Cond
    paved_drive_order,# Paved Drive
    fence_order       # Fence
]

ordinal_encoder = OrdinalEncoder(categories= ordinal_categories)
ordinal_sacler = StandardScaler()
ordinal_pipe = make_pipeline(ordinal_imputer,ordinal_encoder, ordinal_sacler)

# Nominal
nominal_features = ["Central Air"] 
ohe_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')



In [129]:
# ordinal tuple
ordinal_tuple  = ('ordinal', ordinal_pipe, ordinal_features)

# Nominal tuple
nominal_tuple = ('nominal', ohe_encoder, nominal_features)

# numerical tuple

numeric_tuple  = ('numeric', numeric_pipe, numeric_features)


col_transformer = ColumnTransformer([ordinal_tuple, nominal_tuple, numeric_tuple ], verbose_feature_names_out= False)
col_transformer


0,1,2
,transformers,"[('ordinal', ...), ('nominal', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,False
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'NA'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,"[['Po', 'Fa', ...], ['Po', 'Fa', ...], ...]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [130]:
col_transformer.fit(X_train)

0,1,2
,transformers,"[('ordinal', ...), ('nominal', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,False
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'NA'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,"[['Po', 'Fa', ...], ['Po', 'Fa', ...], ...]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [131]:
X_train_prep = col_transformer.transform(X_train)
X_test_prep  = col_transformer.transform(X_test)

In [132]:
# fpth_out = r"C:\Users\hutha\Documents\AXSOS\machine learning\ames-housing-Cleand-withoutNull.csv"
# df.to_csv(fpth_out, index=False)

In [133]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2197 entries, 854 to 871
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   PID               2197 non-null   int64  
 1   Overall Qual      2197 non-null   int64  
 2   Overall Cond      2197 non-null   int64  
 3   Year Remodeled    2197 non-null   int64  
 4   Exter Qual        2197 non-null   object 
 5   Exter Cond        2197 non-null   object 
 6   Bsmt Unf Sqft     2197 non-null   float64
 7   Total Bsmnt Sqft  2197 non-null   float64
 8   Central Air       2197 non-null   object 
 9   Living Area Sqft  2197 non-null   float64
 10  Bsmt Full Bath    2196 non-null   float64
 11  Bsmt Half Bath    2196 non-null   float64
 12  Full Bath         2197 non-null   int64  
 13  Half Bath         2195 non-null   float64
 14  Garage Cars       2196 non-null   float64
 15  Garage Qual       2073 non-null   object 
 16  Garage Cond       2073 non-null   object 
 17 

In [145]:
# LinearReg_model = LinearRegression()
# LinearReg_model.fit(X_train_prep, y_train)

def run_model_preprocessing(df, target='SalePrice', model=None, model_name='Model', random_state=42):


    # Split
    X = df.drop(columns=target)
    y = df[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)

    # === Your preprocessing logic ===
    numeric_features = X_train.select_dtypes('number').columns
    median_impute = SimpleImputer(strategy='median')
    numeric_scaler = StandardScaler()
    numeric_pipe = make_pipeline(median_impute, numeric_scaler)

    ordinal_features = X_train.select_dtypes('object').drop(columns='Central Air').columns
    ordinal_imputer = SimpleImputer(strategy='constant', fill_value='NA')

    exterior_order = ["Po", "Fa", "TA", "Gd", "Ex"]
    garage_order = ["NA", "Po", "Fa", "TA", "Gd", "Ex"]
    paved_drive_order = ["N", "P", "Y"]
    fence_order = ["NA", "MnWw", "MnPrv", "GdWo", "GdPrv"]

    ordinal_categories = [
        exterior_order,   # Exter Qual
        exterior_order,   # Exter Cond
        garage_order,     # Garage Qual
        garage_order,     # Garage Cond
        paved_drive_order,# Paved Drive
        fence_order       # Fence
    ]

    ordinal_encoder = OrdinalEncoder(categories=ordinal_categories)
    ordinal_scaler = StandardScaler()
    ordinal_pipe = make_pipeline(ordinal_imputer, ordinal_encoder, ordinal_scaler)

    nominal_features = ['Central Air']
    ohe_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

    # ColumnTransformer
    col_transformer = ColumnTransformer([
        ('ordinal', ordinal_pipe, ordinal_features),
        ('nominal', ohe_encoder, nominal_features),
        ('numeric', numeric_pipe, numeric_features)
    ], verbose_feature_names_out=False)

    # Fit and transform
    col_transformer.fit(X_train)
    X_train_prep = col_transformer.transform(X_train)
    X_test_prep = col_transformer.transform(X_test)

    # Fit model
    model.fit(X_train_prep, y_train)

    # Predict
    # y_train_pred = model.predict(X_train_prep)
    # y_test_pred = model.predict(X_test_prep)

    print(f"\n Evaluating {model_name}")
    train_metrics = evaluate_regression(model, X_train_prep, y_train, X_test_prep, y_test

)#, dataset_name='Train')
    # test_metrics = evaluate_regression(model, X_train_prep, y_train, X_test_prep, y_test)#, dataset_name='Test')

    # return model, train_metrics, test_metrics


In [148]:

Linear_model = LinearRegression()
run_model_preprocessing(df, model=Linear_model, model_name='Linear Regression')



 Evaluating Linear Regression
------------------------------------------------------------
Regression Metrics: Training Data
------------------------------------------------------------
- MAE = 23,123.309
- MSE = 2,922,825,458.694
- RMSE = 54,063.162
- R² = 0.651

------------------------------------------------------------
Regression Metrics: Test Data
------------------------------------------------------------
- MAE = 21,387.621
- MSE = 935,438,761.878
- RMSE = 30,584.943
- R² = 0.809


In [149]:
RForest_model = RandomForestRegressor()
run_model_preprocessing(df,model=RForest_model, model_name= 'Random Forest')


 Evaluating Random Forest
------------------------------------------------------------
Regression Metrics: Training Data
------------------------------------------------------------
- MAE = 6,788.958
- MSE = 300,005,848.985
- RMSE = 17,320.677
- R² = 0.964

------------------------------------------------------------
Regression Metrics: Test Data
------------------------------------------------------------
- MAE = 17,161.471
- MSE = 795,427,333.597
- RMSE = 28,203.321
- R² = 0.838


In [150]:
RForest_model.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [157]:
param_grid = {
    "n_estimators": [300, 600, 900],
    "max_depth": [None, 15, 25, 35],
    "max_features": ["sqrt", "log2", 0.5],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4, 8],
    "max_samples": [None, 0.9, 0.75]
}
cv = KFold(n_splits=3, shuffle=True, random_state=42)

grid_search = GridSearchCV( estimator=RForest_model, param_grid=param_grid,cv=cv, scoring={"r2": "r2", "rmse": "neg_root_mean_squared_error"}, refit="rmse", n_jobs=-1, verbose=1, return_train_score=True)
grid_search.fit(X_train_prep, y_train)


Fitting 3 folds for each of 1296 candidates, totalling 3888 fits


KeyboardInterrupt: 