In [22]:
import pandas as pd

# Load the datasets
test_df = pd.read_csv('test.csv')
train_df = pd.read_csv('train.csv')

# Display the first few rows of each dataset to understand their structure
print(test_df.head())
print(train_df.head())



     Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape   
0  1461          20       RH         80.0    11622   Pave   NaN      Reg  \
1  1462          20       RL         81.0    14267   Pave   NaN      IR1   
2  1463          60       RL         74.0    13830   Pave   NaN      IR1   
3  1464          60       RL         78.0     9978   Pave   NaN      IR1   
4  1465         120       RL         43.0     5005   Pave   NaN      IR1   

  LandContour Utilities  ... ScreenPorch PoolArea PoolQC  Fence MiscFeature   
0         Lvl    AllPub  ...         120        0    NaN  MnPrv         NaN  \
1         Lvl    AllPub  ...           0        0    NaN    NaN        Gar2   
2         Lvl    AllPub  ...           0        0    NaN  MnPrv         NaN   
3         Lvl    AllPub  ...           0        0    NaN    NaN         NaN   
4         HLS    AllPub  ...         144        0    NaN    NaN         NaN   

  MiscVal MoSold  YrSold  SaleType  SaleCondition  
0       0      6

In [23]:
# Display basic information about each dataset to understand data types and missing values
print(test_df.info())
print(train_df.info())



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1459 non-null   int64  
 1   MSSubClass     1459 non-null   int64  
 2   MSZoning       1455 non-null   object 
 3   LotFrontage    1232 non-null   float64
 4   LotArea        1459 non-null   int64  
 5   Street         1459 non-null   object 
 6   Alley          107 non-null    object 
 7   LotShape       1459 non-null   object 
 8   LandContour    1459 non-null   object 
 9   Utilities      1457 non-null   object 
 10  LotConfig      1459 non-null   object 
 11  LandSlope      1459 non-null   object 
 12  Neighborhood   1459 non-null   object 
 13  Condition1     1459 non-null   object 
 14  Condition2     1459 non-null   object 
 15  BldgType       1459 non-null   object 
 16  HouseStyle     1459 non-null   object 
 17  OverallQual    1459 non-null   int64  
 18  OverallC

In [24]:
# Function to display missing values
def missing_values_table(df):
    mis_val = df.isnull().sum()
    mis_val_percent = 100 * df.isnull().sum() / len(df)
    mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
    mis_val_table_ren_columns = mis_val_table.rename(columns={0: 'Missing Values', 1: '% of Total Values'})
    mis_val_table_ren_columns = mis_val_table_ren_columns[mis_val_table_ren_columns.iloc[:, 1] != 0].sort_values('% of Total Values', ascending=False).round(1)
    print(mis_val_table_ren_columns)

# Display missing values in each dataset
print("Missing values in test dataset:")
missing_values_table(test_df)
print("\nMissing values in train dataset:")
missing_values_table(train_df)


Missing values in test dataset:
              Missing Values  % of Total Values
PoolQC                  1456               99.8
MiscFeature             1408               96.5
Alley                   1352               92.7
Fence                   1169               80.1
MasVnrType               894               61.3
FireplaceQu              730               50.0
LotFrontage              227               15.6
GarageCond                78                5.3
GarageYrBlt               78                5.3
GarageQual                78                5.3
GarageFinish              78                5.3
GarageType                76                5.2
BsmtCond                  45                3.1
BsmtExposure              44                3.0
BsmtQual                  44                3.0
BsmtFinType1              42                2.9
BsmtFinType2              42                2.9
MasVnrArea                15                1.0
MSZoning                   4                0.3
BsmtFull

In [25]:
# Handling missing values
# Fill missing values for numerical columns with median
test_df['LotFrontage'] = test_df['LotFrontage'].fillna(test_df['LotFrontage'].median())
train_df['LotFrontage'] = train_df['LotFrontage'].fillna(train_df['LotFrontage'].median())

# Fill missing values for categorical columns with mode
test_df['MSZoning'] = test_df['MSZoning'].fillna(test_df['MSZoning'].mode()[0])
train_df['MSZoning'] = train_df['MSZoning'].fillna(train_df['MSZoning'].mode()[0])

# Drop columns with too many missing values
test_df = test_df.drop(['Alley', 'PoolQC', 'Fence', 'MiscFeature'], axis=1)
train_df = train_df.drop(['Alley', 'PoolQC', 'Fence', 'MiscFeature'], axis=1)

# Verify if missing values have been handled
print("After handling missing values:")
print(test_df.isnull().sum())
print(train_df.isnull().sum())

After handling missing values:
Id               0
MSSubClass       0
MSZoning         0
LotFrontage      0
LotArea          0
                ..
MiscVal          0
MoSold           0
YrSold           0
SaleType         1
SaleCondition    0
Length: 76, dtype: int64
Id               0
MSSubClass       0
MSZoning         0
LotFrontage      0
LotArea          0
                ..
MoSold           0
YrSold           0
SaleType         0
SaleCondition    0
SalePrice        0
Length: 77, dtype: int64


In [32]:
test_df.to_csv("test_df.csv", index=False)


In [33]:
train_df.to_csv("train_df.csv", index=False)

In [35]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
import pandas as pd

# Load the updated datasets
test_df = pd.read_csv('test_df.csv')
train_df = pd.read_csv('train_df.csv')

# Separate features and target variable from training data
X = train_df.drop(['SalePrice', 'Id'], axis=1)
y = train_df['SalePrice']

# Split the training data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the parameter grid for grid search
param_grid = {
    'max_depth': [3, 5, 7, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2', None]
}

# Initialize the DecisionTreeRegressor
dt = DecisionTreeRegressor(random_state=42)

# Perform grid search with cross-validation
grid_search = GridSearchCV(estimator=dt, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Get the best model
best_dt = grid_search.best_estimator_

# Predict on validation set
y_val_pred = best_dt.predict(X_val)
val_mse = mean_squared_error(y_val, y_val_pred)
val_rmse = val_mse ** 0.5

# Predict on test set
X_test = test_df.drop(['Id'], axis=1)
test_df['SalePrice'] = best_dt.predict(X_test)

#import ace_tools as tools; tools.display_dataframe_to_user(name="Test Predictions", dataframe=test_df)

# Save predictions to a new CSV file
test_df[['Id', 'SalePrice']].to_csv('/mnt/data/test_predictions.csv', index=False)

val_rmse, grid_search.best_params_


Fitting 5 folds for each of 180 candidates, totalling 900 fits


ValueError: 
All the 900 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
180 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\user\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\user\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\tree\_classes.py", line 1247, in fit
    super().fit(
  File "C:\Users\user\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\tree\_classes.py", line 186, in fit
    X, y = self._validate_data(
           ^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\user\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 579, in _validate_data
    X = check_array(X, input_name="X", **check_X_params)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\user\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\utils\validation.py", line 879, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\user\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\utils\_array_api.py", line 185, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\user\AppData\Local\Programs\Python\Python311\Lib\site-packages\pandas\core\generic.py", line 1998, in __array__
    arr = np.asarray(values, dtype=dtype)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: could not convert string to float: 'RM'

--------------------------------------------------------------------------------
720 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\user\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\user\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\tree\_classes.py", line 1247, in fit
    super().fit(
  File "C:\Users\user\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\tree\_classes.py", line 186, in fit
    X, y = self._validate_data(
           ^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\user\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 579, in _validate_data
    X = check_array(X, input_name="X", **check_X_params)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\user\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\utils\validation.py", line 879, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\user\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\utils\_array_api.py", line 185, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\user\AppData\Local\Programs\Python\Python311\Lib\site-packages\pandas\core\generic.py", line 1998, in __array__
    arr = np.asarray(values, dtype=dtype)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: could not convert string to float: 'RL'


In [None]:
# Display the test predictions
#import ace_tools as tools

#tools.display_dataframe_to_user(name="Test Predictions", dataframe=test_df[['Id', 'SalePrice']])

# Return the best parameters and RMSE
val_rmse, grid_search.best_params_


In [None]:
test_df[['Id', 'SalePrice']].to_csv('/mnt/data/test_predictions.csv', index=False)


In [38]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Load the datasets
test_df = pd.read_csv('test_df.csv')
train_df = pd.read_csv('train_df.csv')

# Separate features and target variable from training data
X = train_df.drop(['SalePrice', 'Id'], axis=1)
y = train_df['SalePrice']
X_test = test_df.drop(['Id'], axis=1)

# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns
numerical_cols = X.select_dtypes(include=['number']).columns

# Preprocessing pipelines for numerical and categorical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Define the model
model = DecisionTreeRegressor(random_state=42)

# Create and evaluate the pipeline with grid search
pipe = Pipeline(steps=[('preprocessor', preprocessor),
                       ('model', model)])

param_grid = {
    'model__max_depth': [3, 5, 7, 10, None],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4],
    'model__max_features': ['auto', 'sqrt', 'log2', None]
}

grid_search = GridSearchCV(pipe, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2)
grid_search.fit(X, y)

# Get the best model
best_model = grid_search.best_estimator_

# Predict on the test set
test_df['SalePrice'] = best_model.predict(X_test)

# Save predictions to a new CSV file
test_df[['Id', 'SalePrice']].to_csv('test_predictions.csv', index=False)

#import ace_tools as tools; tools.display_dataframe_to_user(name="Test Predictions", dataframe=test_df)

grid_search.best_params_, grid_search.best_score_


Fitting 5 folds for each of 180 candidates, totalling 900 fits




({'model__max_depth': 7,
  'model__max_features': 'auto',
  'model__min_samples_leaf': 2,
  'model__min_samples_split': 10},
 -1428981414.0830266)

In [39]:
grid_search.best_score_

-1428981414.0830266

In [41]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import numpy as np

# Load the datasets
test_df = pd.read_csv('test_df.csv')
train_df = pd.read_csv('train_df.csv')

# Separate features and target variable from training data
X = train_df.drop(['SalePrice', 'Id'], axis=1)
y = train_df['SalePrice']
X_test = test_df.drop(['Id'], axis=1)

# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns
numerical_cols = X.select_dtypes(include=['number']).columns

# Preprocessing pipelines for numerical and categorical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Define the model
model = DecisionTreeRegressor(random_state=42)

# Create and evaluate the pipeline with grid search
pipe = Pipeline(steps=[('preprocessor', preprocessor),
                       ('model', model)])

param_grid = {
    'model__max_depth': [3, 5, 7, 10, None],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4],
    'model__max_features': ['auto', 'sqrt', 'log2', None]
}

# Split the training data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Perform grid search with cross-validation
grid_search = GridSearchCV(pipe, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Get the best model
best_model = grid_search.best_estimator_

# Predict on validation set
y_val_pred = best_model.predict(X_val)
val_mse = mean_squared_error(y_val, y_val_pred)
val_rmse = np.sqrt(val_mse)

# Predict on test set
test_df['SalePrice'] = best_model.predict(X_test)

#import ace_tools as tools; tools.display_dataframe_to_user(name="Test Predictions", dataframe=test_df)

# Save predictions to a new CSV file
test_df[['Id', 'SalePrice']].to_csv('test_predictions.csv', index=False)

# Print evaluation metrics
print(f"Validation RMSE: {val_rmse}")
print(f"Best Parameters: {grid_search.best_params_}")

# Return the best parameters and RMSE
val_rmse, grid_search.best_params_


Fitting 5 folds for each of 180 candidates, totalling 900 fits




Validation RMSE: 41170.644163996796
Best Parameters: {'model__max_depth': None, 'model__max_features': 'auto', 'model__min_samples_leaf': 4, 'model__min_samples_split': 10}


(41170.644163996796,
 {'model__max_depth': None,
  'model__max_features': 'auto',
  'model__min_samples_leaf': 4,
  'model__min_samples_split': 10})

In [46]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Load the datasets
train_df = pd.read_csv('train_df.csv')
test_df = pd.read_csv('test_df.csv')

# Create new features
train_df['TotalSF'] = train_df['TotalBsmtSF'] + train_df['1stFlrSF'] + train_df['2ndFlrSF']
test_df['TotalSF'] = test_df['TotalBsmtSF'] + test_df['1stFlrSF'] + test_df['2ndFlrSF']

train_df['OverallQual_GrLivArea'] = train_df['OverallQual'] * train_df['GrLivArea']
test_df['OverallQual_GrLivArea'] = test_df['OverallQual'] * test_df['GrLivArea']

train_df['GrLivArea'] = np.log1p(train_df['GrLivArea'])
test_df['GrLivArea'] = np.log1p(test_df['GrLivArea'])

train_df['HouseAge'] = train_df['YrSold'] - train_df['YearBuilt']
test_df['HouseAge'] = test_df['YrSold'] - test_df['YearBuilt']

train_df['TotalPorchSF'] = (train_df['OpenPorchSF'] + train_df['3SsnPorch'] +
                            train_df['EnclosedPorch'] + train_df['ScreenPorch'] + train_df['WoodDeckSF'])
test_df['TotalPorchSF'] = (test_df['OpenPorchSF'] + test_df['3SsnPorch'] +
                           test_df['EnclosedPorch'] + test_df['ScreenPorch'] + test_df['WoodDeckSF'])

# Handle categorical variables
categorical_cols = train_df.select_dtypes(include=['object', 'category']).columns
train_df = pd.get_dummies(train_df, columns=categorical_cols)
test_df = pd.get_dummies(test_df, columns=categorical_cols)
train_df, test_df = train_df.align(test_df, join='left', axis=1, fill_value=0)

# Separate features and target variable from training data
X = train_df.drop(['SalePrice', 'Id'], axis=1)
y = train_df['SalePrice']
X_test = test_df.drop(['Id'], axis=1)

# Split the training data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the model
model = DecisionTreeRegressor(random_state=42)

# Create and evaluate the pipeline with grid search
pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('model', model)
])

param_grid = {
    'model__max_depth': [3, 5, 7, 10, None],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4],
    'model__max_features': ['auto', 'sqrt', 'log2', None]
}

grid_search = GridSearchCV(pipe, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Get the best model
best_model = grid_search.best_estimator_

# Predict on validation set
y_val_pred = best_model.predict(X_val)
val_mse = mean_squared_error(y_val, y_val_pred)
val_rmse = np.sqrt(val_mse)

# Predict on test set
test_df['SalePrice'] = best_model.predict(X_test)

# Save predictions to a new CSV file
test_df[['Id', 'SalePrice']].to_csv('/mnt/data/test_predictions.csv', index=False)

#import ace_tools as tools; tools.display_dataframe_to_user(name="Test Predictions", dataframe=test_df)

# Print evaluation metrics
print(f"Validation RMSE: {val_rmse}")
print(f"Best Parameters: {grid_search.best_params_}")

# Return the best parameters and RMSE
val_rmse, grid_search.best_params_


Fitting 5 folds for each of 180 candidates, totalling 900 fits




ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- SalePrice


In [48]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Load the datasets
train_df = pd.read_csv('train_df.csv')
test_df = pd.read_csv('test_df.csv')

# Create new features
train_df['TotalSF'] = train_df['TotalBsmtSF'] + train_df['1stFlrSF'] + train_df['2ndFlrSF']
test_df['TotalSF'] = test_df['TotalBsmtSF'] + test_df['1stFlrSF'] + test_df['2ndFlrSF']

train_df['OverallQual_GrLivArea'] = train_df['OverallQual'] * train_df['GrLivArea']
test_df['OverallQual_GrLivArea'] = test_df['OverallQual'] * test_df['GrLivArea']

train_df['GrLivArea'] = np.log1p(train_df['GrLivArea'])
test_df['GrLivArea'] = np.log1p(test_df['GrLivArea'])

train_df['HouseAge'] = train_df['YrSold'] - train_df['YearBuilt']
test_df['HouseAge'] = test_df['YrSold'] - test_df['YearBuilt']

train_df['TotalPorchSF'] = (train_df['OpenPorchSF'] + train_df['3SsnPorch'] +
                            train_df['EnclosedPorch'] + train_df['ScreenPorch'] + train_df['WoodDeckSF'])
test_df['TotalPorchSF'] = (test_df['OpenPorchSF'] + test_df['3SsnPorch'] +
                           test_df['EnclosedPorch'] + test_df['ScreenPorch'] + test_df['WoodDeckSF'])

# Handle categorical variables
categorical_cols = train_df.select_dtypes(include=['object', 'category']).columns
train_df = pd.get_dummies(train_df, columns=categorical_cols)
test_df = pd.get_dummies(test_df, columns=categorical_cols)
train_df, test_df = train_df.align(test_df, join='left', axis=1, fill_value=0)

# Separate features and target variable from training data
X = train_df.drop(['SalePrice', 'Id'], axis=1)
y = train_df['SalePrice']
X_test = test_df.drop(['Id', 'SalePrice'], axis=1, errors='ignore')

# Split the training data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the model
model = DecisionTreeRegressor(random_state=42)

# Create and evaluate the pipeline with grid search
pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('model', model)
])

param_grid = {
    'model__max_depth': [3, 5, 7, 10, None],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4],
    'model__max_features': ['sqrt', 'log2', None]  # 'auto' removed to avoid warning
}

grid_search = GridSearchCV(pipe, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Get the best model
best_model = grid_search.best_estimator_

# Predict on validation set
y_val_pred = best_model.predict(X_val)
val_mse = mean_squared_error(y_val, y_val_pred)
val_rmse = np.sqrt(val_mse)

# Predict on test set
test_df['SalePrice'] = best_model.predict(X_test)

# Save predictions to a new CSV file
test_df[['Id', 'SalePrice']].to_csv('test_predictions100.csv', index=False)

#import ace_tools as tools; tools.display_dataframe_to_user(name="Test Predictions", dataframe=test_df)

# Print evaluation metrics
print(f"Validation RMSE: {val_rmse}")
print(f"Best Parameters: {grid_search.best_params_}")

# Return the best parameters and RMSE
val_rmse, grid_search.best_params_


Fitting 5 folds for each of 135 candidates, totalling 675 fits
Validation RMSE: 38440.0099609417
Best Parameters: {'model__max_depth': 7, 'model__max_features': None, 'model__min_samples_leaf': 2, 'model__min_samples_split': 5}


(38440.0099609417,
 {'model__max_depth': 7,
  'model__max_features': None,
  'model__min_samples_leaf': 2,
  'model__min_samples_split': 5})

In [52]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Load the datasets
train_df = pd.read_csv('train_df.csv')
test_df = pd.read_csv('test_df.csv')

# Create new features
train_df['TotalSF'] = train_df['TotalBsmtSF'] + train_df['1stFlrSF'] + train_df['2ndFlrSF']
test_df['TotalSF'] = test_df['TotalBsmtSF'] + test_df['1stFlrSF'] + test_df['2ndFlrSF']

train_df['OverallQual_GrLivArea'] = train_df['OverallQual'] * train_df['GrLivArea']
test_df['OverallQual_GrLivArea'] = test_df['OverallQual'] * test_df['GrLivArea']

train_df['GrLivArea'] = np.log1p(train_df['GrLivArea'])
test_df['GrLivArea'] = np.log1p(test_df['GrLivArea'])

train_df['HouseAge'] = train_df['YrSold'] - train_df['YearBuilt']
test_df['HouseAge'] = test_df['YrSold'] - test_df['YearBuilt']

train_df['TotalPorchSF'] = (train_df['OpenPorchSF'] + train_df['3SsnPorch'] +
                            train_df['EnclosedPorch'] + train_df['ScreenPorch'] + train_df['WoodDeckSF'])
test_df['TotalPorchSF'] = (test_df['OpenPorchSF'] + test_df['3SsnPorch'] +
                           test_df['EnclosedPorch'] + test_df['ScreenPorch'] + test_df['WoodDeckSF'])

# Handle categorical variables
categorical_cols = train_df.select_dtypes(include=['object', 'category']).columns
train_df = pd.get_dummies(train_df, columns=categorical_cols)
test_df = pd.get_dummies(test_df, columns=categorical_cols)
train_df, test_df = train_df.align(test_df, join='left', axis=1, fill_value=0)

# Separate features and target variable from training data
X = train_df.drop(['SalePrice', 'Id'], axis=1)
y = train_df['SalePrice']
X_test = test_df.drop(['Id', 'SalePrice'], axis=1, errors='ignore')

# Split the training data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the model
model = DecisionTreeRegressor(random_state=42)

# Create and evaluate the pipeline with grid search
pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('model', model)
])

param_grid = {
    'model__max_depth': [3, 5, 7, 10, None],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4],
    'model__max_features': ['sqrt', 'log2', None]  # 'auto' removed to avoid warning
}

grid_search = GridSearchCV(pipe, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Get the best model
best_model = grid_search.best_estimator_

# Predict on validation set
y_val_pred = best_model.predict(X_val)
val_mse = mean_squared_error(y_val, y_val_pred)
val_rmse = np.sqrt(val_mse)
val_r2 = r2_score(y_val, y_val_pred)

# Predict on test set
test_df['SalePrice'] = best_model.predict(X_test)

# Save predictions to a new CSV file
test_df[['Id', 'SalePrice']].to_csv('test_predictions102.csv', index=False)

#import ace_tools as tools; tools.display_dataframe_to_user(name="Test Predictions", dataframe=test_df)

# Print evaluation metrics
print(f"Validation RMSE: {val_rmse}")
print(f"Validation R^2: {val_r2}")
print(f"Best Parameters: {grid_search.best_params_}")

# Return the best parameters, RMSE, and R^2 score
val_rmse, val_r2, grid_search.best_params_


Fitting 5 folds for each of 135 candidates, totalling 675 fits
Validation RMSE: 38440.0099609417
Validation R^2: 0.8073570372518514
Best Parameters: {'model__max_depth': 7, 'model__max_features': None, 'model__min_samples_leaf': 2, 'model__min_samples_split': 5}


(38440.0099609417,
 0.8073570372518514,
 {'model__max_depth': 7,
  'model__max_features': None,
  'model__min_samples_leaf': 2,
  'model__min_samples_split': 5})

In [56]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Load the datasets
train_df = pd.read_csv('train_df.csv')
test_df = pd.read_csv('test_df.csv')

# Create new features
train_df['TotalSF'] = train_df['TotalBsmtSF'] + train_df['1stFlrSF'] + train_df['2ndFlrSF']
test_df['TotalSF'] = test_df['TotalBsmtSF'] + test_df['1stFlrSF'] + test_df['2ndFlrSF']

train_df['OverallQual_GrLivArea'] = train_df['OverallQual'] * train_df['GrLivArea']
test_df['OverallQual_GrLivArea'] = test_df['OverallQual'] * test_df['GrLivArea']

train_df['GrLivArea'] = np.log1p(train_df['GrLivArea'])
test_df['GrLivArea'] = np.log1p(test_df['GrLivArea'])

train_df['HouseAge'] = train_df['YrSold'] - train_df['YearBuilt']
test_df['HouseAge'] = test_df['YrSold'] - test_df['YearBuilt']

train_df['TotalPorchSF'] = (train_df['OpenPorchSF'] + train_df['3SsnPorch'] +
                            train_df['EnclosedPorch'] + train_df['ScreenPorch'] + train_df['WoodDeckSF'])
test_df['TotalPorchSF'] = (test_df['OpenPorchSF'] + test_df['3SsnPorch'] +
                           test_df['EnclosedPorch'] + test_df['ScreenPorch'] + test_df['WoodDeckSF'])

# Handle categorical variables
categorical_cols = train_df.select_dtypes(include=['object', 'category']).columns
train_df = pd.get_dummies(train_df, columns=categorical_cols)
test_df = pd.get_dummies(test_df, columns=categorical_cols)
train_df, test_df = train_df.align(test_df, join='left', axis=1, fill_value=0)

# Separate features and target variable from training data
#X = train_df.drop(['SalePrice', 'Id'], axis=1)
#y = train_df['SalePrice']
#X_test = test_df.drop(['Id'], axis=1)
X = train_df.drop(['SalePrice', 'Id'], axis=1)
y = train_df['SalePrice']
X_test = test_df.drop(['Id', 'SalePrice'], axis=1, errors='ignore')
# Split the training data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the model
rf_model = RandomForestRegressor(random_state=42)

# Create and evaluate the pipeline with grid search
pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('model', rf_model)
])

param_grid = {
    'model__n_estimators': [100, 200, 300],
    'model__max_depth': [None, 10, 20, 30],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4],
    'model__max_features': ['auto', 'sqrt', 'log2']
}

grid_search_rf = GridSearchCV(pipe, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2)
grid_search_rf.fit(X_train, y_train)

# Get the best model
best_rf_model = grid_search_rf.best_estimator_

# Predict on validation set
y_val_pred_rf = best_rf_model.predict(X_val)
val_mse_rf = mean_squared_error(y_val, y_val_pred_rf)
val_rmse_rf = np.sqrt(val_mse_rf)
val_r2_rf = r2_score(y_val, y_val_pred_rf)

# Predict on test set
test_df['SalePrice'] = best_rf_model.predict(X_test)

# Save predictions to a new CSV file
test_df[['Id', 'SalePrice']].to_csv('test_predictions_rf_corrected.csv', index=False)

# Print evaluation metrics
print(f"Validation RMSE for Random Forest: {val_rmse_rf}")
print(f"Validation R² for Random Forest: {val_r2_rf}")
print(f"Best Parameters for Random Forest: {grid_search_rf.best_params_}")

# Return the best parameters and evaluation metrics
val_rmse_rf, val_r2_rf, grid_search_rf.best_params_


Fitting 5 folds for each of 324 candidates, totalling 1620 fits
Validation RMSE for Random Forest: 31292.242082808014
Validation R² for Random Forest: 0.8723386218370293
Best Parameters for Random Forest: {'model__max_depth': 20, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 300}


(31292.242082808014,
 0.8723386218370293,
 {'model__max_depth': 20,
  'model__max_features': 'sqrt',
  'model__min_samples_leaf': 1,
  'model__min_samples_split': 2,
  'model__n_estimators': 300})

In [58]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Load the datasets
train_df = pd.read_csv('train_df.csv')
test_df = pd.read_csv('test_df.csv')

# Create new features
train_df['TotalSF'] = train_df['TotalBsmtSF'] + train_df['1stFlrSF'] + train_df['2ndFlrSF']
test_df['TotalSF'] = test_df['TotalBsmtSF'] + test_df['1stFlrSF'] + test_df['2ndFlrSF']

train_df['OverallQual_GrLivArea'] = train_df['OverallQual'] * train_df['GrLivArea']
test_df['OverallQual_GrLivArea'] = test_df['OverallQual'] * test_df['GrLivArea']

train_df['GrLivArea'] = np.log1p(train_df['GrLivArea'])
test_df['GrLivArea'] = np.log1p(test_df['GrLivArea'])

train_df['HouseAge'] = train_df['YrSold'] - train_df['YearBuilt']
test_df['HouseAge'] = test_df['YrSold'] - test_df['YearBuilt']

train_df['TotalPorchSF'] = (train_df['OpenPorchSF'] + train_df['3SsnPorch'] +
                            train_df['EnclosedPorch'] + train_df['ScreenPorch'] + train_df['WoodDeckSF'])
test_df['TotalPorchSF'] = (test_df['OpenPorchSF'] + test_df['3SsnPorch'] +
                           test_df['EnclosedPorch'] + test_df['ScreenPorch'] + test_df['WoodDeckSF'])

# Handle categorical variables
categorical_cols = train_df.select_dtypes(include=['object', 'category']).columns
train_df = pd.get_dummies(train_df, columns=categorical_cols)
test_df = pd.get_dummies(test_df, columns=categorical_cols)
train_df, test_df = train_df.align(test_df, join='left', axis=1, fill_value=0)

# Separate features and target variable from training data
#X = train_df.drop(['SalePrice', 'Id'], axis=1)
#y = train_df['SalePrice']
#X_test = test_df.drop(['Id'], axis=1)

X = train_df.drop(['SalePrice', 'Id'], axis=1)
y = train_df['SalePrice']
X_test = test_df.drop(['Id', 'SalePrice'], axis=1, errors='ignore')

# Split the training data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the model
gb_model = GradientBoostingRegressor(random_state=42)

# Create and evaluate the pipeline with grid search
pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('model', gb_model)
])

param_grid = {
    'model__n_estimators': [100, 200, 300],
    'model__learning_rate': [0.01, 0.1, 0.2],
    'model__max_depth': [3, 4, 5],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4],
    'model__max_features': ['auto', 'sqrt', 'log2']
}

grid_search_gb = GridSearchCV(pipe, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2)
grid_search_gb.fit(X_train, y_train)

# Get the best model
best_gb_model = grid_search_gb.best_estimator_

# Predict on validation set
y_val_pred_gb = best_gb_model.predict(X_val)
val_mse_gb = mean_squared_error(y_val, y_val_pred_gb)
val_rmse_gb = np.sqrt(val_mse_gb)
val_r2_gb = r2_score(y_val, y_val_pred_gb)

# Predict on test set
test_df['SalePrice'] = best_gb_model.predict(X_test)

# Save predictions to a new CSV file
test_df[['Id', 'SalePrice']].to_csv('test_predictions_gb.csv', index=False)

# Print evaluation metrics
print(f"Validation RMSE for Gradient Boosting: {val_rmse_gb}")
print(f"Validation R² for Gradient Boosting: {val_r2_gb}")
print(f"Best Parameters for Gradient Boosting: {grid_search_gb.best_params_}")

# Return the best parameters and evaluation metrics
val_rmse_gb, val_r2_gb, grid_search_gb.best_params_


Fitting 5 folds for each of 729 candidates, totalling 3645 fits
Validation RMSE for Gradient Boosting: 25302.03237578527
Validation R² for Gradient Boosting: 0.916536425563567
Best Parameters for Gradient Boosting: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 4, 'model__min_samples_split': 2, 'model__n_estimators': 300}


(25302.03237578527,
 0.916536425563567,
 {'model__learning_rate': 0.1,
  'model__max_depth': 3,
  'model__max_features': 'sqrt',
  'model__min_samples_leaf': 4,
  'model__min_samples_split': 2,
  'model__n_estimators': 300})

In [61]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Load the datasets
train_df = pd.read_csv('train_df.csv')
test_df = pd.read_csv('test_df.csv')

# Create new features
train_df['TotalSF'] = train_df['TotalBsmtSF'] + train_df['1stFlrSF'] + train_df['2ndFlrSF']
test_df['TotalSF'] = test_df['TotalBsmtSF'] + test_df['1stFlrSF'] + test_df['2ndFlrSF']

train_df['OverallQual_GrLivArea'] = train_df['OverallQual'] * train_df['GrLivArea']
test_df['OverallQual_GrLivArea'] = test_df['OverallQual'] * test_df['GrLivArea']

train_df['GrLivArea'] = np.log1p(train_df['GrLivArea'])
test_df['GrLivArea'] = np.log1p(test_df['GrLivArea'])

train_df['HouseAge'] = train_df['YrSold'] - train_df['YearBuilt']
test_df['HouseAge'] = test_df['YrSold'] - test_df['YearBuilt']

train_df['TotalPorchSF'] = (train_df['OpenPorchSF'] + train_df['3SsnPorch'] +
                            train_df['EnclosedPorch'] + train_df['ScreenPorch'] + train_df['WoodDeckSF'])
test_df['TotalPorchSF'] = (test_df['OpenPorchSF'] + test_df['3SsnPorch'] +
                           test_df['EnclosedPorch'] + test_df['ScreenPorch'] + test_df['WoodDeckSF'])

# Handle categorical variables
categorical_cols = train_df.select_dtypes(include=['object', 'category']).columns
train_df = pd.get_dummies(train_df, columns=categorical_cols)
test_df = pd.get_dummies(test_df, columns=categorical_cols)
train_df, test_df = train_df.align(test_df, join='left', axis=1, fill_value=0)

# Separate features and target variable from training data
#X = train_df.drop(['SalePrice', 'Id'], axis=1)
#y = train_df['SalePrice']
#X_test = test_df.drop(['Id'], axis=1)

X = train_df.drop(['SalePrice', 'Id'], axis=1)
y = train_df['SalePrice']
X_test = test_df.drop(['Id', 'SalePrice'], axis=1, errors='ignore')
# Split the training data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the models
rf_model = RandomForestRegressor(random_state=42)
gb_model = GradientBoostingRegressor(random_state=42)

# Create pipelines
pipe_rf = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('model', rf_model)
])

pipe_gb = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('model', gb_model)
])

# Define parameter grids
param_grid_rf = {
    'model__n_estimators': [100, 200],
    'model__max_depth': [10, 20],
    'model__min_samples_split': [2, 5],
    'model__min_samples_leaf': [1, 2]
}

param_grid_gb = {
    'model__n_estimators': [100, 200],
    'model__learning_rate': [0.01, 0.1],
    'model__max_depth': [3, 4],
    'model__min_samples_split': [2, 5],
    'model__min_samples_leaf': [1, 2]
}

# Perform grid search
grid_search_rf = GridSearchCV(pipe_rf, param_grid_rf, cv=5, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2)
grid_search_gb = GridSearchCV(pipe_gb, param_grid_gb, cv=5, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2)

grid_search_rf.fit(X_train, y_train)
grid_search_gb.fit(X_train, y_train)

# Get the best models
best_rf_model = grid_search_rf.best_estimator_
best_gb_model = grid_search_gb.best_estimator_

# Create a Voting Regressor with both models
voting_reg = VotingRegressor([('rf', best_rf_model), ('gb', best_gb_model)])
voting_reg.fit(X_train, y_train)

# Predict on validation set
y_val_pred_voting = voting_reg.predict(X_val)
val_mse_voting = mean_squared_error(y_val, y_val_pred_voting)
val_rmse_voting = np.sqrt(val_mse_voting)
val_r2_voting = r2_score(y_val, y_val_pred_voting)

# Predict on test set
test_df['SalePrice'] = voting_reg.predict(X_test)

# Save predictions to a new CSV file
test_df[['Id', 'SalePrice']].to_csv('test_predictions_voting.csv', index=False)

# Print evaluation metrics
print(f"Validation RMSE for Voting Regressor: {val_rmse_voting}")
print(f"Validation R² for Voting Regressor: {val_r2_voting}")

# Return the best parameters and evaluation metrics
val_rmse_voting, val_r2_voting, grid_search_rf.best_params_, grid_search_gb.best_params_


Fitting 5 folds for each of 16 candidates, totalling 80 fits
Fitting 5 folds for each of 32 candidates, totalling 160 fits
Validation RMSE for Voting Regressor: 27058.75672296335
Validation R² for Voting Regressor: 0.9045443054682


(27058.75672296335,
 0.9045443054682,
 {'model__max_depth': 20,
  'model__min_samples_leaf': 2,
  'model__min_samples_split': 2,
  'model__n_estimators': 200},
 {'model__learning_rate': 0.1,
  'model__max_depth': 3,
  'model__min_samples_leaf': 2,
  'model__min_samples_split': 5,
  'model__n_estimators': 200})