In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error as MSE
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from sklearn.model_selection import GridSearchCV




In [2]:
df_train = pd.read_csv("train.csv")
df_train

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [3]:
df_train.shape

(1460, 81)

In [4]:
df_test = pd.read_csv("test.csv")
df_test.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [5]:
sample_submission = pd.read_csv("sample_submission.csv")
sample_submission.head()

Unnamed: 0,Id,SalePrice
0,1461,169277.052498
1,1462,187758.393989
2,1463,183583.68357
3,1464,179317.477511
4,1465,150730.079977


In [6]:
#df_train.columns

In [7]:
#df_test.columns

In [None]:
# Model 1: Random Forest

In [6]:
# Preprocessing data to make rf model

target_column = 'SalePrice'

# Extract features and target variable from the training set
X = df_train.drop(target_column, axis=1)
y = df_train[target_column]

# Identify categorical columns in the dataset
categorical_cols = X.select_dtypes(include=['object']).columns

# Identify quantative columns in the dataset
numerical_cols = X.select_dtypes(exclude=['object']).columns

# Split the data into training and testing sets- 80% is train, and 20% is test data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Create transformers for numerical and categorical columns
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Create a preprocessor to apply transformers to appropriate columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])



In [7]:
# Define the parameter grid
param_grid = {
    'regressor__n_estimators': [1, 5, 10, 25, 50],
    'regressor__max_depth': [None, 2, 5, 7, 10],
    'regressor__min_samples_split': [2, 4, 5, 7, 10],
    'regressor__min_samples_leaf': [1, 2, 3, 5, 8]
}

# Create a Random Forest regressor model
rf_model = RandomForestRegressor(random_state=12)

# Create a pipeline with the preprocessor and Random Forest model
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', rf_model)
])

# Instantiate GridSearchCV with the pipeline and parameter grid
grid_search = GridSearchCV(model,  # Use the entire pipeline here
                           param_grid=param_grid,  # Correct parameter name
                           cv=5, 
                           scoring='neg_mean_squared_error', 
                           n_jobs=-1)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Print the best parameters and best score found
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", -grid_search.best_score_)  # Take the negative of the score to get MSE


Best Parameters: {'regressor__max_depth': 10, 'regressor__min_samples_leaf': 1, 'regressor__min_samples_split': 4, 'regressor__n_estimators': 50}
Best Score: 945389671.9318911


In [8]:

# Get the best model from the grid search
best_model_RF = grid_search.best_estimator_

#best_model

# Make predictions on the test set using the best model
y_test_pred = best_model_RF.predict(X_test)

In [9]:
# Calculate RMSE using cross-validation on the training data
cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
cv_rmse = np.sqrt(-np.mean(cv_scores))

print("RMSE using cross-validation on training data:", cv_rmse)



# Make predictions on the training set
y_train_pred = best_model_RF.predict(X_train)

# Calculate RMSE on the entire training set
train_rmse = np.sqrt(MSE(y_train, y_train_pred))

print("\nRMSE on the entire training set:", train_rmse)

RMSE using cross-validation on training data: 31026.438022257378

RMSE on the entire training set: 13861.158264096135


In [19]:
# get the RMSE value for rf model

# Calculate Mean Squared Error
mse = mean_squared_error(y_test, y_test_pred)

# Calculate Root Mean Squared Error
rmse = np.sqrt(mse)

print(f'Root Mean Squared Error (RMSE): ${rmse:.2f}')

# the final model RF model works pretty good, because the RMSE is quite lower than CV's RMSE.


Root Mean Squared Error (RMSE): $28281.04


In [10]:
RF_pred = best_model_RF.predict(df_test)
RF_pred

array([127490.28962418, 151242.83652503, 182568.33700651, ...,
       153252.39607066, 121323.17295333, 227060.81088162])

In [11]:
# Create a DataFrame with 'id' column from df_test and predictions column
RF_submission = pd.DataFrame({
    'id': df_test['Id'],  # Replace 'id' with the actual column name from df_test
    'SalePrice': RF_pred
})

# Display the resulting DataFrame
(RF_submission)

Unnamed: 0,id,SalePrice
0,1461,127490.289624
1,1462,151242.836525
2,1463,182568.337007
3,1464,184013.992390
4,1465,206827.306708
...,...,...
1454,2915,83229.212303
1455,2916,85837.197817
1456,2917,153252.396071
1457,2918,121323.172953


In [12]:
# Save the DataFrame to a CSV file
RF_submission.to_csv('RF_submission.csv', index=False, header=['Id', 'SalePrice'])

# Display a message indicating the file has been saved
print("Submission file saved as 'RF_submission.csv'")

Submission file saved as 'RF_submission.csv'


In [None]:
# Model 2- Elastic Reg

In [13]:
import pandas as pd
import warnings
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
from sklearn.pipeline import make_pipeline



In [14]:
#Elastic Reg

# Pre processing the data

# Assuming df_train is your training dataset
# Replace 'SalePrice' with the actual name of your target variable column
target_column = 'SalePrice'

# Extract features and target variable from the dataset
X = df_train.drop(target_column, axis=1)
y = df_train[target_column]

# Split the data into training and testing sets- 80% train and 20% test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify categorical columns in the dataset
categorical_cols = X_train.select_dtypes(include=['object']).columns

# Identify quantitative columns in the dataset
numerical_cols = X_train.select_dtypes(exclude=['object']).columns

# Create transformers for numerical and categorical columns
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Create a preprocessor to apply transformers to appropriate columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Create an Elastic Net regression model
elastic_net_model = ElasticNet(alpha=1.0, l1_ratio=0.5, random_state=2)

# You can adjust the alpha and l1_ratio parameters based on your needs

# Create a pipeline with the preprocessor and Elastic Net model
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', elastic_net_model)
])

# Train the model on the training set
model.fit(X_train, y_train)

# Make predictions on the training set
y_train_pred = model.predict(X_train)

# Make predictions on the test set
y_test_pred = model.predict(X_test)

# Evaluate performance on training set
mse_train = mean_squared_error(y_train, y_train_pred)
rmse_train = np.sqrt(mse_train)
r2_train = r2_score(y_train, y_train_pred)

# Evaluate performance on test set
mse_test = mean_squared_error(y_test, y_test_pred)
rmse_test = np.sqrt(mse_test)
r2_test = r2_score(y_test, y_test_pred)

# Reset warnings to default
warnings.resetwarnings()

print(f"Training Set Metrics:")
print(f"  RMSE: {rmse_train}")
print(f"  R-squared: {r2_train}")

print(f"\nTest Set Metrics:")
print(f"  RMSE: {rmse_test}")
print(f"  R-squared: {r2_test}")




Training Set Metrics:
  RMSE: 32360.736110918246
  R-squared: 0.8244264571572313

Test Set Metrics:
  RMSE: 36236.382686460376
  R-squared: 0.8288110055907076


In [15]:
model_elastic = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', elastic_net_model)
])

# model_elastic.get_params()

In [16]:
# using CV

import warnings
from sklearn.exceptions import ConvergenceWarning

# Suppress warnings from the ElasticNet model
with warnings.catch_warnings():
    warnings.filterwarnings('ignore', category=ConvergenceWarning)


# Create an Elastic Net regression model
elastic_net_model = ElasticNet()

# Create a pipeline with the preprocessor and Elastic Net model
model_elastic = make_pipeline(preprocessor, elastic_net_model)

# Define the parameter grid
param_grid = {
    'elasticnet__alpha': [0.01, 0.1, 1.0, 10.0],
    'elasticnet__l1_ratio': [0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1.0],
    'elasticnet__max_iter': [1000, 2000, 5000]
}

# Instantiate GridSearchCV with the pipeline and parameter grid
elastic_grid_search = GridSearchCV(estimator=model_elastic, 
                                   param_grid=param_grid, 
                                   cv=5, 
                                   scoring='neg_mean_squared_error', 
                                   n_jobs=-1,
                                   verbose=2)

# Fit the grid search to the data
elastic_grid_search.fit(X_train, y_train)

# Print the best parameters and best score found
#print("Best Parameters:", elastic_grid_search.best_params_)
#print("Best Score:", -elastic_grid_search.best_score_)  # Take the negative of the score to get MSE

# Optionally, you can evaluate the model on the test set
y_test_pred = elastic_grid_search.best_estimator_.predict(X_test)
mse_test = mean_squared_error(y_test, y_test_pred)
print(f'Mean Squared Error (Test Set): {mse_test}')


Fitting 5 folds for each of 84 candidates, totalling 420 fits


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])  # type: ignore[arg-type]
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs

Mean Squared Error (Test Set): 941093942.1128374


See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])  # type: ignore[arg-type]
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])  # type: ignore[arg-type]


In [17]:
# best estimators using the CV method

elastic_grid_search.best_params_


{'elasticnet__alpha': 1.0,
 'elasticnet__l1_ratio': 0.99,
 'elasticnet__max_iter': 1000}

In [35]:
y_train_pred = elastic_grid_search.best_estimator_.predict(X_train)
rmse_train = mean_squared_error(y_train, y_train_pred)**(1/2)
print(f'RMSE (Train Set): ${rmse_train:.2f}')

rmse_test = mean_squared_error(y_test, y_test_pred)**(1/2)
print(f'RMSE (Test Set): ${rmse_test:.2f}')

# Ignore ConvergenceWarnings to prevent them from being displayed
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# Your code with the Elastic Net model training goes here

# Reset warnings to default if needed
warnings.resetwarnings()

RMSE (Train Set): $24591.86
RMSE (Test Set): $30677.25


See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])  # type: ignore[arg-type]
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])  # type: ignore[arg-type]


[CV] END elasticnet__alpha=0.01, elasticnet__l1_ratio=0.1, elasticnet__max_iter=2000; total time=   0.6s
[CV] END elasticnet__alpha=0.01, elasticnet__l1_ratio=0.1, elasticnet__max_iter=5000; total time=   0.6s
[CV] END elasticnet__alpha=0.01, elasticnet__l1_ratio=0.5, elasticnet__max_iter=2000; total time=   0.7s
[CV] END elasticnet__alpha=0.01, elasticnet__l1_ratio=0.5, elasticnet__max_iter=5000; total time=   0.7s
[CV] END elasticnet__alpha=0.01, elasticnet__l1_ratio=0.7, elasticnet__max_iter=1000; total time=   0.8s
[CV] END elasticnet__alpha=0.01, elasticnet__l1_ratio=0.7, elasticnet__max_iter=5000; total time=   1.3s
[CV] END elasticnet__alpha=0.01, elasticnet__l1_ratio=0.9, elasticnet__max_iter=2000; total time=   1.6s
[CV] END elasticnet__alpha=0.01, elasticnet__l1_ratio=0.95, elasticnet__max_iter=1000; total time=   0.8s
[CV] END elasticnet__alpha=0.01, elasticnet__l1_ratio=0.95, elasticnet__max_iter=1000; total time=   0.8s
[CV] END elasticnet__alpha=0.01, elasticnet__l1_ratio

[CV] END elasticnet__alpha=0.01, elasticnet__l1_ratio=0.1, elasticnet__max_iter=1000; total time=   0.6s
[CV] END elasticnet__alpha=0.01, elasticnet__l1_ratio=0.5, elasticnet__max_iter=1000; total time=   0.8s
[CV] END elasticnet__alpha=0.01, elasticnet__l1_ratio=0.5, elasticnet__max_iter=2000; total time=   0.7s
[CV] END elasticnet__alpha=0.01, elasticnet__l1_ratio=0.7, elasticnet__max_iter=1000; total time=   0.8s
[CV] END elasticnet__alpha=0.01, elasticnet__l1_ratio=0.7, elasticnet__max_iter=2000; total time=   1.0s
[CV] END elasticnet__alpha=0.01, elasticnet__l1_ratio=0.9, elasticnet__max_iter=1000; total time=   0.8s
[CV] END elasticnet__alpha=0.01, elasticnet__l1_ratio=0.9, elasticnet__max_iter=2000; total time=   1.5s
[CV] END elasticnet__alpha=0.01, elasticnet__l1_ratio=0.9, elasticnet__max_iter=5000; total time=   2.9s
[CV] END elasticnet__alpha=0.01, elasticnet__l1_ratio=0.95, elasticnet__max_iter=5000; total time=   4.0s
[CV] END elasticnet__alpha=0.01, elasticnet__l1_ratio=

[CV] END elasticnet__alpha=0.01, elasticnet__l1_ratio=0.1, elasticnet__max_iter=1000; total time=   0.5s
[CV] END elasticnet__alpha=0.01, elasticnet__l1_ratio=0.1, elasticnet__max_iter=2000; total time=   0.6s
[CV] END elasticnet__alpha=0.01, elasticnet__l1_ratio=0.5, elasticnet__max_iter=1000; total time=   0.8s
[CV] END elasticnet__alpha=0.01, elasticnet__l1_ratio=0.5, elasticnet__max_iter=5000; total time=   0.8s
[CV] END elasticnet__alpha=0.01, elasticnet__l1_ratio=0.7, elasticnet__max_iter=2000; total time=   1.1s
[CV] END elasticnet__alpha=0.01, elasticnet__l1_ratio=0.7, elasticnet__max_iter=5000; total time=   1.1s
[CV] END elasticnet__alpha=0.01, elasticnet__l1_ratio=0.9, elasticnet__max_iter=5000; total time=   2.6s
[CV] END elasticnet__alpha=0.01, elasticnet__l1_ratio=0.95, elasticnet__max_iter=2000; total time=   1.6s
[CV] END elasticnet__alpha=0.01, elasticnet__l1_ratio=0.95, elasticnet__max_iter=5000; total time=   4.0s
[CV] END elasticnet__alpha=0.01, elasticnet__l1_ratio

[CV] END elasticnet__alpha=0.01, elasticnet__l1_ratio=0.1, elasticnet__max_iter=1000; total time=   0.6s
[CV] END elasticnet__alpha=0.01, elasticnet__l1_ratio=0.1, elasticnet__max_iter=5000; total time=   0.6s
[CV] END elasticnet__alpha=0.01, elasticnet__l1_ratio=0.5, elasticnet__max_iter=2000; total time=   0.8s
[CV] END elasticnet__alpha=0.01, elasticnet__l1_ratio=0.5, elasticnet__max_iter=5000; total time=   0.8s
[CV] END elasticnet__alpha=0.01, elasticnet__l1_ratio=0.7, elasticnet__max_iter=2000; total time=   1.4s
[CV] END elasticnet__alpha=0.01, elasticnet__l1_ratio=0.9, elasticnet__max_iter=1000; total time=   0.8s
[CV] END elasticnet__alpha=0.01, elasticnet__l1_ratio=0.9, elasticnet__max_iter=2000; total time=   1.6s
[CV] END elasticnet__alpha=0.01, elasticnet__l1_ratio=0.95, elasticnet__max_iter=1000; total time=   0.9s
[CV] END elasticnet__alpha=0.01, elasticnet__l1_ratio=0.95, elasticnet__max_iter=2000; total time=   1.6s
[CV] END elasticnet__alpha=0.01, elasticnet__l1_ratio

[CV] END elasticnet__alpha=0.01, elasticnet__l1_ratio=0.1, elasticnet__max_iter=1000; total time=   0.5s
[CV] END elasticnet__alpha=0.01, elasticnet__l1_ratio=0.1, elasticnet__max_iter=2000; total time=   0.6s
[CV] END elasticnet__alpha=0.01, elasticnet__l1_ratio=0.5, elasticnet__max_iter=1000; total time=   0.7s
[CV] END elasticnet__alpha=0.01, elasticnet__l1_ratio=0.5, elasticnet__max_iter=2000; total time=   0.7s
[CV] END elasticnet__alpha=0.01, elasticnet__l1_ratio=0.7, elasticnet__max_iter=1000; total time=   0.8s
[CV] END elasticnet__alpha=0.01, elasticnet__l1_ratio=0.7, elasticnet__max_iter=5000; total time=   1.0s
[CV] END elasticnet__alpha=0.01, elasticnet__l1_ratio=0.9, elasticnet__max_iter=1000; total time=   0.8s
[CV] END elasticnet__alpha=0.01, elasticnet__l1_ratio=0.9, elasticnet__max_iter=5000; total time=   3.5s
[CV] END elasticnet__alpha=0.01, elasticnet__l1_ratio=0.95, elasticnet__max_iter=2000; total time=   1.6s
[CV] END elasticnet__alpha=0.01, elasticnet__l1_ratio=

In [None]:
# Model 3: Gradient Boosting Machine

In [18]:
from sklearn.ensemble import GradientBoostingRegressor

In [19]:
# Define the parameter grid
param_grid = {
    'regressor__n_estimators': [50, 100, 150],
    'regressor__learning_rate': [0.01, 0.1, 0.2],
    'regressor__max_depth': [3, 4, 5],
    'regressor__min_samples_split': [2, 5, 10],
    'regressor__min_samples_leaf': [1, 2, 4]
}

# Create the Gradient Boosting Regressor model
gb_model = GradientBoostingRegressor(random_state=1)

# Create a pipeline with the preprocessor and Gradient Boosting model
gbm_model_pre = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', gb_model)
])

# Instantiate GridSearchCV with the pipeline and parameter grid
grid_search_model = GridSearchCV(estimator = gbm_model_pre, 
                           param_grid = param_grid, 
                           cv=5, 
                           scoring='neg_mean_squared_error', 
                           n_jobs=-1)

# Fit the grid search to the data
grid_search_model.fit(X_train, y_train)

# Print the best parameters found
print("Best Hyperparameters:", grid_search_model.best_params_)

# Get the best model
best_gbm_model = grid_search_model.best_estimator_



See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])  # type: ignore[arg-type]
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])  # type: ignore[arg-type]
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])  # type: ignore[arg-type]
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])  # type: ignore[arg-type]


Best Hyperparameters: {'regressor__learning_rate': 0.1, 'regressor__max_depth': 4, 'regressor__min_samples_leaf': 2, 'regressor__min_samples_split': 10, 'regressor__n_estimators': 150}


In [20]:
# Make predictions on the test set using the best model
y_test_pred_best = best_gbm_model.predict(X_test)

# Evaluate the best model
mse_best = MSE(y_test, y_test_pred_best)
rmse_best = np.sqrt(mse_best)

print(f'Generalization error- RMSE: {rmse_best}')

Generalization error- RMSE: 27090.982863677506


See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])  # type: ignore[arg-type]
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])  # type: ignore[arg-type]


In [21]:
# Make predictions on the test set using the best model
y_train_pred_best = best_gbm_model.predict(X_train)

# Evaluate the best model performance on training data
mse_train = MSE(y_train, y_train_pred_best)
rmse_train = np.sqrt(mse_train)

print(f'Error on Training data- RMSE: ${rmse_train:.2f}')


# Evaluate the best model
y_test_pred_best = best_gbm_model.predict(X_test)

mse_test = MSE(y_test, y_test_pred_best)
rmse_test = np.sqrt(mse_test)

print(f'Error on Test data- RMSE: ${rmse_test:.2f}')



Error on Training data- RMSE: $8182.11
Error on Test data- RMSE: $27090.98


See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])  # type: ignore[arg-type]
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])  # type: ignore[arg-type]
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])  # type: ignore[arg-type]
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])  # type: ignore[arg-type]


In [26]:
# trying RandomSearch to fine tune GBM's hyper parameters

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Define the parameter grid for Randomized Search
param_dist = {
    'regressor__n_estimators': randint(50, 200),
    'regressor__learning_rate': [0.01, 0.1, 0.2, 0.5],
    'regressor__max_depth': randint(3, 10),
    'regressor__min_samples_split': randint(2, 20),
    'regressor__min_samples_leaf': randint(1, 10)
}

# Instantiate RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=gbm_model_pre,
    param_distributions=param_dist,
    scoring='neg_mean_squared_error',
    n_iter=10,  # Number of parameter settings that are sampled
    cv=5,
    random_state=1,
    n_jobs=-1
)

# Fit the RandomizedSearchCV to the data
random_search.fit(X_train, y_train)

# Print the best parameters found
print("Best Hyperparameters:", random_search.best_params_)

# Get the best model
best_gbm_model_randomsearch = random_search.best_estimator_

# Optionally, you can use the best model for predictions and evaluation
y_test_pred = best_gbm_model_randomsearch.predict(X_test)


See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])  # type: ignore[arg-type]
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])  # type: ignore[arg-type]
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])  # type: ignore[arg-type]
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])  # type: ignore[arg-type]


Best Hyperparameters: {'regressor__learning_rate': 0.1, 'regressor__max_depth': 6, 'regressor__min_samples_leaf': 9, 'regressor__min_samples_split': 11, 'regressor__n_estimators': 183}


See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])  # type: ignore[arg-type]
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])  # type: ignore[arg-type]


In [30]:
# Make predictions on the test set using the best model
y_train_pred_best = best_gbm_model_randomsearch.predict(X_train)

# Evaluate the best model performance on training data
mse_train = MSE(y_train, y_train_pred_best)
rmse_train = np.sqrt(mse_train)

print(f'Error on Training data- RMSE: ${rmse_train:.2f}')


# Evaluate the best model
y_test_pred_best = best_gbm_model_randomsearch.predict(X_test)

mse_test = MSE(y_test, y_test_pred_best)
rmse_test = np.sqrt(mse_test)

print(f'Error on Test data- RMSE: ${rmse_test:.2f}')


Error on Training data- RMSE: $5429.32
Error on Test data- RMSE: $27105.32


See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])  # type: ignore[arg-type]
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])  # type: ignore[arg-type]
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])  # type: ignore[arg-type]
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])  # type: ignore[arg-type]


In [32]:
# trying Bayesian Optimization for fine-tuning



In [None]:
# Model 4: Nerual Nets

In [None]:
# Model 5: Support Vector Machines