## Model Training House Prices

**1.1 Import Data and Required Packages**

In [1]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
# Modelling
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import warnings

In [2]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
# Selecting necessary features
selected_features = ['OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'GarageArea', '1stFlrSF', 'TotRmsAbvGrd', 'YearBuilt', 'SalePrice']
df_selected = df[selected_features]
df_selected.head()

Unnamed: 0,OverallQual,GrLivArea,GarageCars,TotalBsmtSF,GarageArea,1stFlrSF,TotRmsAbvGrd,YearBuilt,SalePrice
0,7,1710,2,856,548,856,8,2003,208500
1,6,1262,2,1262,460,1262,6,1976,181500
2,7,1786,2,920,608,920,6,2001,223500
3,7,1717,3,756,642,961,7,1915,140000
4,8,2198,3,1145,836,1145,9,2000,250000


In [4]:
X = df_selected.drop(columns = ['SalePrice'], axis = 1)
Y = df_selected['SalePrice']

In [5]:
# numerical_features = [feature for feature in df_selected.columns if df[feature].dtype != 'O']
# categorical_features = [feature for feature in df_selected.columns if df[feature].dtype == 'O']
numerical_features = X.columns.tolist()

In [6]:
# one hot encoding and standard scalar
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# numeric_transformer = StandardScaler()
# oh_transformer = OneHotEncoder()

# preprocessor = ColumnTransformer(
#     [
#         ("OneHotEncoder", oh_transformer, categorical_features),
#          ("StandardScaler", numeric_transformer, numerical_features),        
#     ]
# )

In [7]:
numeric_transformer = StandardScaler()

In [8]:
preprocessor = ColumnTransformer(
    transformers=[
        ('StandardScaler', numeric_transformer, numerical_features)
    ])

In [9]:
X_transformed = preprocessor.fit_transform(X)

In [10]:
X.shape

(1460, 8)

In [11]:
# seperate dataset into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape

((1168, 8), (292, 8))

**Create an Evaluate Function to give all metrics after model Training**

In [12]:
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [13]:
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "XGBRegressor": XGBRegressor(), 
    "CatBoosting Regressor": CatBoostRegressor(verbose=False),
    "AdaBoost Regressor": AdaBoostRegressor()
}

In [14]:
model_list = []
r2_list =[]

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Evaluate Train and Test dataset
    model_train_mae , model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)

    model_test_mae , model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    r2_list.append(model_test_r2)
    
    print('='*35)
    print('\n')

Linear Regression
Model performance for Training set
- Root Mean Squared Error: 37838.0349
- Mean Absolute Error: 24201.2195
- R2 Score: 0.7600
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 39610.1215
- Mean Absolute Error: 25028.1212
- R2 Score: 0.7955


Lasso
Model performance for Training set
- Root Mean Squared Error: 37838.0351
- Mean Absolute Error: 24200.7906
- R2 Score: 0.7600
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 39610.0365
- Mean Absolute Error: 25027.5388
- R2 Score: 0.7955


Ridge
Model performance for Training set
- Root Mean Squared Error: 37838.0492
- Mean Absolute Error: 24195.7346
- R2 Score: 0.7600
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 39608.9704
- Mean Absolute Error: 25022.3609
- R2 Score: 0.7955


K-Neighbors Regressor
Model performance for Training set
- Root Mean Squared Error: 33757.2543
- Mean Absolute Erro

In [24]:
# hyperparameter tuning
params = {
    "Decision Tree": {
        'criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
    },
    "Random Forest": {
        'n_estimators': [8, 16, 32, 64, 128, 256]
    },
    "Gradient Boosting": {
        'learning_rate': [0.1, 0.01, 0.05, 0.001],
        'subsample': [0.6, 0.7, 0.75, 0.8, 0.85, 0.9],
        'n_estimators': [8, 16, 32, 64, 128, 256]
    },
    "Linear Regression": {},
    "Lasso": {
        'alpha': [0.001, 0.01, 0.1, 1.0, 10.0]
    },
    "Ridge": {
        'alpha': [0.001, 0.01, 0.1, 1.0, 10.0]
    },
    "K Neighbors Regressor": {
        'n_neighbors': [5, 10, 15, 20],
        'weights': ['uniform', 'distance'],
        'p': [1, 2]  # 1 for Manhattan distance, 2 for Euclidean distance
    },
    "XGBRegressor": {
        'learning_rate': [0.1, 0.01, 0.05, 0.001],
        'n_estimators': [8, 16, 32, 64, 128, 256]
    },
    "CatBoosting Regressor": {
        'depth': [6, 8, 10],
        'learning_rate': [0.01, 0.05, 0.1],
        'iterations': [30, 50, 100]
    },
    "AdaBoost Regressor": {
        'learning_rate': [0.1, 0.01, 0.5, 0.001],
        'n_estimators': [8, 16, 32, 64, 128, 256]
    }
}

In [25]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import warnings
from sklearn.exceptions import ConvergenceWarning
import numpy as np

def evaluate_models(X_train, y_train, X_test, y_test, models, params):
    model_list = []
    train_metrics = []
    test_metrics = []
    
    # Filter out specific warnings
    warnings.filterwarnings("ignore", category=FutureWarning)
    warnings.filterwarnings("ignore", category=ConvergenceWarning)

    # Iterate over models and perform GridSearchCV with hyperparameter tuning
    for model_name, model in models.items():
        print(f"Evaluating {model_name}...")
        
        if model_name in params:
            param_grid = params[model_name]
        else:
            param_grid = {}

        if param_grid:
            grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='neg_root_mean_squared_error', cv=5)
            grid_search.fit(X_train, y_train)

            # Best parameters and best score
            best_params = grid_search.best_params_
            best_model = grid_search.best_estimator_
        else:
            model.fit(X_train, y_train)
            best_params = model.get_params()
            best_model = model

        # Fit the model with the best parameters
        best_model.fit(X_train, y_train)

        # Make predictions on training and test sets
        y_train_pred = best_model.predict(X_train)
        y_test_pred = best_model.predict(X_test)

        # Calculate evaluation metrics for training and test sets
        train_mae = mean_absolute_error(y_train, y_train_pred)
        train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
        train_r2 = r2_score(y_train, y_train_pred)
        
        test_mae = mean_absolute_error(y_test, y_test_pred)
        test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
        test_r2 = r2_score(y_test, y_test_pred)

        # Store model name and evaluation metrics
        model_list.append(model_name)
        train_metrics.append((train_mae, train_rmse, train_r2))
        test_metrics.append((test_mae, test_rmse, test_r2))

        # Print model performance
        print(f"Model: {model_name}")
        print(f"Best Parameters: {best_params}")
        print(f"Train MAE: {train_mae:.4f}, Train RMSE: {train_rmse:.4f}, Train R²: {train_r2:.4f}")
        print(f"Test MAE: {test_mae:.4f}, Test RMSE: {test_rmse:.4f}, Test R²: {test_r2:.4f}")
        print('=' * 35)
        print()

    # Create and return a report dictionary
    model_report = {
        'model_list': model_list,
        'train_metrics': train_metrics,
        'test_metrics': test_metrics
    }
    
    return model_report

# Evaluate models
model_report = evaluate_models(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test, models=models, params=params)

Evaluating Linear Regression...
Model: Linear Regression
Best Parameters: {'copy_X': True, 'fit_intercept': True, 'n_jobs': None, 'positive': False}
Train MAE: 24201.2195, Train RMSE: 37838.0349, Train R²: 0.7600
Test MAE: 25028.1212, Test RMSE: 39610.1215, Test R²: 0.7955

Evaluating Lasso...
Model: Lasso
Best Parameters: {'alpha': 10.0}
Train MAE: 24196.8141, Train RMSE: 37838.0485, Train R²: 0.7600
Test MAE: 25022.2654, Test RMSE: 39609.0936, Test R²: 0.7955

Evaluating Ridge...
Model: Ridge
Best Parameters: {'alpha': 10.0}
Train MAE: 24150.3353, Train RMSE: 37839.3367, Train R²: 0.7599
Test MAE: 24973.7462, Test RMSE: 39600.4592, Test R²: 0.7956

Evaluating K-Neighbors Regressor...
Model: K-Neighbors Regressor
Best Parameters: {'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 5, 'p': 2, 'weights': 'uniform'}
Train MAE: 21400.8834, Train RMSE: 33757.2543, Train R²: 0.8089
Test MAE: 27005.7699, Test RMSE: 43360.3716, T

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import AdaBoostRegressor

# Load train.csv and test.csv
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

# Selecting specific features for training and testing
selected_features = ['OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'GarageArea', 
                     '1stFlrSF', 'TotRmsAbvGrd', 'YearBuilt', 'SalePrice']

# Separate features and target from train.csv
X_train = df_train[selected_features].drop(columns=['SalePrice'], axis=1)
y_train = df_train['SalePrice']

# Separate features from test.csv
X_test = df_test[selected_features[:-1]]  # Adjust here if needed based on actual columns in test.csv

# Define numerical features
numerical_features = X_train.columns.tolist()

# Define transformer for numerical features with imputation
numeric_transformer = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='mean'), numerical_features)
    ])

# Apply preprocessing to train and test data
X_train_transformed = numeric_transformer.fit_transform(X_train)
X_test_transformed = numeric_transformer.transform(X_test)

# Define models
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "XGBRegressor": XGBRegressor(), 
    "CatBoosting Regressor": CatBoostRegressor(verbose=False),
    "AdaBoost Regressor": AdaBoostRegressor()
}

# Train models and predict on test data
for model_name, model in models.items():
    model.fit(X_train_transformed, y_train)
    y_test_pred = model.predict(X_test_transformed)
    
    # Create output DataFrame for each model
    output = pd.DataFrame({'Id': df_test['Id'], 'SalePrice': y_test_pred})
    
    # Save output to CSV
    output.to_csv(f'{model_name}_predictions.csv', index=False)

In [26]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

# Load train.csv and test.csv
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

# Selecting specific features for training and testing
selected_features = ['OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'GarageArea', 
                     '1stFlrSF', 'TotRmsAbvGrd', 'YearBuilt', 'SalePrice']

# Separate features and target from train.csv
X_train = df_train[selected_features].drop(columns=['SalePrice'], axis=1)
y_train = df_train['SalePrice']

# Separate features from test.csv
X_test = df_test[selected_features[:-1]]  # Adjust here if needed based on actual columns in test.csv

# Define numerical features
numerical_features = X_train.columns.tolist()

# Define transformer for numerical features with imputation
numeric_transformer = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='mean'), numerical_features)
    ])

# Apply preprocessing to train and test data
X_train_transformed = numeric_transformer.fit_transform(X_train)
X_test_transformed = numeric_transformer.transform(X_test)

# Define best parameters found for each model
best_params = {
    "Linear Regression": {'copy_X': True, 'fit_intercept': True, 'n_jobs': None, 'positive': False},
    "Lasso": {'alpha': 10.0},
    "Ridge": {'alpha': 10.0},
    "K-Neighbors Regressor": {'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 5, 'p': 2, 'weights': 'uniform'},
    "Decision Tree": {'criterion': 'squared_error'},
    "Random Forest Regressor": {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False},
    "XGBRegressor": {'learning_rate': 0.05, 'n_estimators': 128},
    "CatBoosting Regressor": {'depth': 10, 'iterations': 100, 'learning_rate': 0.1},
    "AdaBoost Regressor": {'learning_rate': 0.1, 'n_estimators': 256}
}

# Initialize models with best parameters
models = {
    "Linear Regression": LinearRegression(**best_params["Linear Regression"]),
    "Lasso": Lasso(**best_params["Lasso"]),
    "Ridge": Ridge(**best_params["Ridge"]),
    "K-Neighbors Regressor": KNeighborsRegressor(**best_params["K-Neighbors Regressor"]),
    "Decision Tree": DecisionTreeRegressor(**best_params["Decision Tree"]),
    "Random Forest Regressor": RandomForestRegressor(**best_params["Random Forest Regressor"]),
    "XGBRegressor": XGBRegressor(**best_params["XGBRegressor"]), 
    "CatBoosting Regressor": CatBoostRegressor(verbose=False, **best_params["CatBoosting Regressor"]),
    "AdaBoost Regressor": AdaBoostRegressor(**best_params["AdaBoost Regressor"])
}

# Train models and predict on test data
for model_name, model in models.items():
    model.fit(X_train_transformed, y_train)
    y_test_pred = model.predict(X_test_transformed)
    
    # Create output DataFrame for each model
    output = pd.DataFrame({'Id': df_test['Id'], 'SalePrice': y_test_pred})
    
    # Save output to CSV
    output.to_csv(f'{model_name}_predictions_hyperparameter_tuning.csv', index=False)

In [None]:
# Best score - 0.15914