In [1]:
import pandas as pd

In [2]:
# reading the csv file
url = "https://drive.google.com/file/d/1YxeVDZHfDhqWb0VOn-lfxnDKoLOayJeD/view?usp=drive_link" # > Data from the iteration 5
path = 'https://drive.google.com/uc?export=download&id='+url.split('/')[-2]
housing = pd.read_csv(path)

In [3]:
housing

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,Expensive,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,8450,65.0,856,3,0,0,2,0,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
1,9600,80.0,1262,3,1,0,2,298,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
2,11250,68.0,920,3,1,0,2,0,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
3,9550,60.0,756,3,1,0,3,0,0,0,...,Detchd,Unf,TA,TA,Y,,,,WD,Abnorml
4,14260,84.0,1145,4,1,0,3,192,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,7917,62.0,953,3,1,0,2,0,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
1456,13175,85.0,1542,3,2,0,2,349,0,0,...,Attchd,Unf,TA,TA,Y,,MnPrv,,WD,Normal
1457,9042,66.0,1152,4,2,0,1,0,0,1,...,Attchd,RFn,TA,TA,Y,,GdPrv,Shed,WD,Normal
1458,9717,68.0,1078,2,0,0,1,366,0,0,...,Attchd,Unf,TA,TA,Y,,,,WD,Normal


In [4]:
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   LotArea        1460 non-null   int64  
 1   LotFrontage    1201 non-null   float64
 2   TotalBsmtSF    1460 non-null   int64  
 3   BedroomAbvGr   1460 non-null   int64  
 4   Fireplaces     1460 non-null   int64  
 5   PoolArea       1460 non-null   int64  
 6   GarageCars     1460 non-null   int64  
 7   WoodDeckSF     1460 non-null   int64  
 8   ScreenPorch    1460 non-null   int64  
 9   Expensive      1460 non-null   int64  
 10  MSZoning       1460 non-null   object 
 11  Condition1     1460 non-null   object 
 12  Heating        1460 non-null   object 
 13  Street         1460 non-null   object 
 14  CentralAir     1460 non-null   object 
 15  Foundation     1460 non-null   object 
 16  ExterQual      1460 non-null   object 
 17  ExterCond      1460 non-null   object 
 18  BsmtQual

## Creating the `full_pipeline` (`preprocessor` + Decision Tree)

#### Spliting the Data into Features and Target

In [102]:
X = housing.drop(columns=['Id', 'Expensive']) 
y = housing['Expensive']  


####  Spliting Data into Training and Test Sets

In [103]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#### Numerical and Categorical Columns

In [104]:
X_num_columns = X.select_dtypes(include="number").columns  # Numerical columns
X_cat_columns = X.select_dtypes(exclude="number").columns  # Categorical columns

####  Numerical Pipeline

In [105]:
from sklearn.impute import SimpleImputer

numeric_pipe = SimpleImputer(strategy="median")  # Use median to fill missing values in numerical columns

#### Nominal Features (One-Hot Encoding)

In [106]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline

nominal_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="N_A"),  # Fill missing values with 'N_A'
    OneHotEncoder(handle_unknown='infrequent_if_exist', sparse_output=False)  # One-Hot Encoding for nominal features
)

#### Ordinal Features (Ordinal Encoding)

In [107]:
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline

# Define the rankings of each ordinal feature
ordinal_rankings = [
    ['Po', 'Fa', 'TA', 'Gd', 'Ex'],  # ExterQual
    ['Po', 'Fa', 'TA', 'Gd', 'Ex'],  # ExterCond
    ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],  # BsmtQual
    ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],  # BsmtCond
    ['Po', 'Fa', 'TA', 'Gd', 'Ex'],  # KitchenQual
    ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex']   # FireplaceQu
]

In [108]:
# Define the ordinal pipeline
ordinal_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="NA"),  # Handle missing values with 'NA'
    OrdinalEncoder(categories=ordinal_rankings, handle_unknown='use_encoded_value', unknown_value=-1)  # Apply encoding
)

#### Combining the Pipelines Using ColumnTransformer

In [109]:
from sklearn.compose import ColumnTransformer

# Define ordinal and nominal columns
ordinal_cols = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'KitchenQual', 'FireplaceQu']  # Ordinal columns
nominal_cols = ['MSZoning', 'Condition1', 'Heating', 'Street', 'CentralAir', 'Foundation']  # Nominal columns

# Create the preprocessor with ordinal, nominal, and numerical pipelines
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipe, X_num_columns),  # Handle numerical columns
        ('ordinal', ordinal_pipe, ordinal_cols),  # Handle ordinal columns with custom ranking (Ex first)
        ('nominal', nominal_pipe, nominal_cols)  # Handle nominal columns with One-Hot Encoding
    ]
)

#### Build and Train the Decision Tree Model

In [110]:
from sklearn.tree import DecisionTreeClassifier
d_tree = make_pipeline(
    preprocessor, 
    DecisionTreeClassifier() 
)
d_tree.fit(X_train, y_train)

#### NA is treated as a valid category for features like BsmtQual and FireplaceQu, indicating “No Basement” or “No Fireplace.”
#### •	The ordinal rankings now include NA as the lowest rank for these features.
#### •	Using OrdinalEncoder to directly encode NA as part of the ranking system.

In [111]:
# Extract the ordinal encoder from the pipeline
ordinal_encoder = d_tree.named_steps['columntransformer'].transformers_[1][1].named_steps['ordinalencoder']

for col, categories in zip(ordinal_cols, ordinal_encoder.categories_):
    print(f"Ranking for {col}:")
    for idx, category in enumerate(categories):
        print(f"{category}: {idx}")
    print() 

Ranking for ExterQual:
Po: 0
Fa: 1
TA: 2
Gd: 3
Ex: 4

Ranking for ExterCond:
Po: 0
Fa: 1
TA: 2
Gd: 3
Ex: 4

Ranking for BsmtQual:
NA: 0
Po: 1
Fa: 2
TA: 3
Gd: 4
Ex: 5

Ranking for BsmtCond:
NA: 0
Po: 1
Fa: 2
TA: 3
Gd: 4
Ex: 5

Ranking for KitchenQual:
Po: 0
Fa: 1
TA: 2
Gd: 3
Ex: 4

Ranking for FireplaceQu:
NA: 0
Po: 1
Fa: 2
TA: 3
Gd: 4
Ex: 5



In [112]:
import numpy as np
np.set_printoptions(threshold=np.inf)

print(d_tree.predict(X_train))

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 1 0 0 0 0 0 1 0
 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 1 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0
 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0
 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 1
 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1
 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0
 1 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 1 1 1 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0
 0 1 0 0 0 0 0 0 0 0 1 0 

####  Making Predictions

In [113]:
# Make predictions on the training set
y_train_pred = d_tree.predict(X_train)

# Make predictions on the test set
y_test_pred = d_tree.predict(X_test)

In [114]:
from sklearn.metrics import accuracy_score

# Calculating accuracy on the training set
train_accuracy = accuracy_score(y_train, y_train_pred)
print("Training Accuracy:", train_accuracy)

# Calculating accuracy on the test set
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Test Accuracy:", test_accuracy)

Training Accuracy: 1.0
Test Accuracy: 0.9315068493150684


### Summary

#### The training accuracy is 100%, which means the model learned perfectly from the data it was trained on. It correctly predicted whether each house was expensive or not based on the training data.
#### •	The test accuracy is 93%, which means when the model was given new, unseen data, it correctly predicted the house classification (expensive or not) about 93% of the time.

#### This is a good result, though we should keep an eye on the 100% training accuracy as it might indicate the model has memorized the training data rather than generalizing well. which may result to OVERFITTING

### Using GridSearchCV (cross validation) to find the best parameters for the decision tree to avoid overfitting and achieve a good balance between training and test accuracy.

In [115]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

param_grid_dtree = {
    "columntransformer__num__strategy": ["mean", "median"],  # Imputer strategy for numeric columns
    "decisiontreeclassifier__max_depth": range(2, 14, 2),  # Max depth of the decision tree
    "decisiontreeclassifier__min_samples_leaf": range(3, 12, 2)  # Min samples per leaf
}

grid_search_dtree = GridSearchCV(
    d_tree,  
    param_grid_dtree,  
    cv=5,  # 5-fold cross-validation
    scoring='accuracy',  
    verbose=1  
)
grid_search_dtree.fit(X_train, y_train)

best_d_tree = grid_search_dtree.best_estimator_

# Make predictions on the training set with the best model
y_train_pred_dtree = best_d_tree.predict(X_train)

# Make predictions on the test set with the best model
y_test_pred_dtree = best_d_tree.predict(X_test)

# Calculate training accuracy
train_accuracy_dtree = accuracy_score(y_train, y_train_pred_dtree)
print("Training Accuracy with Best Decision Tree Model:", train_accuracy_dtree)

# Calculate test accuracy
test_accuracy_dtree = accuracy_score(y_test, y_test_pred_dtree)
print("Test Accuracy with Best Decision Tree Model:", test_accuracy_dtree)

print("Best Hyperparameters for Decision Tree:", grid_search_dtree.best_params_)
print("Best Cross-Validation Accuracy for Decision Tree:", grid_search_dtree.best_score_)

results_dtree = {
    'Best Hyperparameters': grid_search_dtree.best_params_,
    'Best Cross-Validation Accuracy': grid_search_dtree.best_score_,
    'Training Accuracy': train_accuracy_dtree,
    'Test Accuracy': test_accuracy_dtree
}

print("\nStored Results for Decision Tree:")
print(results_dtree)

Fitting 5 folds for each of 60 candidates, totalling 300 fits
Training Accuracy with Best Decision Tree Model: 0.958904109589041
Test Accuracy with Best Decision Tree Model: 0.9452054794520548
Best Hyperparameters for Decision Tree: {'columntransformer__num__strategy': 'mean', 'decisiontreeclassifier__max_depth': 4, 'decisiontreeclassifier__min_samples_leaf': 3}
Best Cross-Validation Accuracy for Decision Tree: 0.9340743186236748

Stored Results for Decision Tree:
{'Best Hyperparameters': {'columntransformer__num__strategy': 'mean', 'decisiontreeclassifier__max_depth': 4, 'decisiontreeclassifier__min_samples_leaf': 3}, 'Best Cross-Validation Accuracy': 0.9340743186236748, 'Training Accuracy': 0.958904109589041, 'Test Accuracy': 0.9452054794520548}


In [117]:
results_dtree

{'Best Hyperparameters': {'columntransformer__num__strategy': 'mean',
  'decisiontreeclassifier__max_depth': 4,
  'decisiontreeclassifier__min_samples_leaf': 3},
 'Best Cross-Validation Accuracy': 0.9340743186236748,
 'Training Accuracy': 0.958904109589041,
 'Test Accuracy': 0.9452054794520548}

### Summary for DTree

#### 	•	The model went through different settings (or “hyperparameters”) and found that the best combination is using an average (median) for missing values and setting certain limits for how the decision tree grows.
#### •	During testing, the model was able to correctly predict 93% of the time on unseen data through a process called cross-validation.
#### •	On the training data, where the model was trained, it achieved 95% accuracy.
#### •	On the new test data, the model achieved 94% accuracy. Model is performing well and is generalizing to new data, not just memorizing the training data.

## Creating `pipeline` for ( Logistic Regression)

In [118]:
from sklearn.linear_model import LogisticRegression

In [119]:
# Create a pipeline with a preprocessor and a logistic regression model
log_reg_pipe = make_pipeline(
    preprocessor,
    LogisticRegression(max_iter=1000) 
)

####  Defining a parameter grid for Logistic Regression

In [120]:
param_grid_logreg = {
    'columntransformer__num__strategy': ["mean", "median"],  
    'logisticregression__C': [0.01, 0.1, 1, 10, 100]  # Regularization par
}

In [None]:
grid_search_logreg = GridSearchCV(
    log_reg_pipe,  # Pipeline for Logistic Regression
    param_grid_logreg,  
    cv=5,  
    verbose=1  # Progress messages
)

grid_search_logreg.fit(X_train, y_train)

best_log_reg = grid_search_logreg.best_estimator_

train_acc_logreg = accuracy_score(y_train, best_log_reg.predict(X_train))
test_acc_logreg = accuracy_score(y_test, best_log_reg.predict(X_test))

results_logreg = {
    'Best Hyperparameters': grid_search_logreg.best_params_,
    'Best Cross-Validation Accuracy': grid_search_logreg.best_score_,
    'Training Accuracy': train_acc_logreg,
    'Test Accuracy': test_acc_logreg
}

results_logreg

In [122]:
results_logreg

{'Best Hyperparameters': {'columntransformer__num__strategy': 'mean',
  'logisticregression__C': 0.1},
 'Best Cross-Validation Accuracy': 0.943487032757419,
 'Training Accuracy': 0.9486301369863014,
 'Test Accuracy': 0.9417808219178082}

### Summary for Logistic Regression

#### Tested the Logistic Regression model to see how well it predicts whether a house is expensive or not based on certain features. After trying different settings, I found the best combination:

#### •	Best Settings: Using the “mean” to fill in missing numbers and a certain regularization strength (C = 1) in the model.
#### •	Performance: The model correctly predicted the outcome around 94% of the time on the training data, and 94% of the time on the unseen test data.
#### •	Reliability: During the testing process (cross-validation), the model consistently performed well, with an accuracy of about 94%.

#### This shows that the Logistic Regression model is performing quite well and is reliable for predictions.

## Creating `pipeline` for ( Support Vector Machine (SVM))

#### Defining the SVM Pipeline

In [26]:
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline

svm_pipe = make_pipeline(
    preprocessor,  
    SVC()  
)

####  Parameter Grid for GridSearchCV

In [29]:
# Parameter grid for SVM
param_grid_svm = {
    "columntransformer__num__strategy": ["mean", "median"],  
    "svc__C": [0.1, 1, 10],
    "svc__kernel": ["linear"]
}

In [None]:
from sklearn.model_selection import RandomizedSearchCV

random_search_svm = RandomizedSearchCV(
    svm_pipe,  
    param_distributions=param_grid_svm, 
    cv=3,  
    verbose=1, 
    n_jobs=-1,  # Use all available CPUs
    n_iter=10,  # 10 random combinations of hyperparameters
    random_state=42  # Ensure reproducibility
)

random_search_svm.fit(X_train, y_train)

best_svm = random_search_svm.best_estimator_

train_accuracy_svm = best_svm.score(X_train, y_train)
test_accuracy_svm = best_svm.score(X_test, y_test)

results_svm = {
    'Best Hyperparameters': random_search_svm.best_params_,
    'Best Cross-Validation Accuracy': random_search_svm.best_score_,
    'Training Accuracy': train_accuracy_svm,
    'Test Accuracy': test_accuracy_svm
}


In [None]:
results_svm

### Summary for Support Vector Machine (SVM)

#### The Support Vector Machine (SVM) model was trained and tested to find the best settings. After testing different options, the best SVM model used a linear kernel and a regularization strength (C) of 0.1. it uses the 'median' to handle missing values

#### •	The model was 90.7% accurate when tested on different groups of the training data (cross-validation).
#### •	When tested on the entire training set, the accuracy was 92.4%.
#### •	When tested on new, unseen data (the test set), the model achieved an accuracy of 92.8%.

#### This means the SVM model performed very well and was consistent in predicting outcomes on both training and test data.

## Creating `pipeline` for ( Random Forest)

####  pipeline for Random Forest

In [123]:
from sklearn.ensemble import RandomForestClassifier

# pipeline for Random Forest
rf_pipe = make_pipeline(
    preprocessor,  
    RandomForestClassifier(random_state=42)  
)

### Definig Grid parameter

In [124]:
param_grid_rf = {
    "columntransformer__num__strategy": ["mean", "median"],  
    "randomforestclassifier__n_estimators": [100],  
    "randomforestclassifier__max_depth": [5, 10],  
    "randomforestclassifier__min_samples_split": [5, 10],  
    "randomforestclassifier__min_samples_leaf": [1, 2], 
}

####  Run GridSearchCV to find the best parameters

In [125]:
grid_search_rf = GridSearchCV(
    rf_pipe,  
    param_grid_rf,  
    cv=3,  
    verbose=1
)

In [126]:
from sklearn.model_selection import RandomizedSearchCV

random_search_rf = RandomizedSearchCV(
    rf_pipe,  
    param_distributions=param_grid_rf,  
    n_iter=10,  
    cv=3,  
    verbose=1
)

random_search_rf.fit(X_train, y_train)

# Best Random Forest model based on RandomizedSearchCV
best_rf = random_search_rf.best_estimator_

train_accuracy_rf = best_rf.score(X_train, y_train)
test_accuracy_rf = best_rf.score(X_test, y_test)

results_rf = {
    'Best Hyperparameters': random_search_rf.best_params_,
    'Best Cross-Validation Accuracy': random_search_rf.best_score_,
    'Training Accuracy': train_accuracy_rf,
    'Test Accuracy': test_accuracy_rf
}
results_rf

Fitting 3 folds for each of 10 candidates, totalling 30 fits


{'Best Hyperparameters': {'randomforestclassifier__n_estimators': 100,
  'randomforestclassifier__min_samples_split': 5,
  'randomforestclassifier__min_samples_leaf': 1,
  'randomforestclassifier__max_depth': 10,
  'columntransformer__num__strategy': 'median'},
 'Best Cross-Validation Accuracy': 0.9503306747522685,
 'Training Accuracy': 0.9931506849315068,
 'Test Accuracy': 0.952054794520548}

In [44]:
results_rf

{'Best Hyperparameters': {'randomforestclassifier__n_estimators': 100,
  'randomforestclassifier__min_samples_split': 5,
  'randomforestclassifier__min_samples_leaf': 1,
  'randomforestclassifier__max_depth': 10,
  'columntransformer__num__strategy': 'median'},
 'Best Cross-Validation Accuracy': 0.9520400764616702,
 'Training Accuracy': 0.9940068493150684,
 'Test Accuracy': 0.9486301369863014}

### Summary for Random Forest

#### •	Best Settings: The model worked best when using the median to handle missing values, a tree depth of 10, splitting nodes if they had at least 5 samples, and using 100 trees in total.
#### •	Performance During Testing: When we tested the model using new data, it got the right answers about 94% of the time.
#### •	Performance During Training: When the model was learning from the training data, it got the right answers about 99% of the time.
#### •	Cross-Validation: When we tested the model in several different training and testing rounds (cross-validation), it was correct about 95% of the time on average.

#### Random Forest model is performing well and is quite accurate at making predictions.

## Test Data `Prediction`

### Here's a new data, this time *without labels!*. To see how well our model performs, we'll predict whether these houses are expensive or not and upload the results to the [a site](https://housingcomp-data023.streamlit.app/).

In [127]:
test_url = "https://drive.google.com/file/d/1MZnPvWoGQtBHij32Rti26C2T0KT1xGBc/view?usp=drive_link"
test_path = 'https://drive.google.com/uc?export=download&id='+test_url.split('/')[-2]
test = pd.read_csv(test_path)
test

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,MSZoning,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,11622,80.0,882.0,2,0,0,1.0,140,120,RH,...,Attchd,Unf,TA,TA,Y,,MnPrv,,WD,Normal
1,14267,81.0,1329.0,3,0,0,1.0,393,0,RL,...,Attchd,Unf,TA,TA,Y,,,Gar2,WD,Normal
2,13830,74.0,928.0,3,1,0,2.0,212,0,RL,...,Attchd,Fin,TA,TA,Y,,MnPrv,,WD,Normal
3,9978,78.0,926.0,3,1,0,2.0,360,0,RL,...,Attchd,Fin,TA,TA,Y,,,,WD,Normal
4,5005,43.0,1280.0,2,0,0,2.0,0,144,RL,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,1936,21.0,546.0,3,0,0,0.0,0,0,RM,...,,,,,Y,,,,WD,Normal
1455,1894,21.0,546.0,3,0,0,1.0,0,0,RM,...,CarPort,Unf,TA,TA,Y,,,,WD,Abnorml
1456,20000,160.0,1224.0,4,1,0,2.0,474,0,RL,...,Detchd,Unf,TA,TA,Y,,,,WD,Abnorml
1457,10441,62.0,912.0,3,0,0,0.0,80,0,RL,...,,,,,Y,,MnPrv,Shed,WD,Normal


In [128]:
test.head()

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,MSZoning,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,11622,80.0,882.0,2,0,0,1.0,140,120,RH,...,Attchd,Unf,TA,TA,Y,,MnPrv,,WD,Normal
1,14267,81.0,1329.0,3,0,0,1.0,393,0,RL,...,Attchd,Unf,TA,TA,Y,,,Gar2,WD,Normal
2,13830,74.0,928.0,3,1,0,2.0,212,0,RL,...,Attchd,Fin,TA,TA,Y,,MnPrv,,WD,Normal
3,9978,78.0,926.0,3,1,0,2.0,360,0,RL,...,Attchd,Fin,TA,TA,Y,,,,WD,Normal
4,5005,43.0,1280.0,2,0,0,2.0,0,144,RL,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal


In [129]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   LotArea        1459 non-null   int64  
 1   LotFrontage    1232 non-null   float64
 2   TotalBsmtSF    1458 non-null   float64
 3   BedroomAbvGr   1459 non-null   int64  
 4   Fireplaces     1459 non-null   int64  
 5   PoolArea       1459 non-null   int64  
 6   GarageCars     1458 non-null   float64
 7   WoodDeckSF     1459 non-null   int64  
 8   ScreenPorch    1459 non-null   int64  
 9   MSZoning       1455 non-null   object 
 10  Condition1     1459 non-null   object 
 11  Heating        1459 non-null   object 
 12  Street         1459 non-null   object 
 13  CentralAir     1459 non-null   object 
 14  Foundation     1459 non-null   object 
 15  ExterQual      1459 non-null   object 
 16  ExterCond      1459 non-null   object 
 17  BsmtQual       1415 non-null   object 
 18  BsmtCond

In [130]:
test["Id"].head()

0    1461
1    1462
2    1463
3    1464
4    1465
Name: Id, dtype: int64

In [131]:
len(test)

1459

## Make `Prediction`

In [133]:
X_actual_test = test.drop(columns=['Id'], errors='ignore') 

# Making predictions using the best model 'Random Forest'
y_pred = best_rf.predict(X_actual_test)  
submission_rf = pd.DataFrame({
    'Id': test['Id'],  # Attach the Id column from the test data
    'Expensive': y_pred  # Attach the predictions from the best model
})

# Save the submission file as CSV
submission_rf.to_csv('submission.csv', index=False)