## 1.&nbsp;Preprocessor
Before feeding data into a machine learning model, it is important to make sure it is in the right shape and format. This often means handling missing values, scaling numerical features, and encoding categorical ones. A preprocessor takes care of these steps in a structured and consistent way, so you do not have to repeat the same code each time. It also helps avoid mistakes by keeping the transformations the same for all models. Using a preprocessor ensures your data is properly prepared for any model you choose to use.

In [None]:
from sklearn import set_config
set_config(transform_output='pandas')

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import make_column_transformer
from sklearn import set_config
set_config(transform_output='pandas')
url = "https://drive.google.com/file/d/17c4r03ARJJ77gUZaV0CaNXfW0Cx5fZXy/view?usp=drive_link"
download_url = 'https://drive.google.com/uc?export=download&id=' + url.split('/')[-2]
data = pd.read_csv(download_url)

In [None]:
data.columns

Index(['LotArea', 'LotFrontage', 'TotalBsmtSF', 'BedroomAbvGr', 'Fireplaces',
       'PoolArea', 'GarageCars', 'WoodDeckSF', 'ScreenPorch', 'Expensive',
       'MSZoning', 'Condition1', 'Heating', 'Street', 'CentralAir',
       'Foundation', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'KitchenQual', 'FireplaceQu',
       'MSSubClass', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', '1stFlrSF',
       '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath',
       'FullBath', 'HalfBath', 'KitchenAbvGr', 'TotRmsAbvGrd', 'GarageYrBlt',
       'GarageArea', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'MiscVal',
       'MoSold', 'YrSold', 'Id', 'Alley', 'LotShape', 'LandContour',
       'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrTyp

In [None]:
X=data.copy()

In [None]:
X.set_index('Id',inplace=True)

In [None]:
y= X['Expensive']

In [None]:
df=X.loc[:,X.columns!='Expensive']

In [None]:
cat_cols=list(df.select_dtypes(include='object').columns)

In [None]:
numeric_cols=list(df.select_dtypes(include='number').columns)

In [None]:
len(cat_cols)

43

In [None]:
cat_cols

['MSZoning',
 'Condition1',
 'Heating',
 'Street',
 'CentralAir',
 'Foundation',
 'ExterQual',
 'ExterCond',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'KitchenQual',
 'FireplaceQu',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'BsmtFinType2',
 'HeatingQC',
 'Electrical',
 'Functional',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'SaleType',
 'SaleCondition']

In [None]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
    StandardScaler, OneHotEncoder, OrdinalEncoder
)

# 1) Define your column lists:

ordinal_cols = [
    'OverallQual',
    'OverallCond',
    'KitchenQual',
    'CentralAir',
    'HeatingQC',
    'BsmtFinType2',
    'BsmtFinType1',
    'BsmtExposure',
    'BsmtCond',
    'BsmtQual',
    'ExterCond',
    'ExterQual'
]

categories = [
    # OverallQual: 1 (Very Poor) … 10 (Very Excellent)
    list(range(1, 11)),

    # OverallCond: 1 (Very Poor) … 10 (Very Excellent)
    list(range(1, 11)),

    # KitchenQual: Po → Fa → TA → Gd → Ex
    ["Po", "Fa", "TA", "Gd", "Ex"],

    # CentralAir: No → Yes
    ["N", "Y"],

    # HeatingQC: Po → Fa → TA → Gd → Ex
    ["Po", "Fa", "TA", "Gd", "Ex"],

    # BsmtFinType2: NA → Unf → LwQ → Rec → BLQ → ALQ → GLQ
    ["NA", "Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],

    # BsmtFinType1: same as BsmtFinType2
    ["NA", "Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],

    # BsmtExposure: NA → No → Mn → Av → Gd
    ["NA", "No", "Mn", "Av", "Gd"],

    # BsmtCond: NA → Po → Fa → TA → Gd → Ex
    ["NA", "Po", "Fa", "TA", "Gd", "Ex"],

    # BsmtQual: same scale as BsmtCond
    ["NA", "Po", "Fa", "TA", "Gd", "Ex"],

    # ExterCond: Po → Fa → TA → Gd → Ex
    ["Po", "Fa", "TA", "Gd", "Ex"],

    # ExterQual: same as ExterCond
    ["Po", "Fa", "TA", "Gd", "Ex"],
]


nominal_cols = [item for item in cat_cols if item not in ordinal_cols]

In [None]:
len(nominal_cols)

33

In [None]:
# 2) Build preprocessors:
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler()),
])

ordinal_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=3)),
    ('encoder', OrdinalEncoder(
        categories=categories,
        handle_unknown='use_encoded_value',
        unknown_value=-1
    )),
])

nominal_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore',sparse_output=False)),
])

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_cols),
    ('ord', ordinal_transformer, ordinal_cols),
    ('nom', nominal_transformer, nominal_cols),
])



In [None]:
preprocessor

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1460 entries, 1 to 1460
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   LotArea        1460 non-null   int64  
 1   LotFrontage    1201 non-null   float64
 2   TotalBsmtSF    1460 non-null   int64  
 3   BedroomAbvGr   1460 non-null   int64  
 4   Fireplaces     1460 non-null   int64  
 5   PoolArea       1460 non-null   int64  
 6   GarageCars     1460 non-null   int64  
 7   WoodDeckSF     1460 non-null   int64  
 8   ScreenPorch    1460 non-null   int64  
 9   Expensive      1460 non-null   int64  
 10  MSZoning       1460 non-null   object 
 11  Condition1     1460 non-null   object 
 12  Heating        1460 non-null   object 
 13  Street         1460 non-null   object 
 14  CentralAir     1460 non-null   object 
 15  Foundation     1460 non-null   object 
 16  ExterQual      1460 non-null   object 
 17  ExterCond      1460 non-null   object 
 18  BsmtQual     

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    df,
    y,
    test_size=0.2,
    random_state=123
)

## 2.&nbsp;Decision Tree
Now that the preprocessor is set up, we can connect it to a machine learning model to build a complete pipeline. In this example, we will use a decision tree classifier, but the same structure can be used with any model. By combining the preprocessor and model into a single pipeline, we ensure that all the steps, from cleaning and transforming the data to fitting the model, are applied in the correct order. This also makes it easier to tune hyperparameters, since we can search across both preprocessing and modelling steps in one go.

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier

# Define a function to perform a grid search, which helps to avoid duplicating code for different models
def run_grid_search(model, param_grid, X_train, y_train, preprocessor, cv=5, verbose=1):
    # Create a pipeline that first applies the data preprocessing steps, then fits the model
    pipe = make_pipeline(preprocessor, model)

    # GridSearchCV will test all possible combinations of parameters defined in 'param_grid'
    grid_search = GridSearchCV(pipe, param_grid, cv=cv, verbose=verbose)

    # Fit the model on the training data with the various parameter combinations
    grid_search.fit(X_train, y_train)

    # Return the trained GridSearchCV object which holds the best parameters and model
    return grid_search

# Define a dictionary of hyperparameters to tune for the decision tree model
dt_param_grid = {
    "columntransformer__num__imputer__strategy": ["mean", "median"],
    "decisiontreeclassifier__max_depth": range(2, 14, 2),
    "decisiontreeclassifier__min_samples_leaf": range(3, 12, 2)
}

# Run the grid search for the DecisionTreeClassifier using the specified parameters
dt_search = run_grid_search(
    DecisionTreeClassifier(random_state=123),
    dt_param_grid,
    X_train,
    y_train,
    preprocessor
)

# Display the process
dt_search

Fitting 5 folds for each of 60 candidates, totalling 300 fits


Once we have trained and tuned our model, the next step is to check how well it performs. Here, we collect a set of evaluation scores for the decision tree and store them in a DataFrame. This gives us a clear summary of key metrics like accuracy, precision, recall, and F1 score, along with others that help us assess the model from different angles. The structure is designed so we can easily add results from other models later, making it simple to compare performance side by side and choose the most suitable approach for our task.

In [None]:
from sklearn.metrics import (
    accuracy_score,
    recall_score,
    precision_score,
    f1_score,
    balanced_accuracy_score,
    cohen_kappa_score
)

# Function to get the scores for our model(s)
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    scores = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Specificity": recall_score(y_test, y_pred, pos_label=0),
        "F1 Score": f1_score(y_test, y_pred),
        "Balanced Accuracy": balanced_accuracy_score(y_test, y_pred),
        "Cohen's Kappa": cohen_kappa_score(y_test, y_pred)
    }
    return scores

# Create an empty DataFrame to store model evaluation results
model_scores_df = pd.DataFrame(columns=[
    "Model", "Accuracy", "Recall", "Precision",
    "Specificity", "F1 Score", "Balanced Accuracy", "Cohen's Kappa"
])

# Evaluate the Decision Tree model
dt_scores = evaluate_model(dt_search, X_test, y_test)
dt_scores["Model"] = "Decision Tree"

# Convert the dictionary to a Series matching the DataFrame columns, then assign as a new row
model_scores_df.loc[len(model_scores_df)] = pd.Series(dt_scores, index=model_scores_df.columns)

# Display the DataFrame
model_scores_df

Unnamed: 0,Model,Accuracy,Recall,Precision,Specificity,F1 Score,Balanced Accuracy,Cohen's Kappa
0,Decision Tree,0.934932,0.714286,0.810811,0.972,0.759494,0.843143,0.722044


## 3.&nbsp;KNN
Now that we have results for the decision tree, we can try a second model using the same setup. This time we are testing a K nearest neighbours classifier. The structure is almost identical to what we used before, same preprocessor, same grid search function, just with a different model and parameter grid. This shows how easy it is to test different approaches when your pipeline is set up in a consistent way.

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Define the hyperparameter grid to be searched by the grid search
knn_param_grid = {
    "kneighborsclassifier__n_neighbors": range(1, 11,2),
    "kneighborsclassifier__weights": ["uniform", "distance"]
}

# Run a grid search to find the optimal combination of hyperparameters
knn_search = run_grid_search(
    KNeighborsClassifier(),
    knn_param_grid,
    X_train,
    y_train,
    preprocessor
)

# Display the grid search results
knn_search

Fitting 5 folds for each of 10 candidates, totalling 50 fits


With the K nearest neighbours model trained and tuned, we can now evaluate its performance and add the results to our scores DataFrame. Since we are using the same evaluation function and column structure, the process is quick and consistent.

In [None]:
# Evaluate the K-Nearest Neighbours (KNN) model using the testing dataset and obtain performance metrics
knn_scores = evaluate_model(knn_search, X_test, y_test)

# Label the metrics to indicate they belong to the KNN model
knn_scores["Model"] = "KNN"

# Append the KNN metrics as a new row to the existing DataFrame of model scores
model_scores_df.loc[len(model_scores_df)] = pd.Series(knn_scores, index=model_scores_df.columns)

# Display the updated DataFrame containing all model performance metrics
model_scores_df

Unnamed: 0,Model,Accuracy,Recall,Precision,Specificity,F1 Score,Balanced Accuracy,Cohen's Kappa
0,Decision Tree,0.934932,0.714286,0.810811,0.972,0.759494,0.843143,0.722044
1,KNN,0.958904,0.738095,0.96875,0.996,0.837838,0.867048,0.814799


## 4.&nbsp;LogisticRegression


In [None]:
from sklearn.linear_model import LogisticRegression
log_param_grid={
    "logisticregression__C": [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    "logisticregression__penalty": ["l1", "l2"],
    "logisticregression__solver": ["liblinear", "saga"]
}
def new_df_scores(model,param_grid, X_train,
    y_train, X_test, y_test, model_scores_df=model_scores_df):
    logistic_search = run_grid_search(
    model,
    param_grid,
    X_train,
    y_train,
    preprocessor)
    y_pred = logistic_search.predict(X_test)
    scores= evaluate_model(logistic_search, X_test, y_test)

    scores["Model"] = f'{model}'
    model_scores_df.loc[len(model_scores_df)] = pd.Series(scores, index=model_scores_df.columns)
    return 'done'



In [None]:
new_df_scores(LogisticRegression(random_state=123),log_param_grid, X_train,
    y_train, X_test, y_test, model_scores_df=model_scores_df)

Fitting 5 folds for each of 28 candidates, totalling 140 fits




'done'

In [None]:
model_scores_df

Unnamed: 0,Model,Accuracy,Recall,Precision,Specificity,F1 Score,Balanced Accuracy,Cohen's Kappa
0,Decision Tree,0.931507,0.642857,0.84375,0.98,0.72973,0.811429,0.691332
1,KNN,0.955479,0.833333,0.853659,0.976,0.843373,0.904667,0.81743
2,LogisticRegression(),0.952055,0.785714,0.868421,0.98,0.825,0.882857,0.797303


# **5. RandomForestClassifier**

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf_param_grid={
    "randomforestclassifier__n_estimators": [50, 75, 100],
    'randomforestclassifier__max_depth': [None, 5, 10],
    'randomforestclassifier__min_samples_split': [2, 5, 10],
    'randomforestclassifier__min_samples_leaf': [2, 5, 7],

}
new_df_scores(RandomForestClassifier(random_state=123),rf_param_grid, X_train,
    y_train, X_test, y_test, model_scores_df=model_scores_df)


Fitting 5 folds for each of 81 candidates, totalling 405 fits


KeyboardInterrupt: 

# ***6. SVC***

In [None]:
from sklearn.svm import SVC
sv_param_grid={
    "svc__C": [0.001, 0.01, 0.1, 1, 10],
    'svc__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'svc__gamma': ['scale', 'auto'],
    'svc__degree': [2, 3, 4]
}
new_df_scores(SVC(random_state=123),sv_param_grid, X_train,
    y_train, X_test, y_test, model_scores_df=model_scores_df)


Fitting 5 folds for each of 120 candidates, totalling 600 fits


'done'

# ***7. GaussianNB***

In [None]:
from sklearn.naive_bayes import GaussianNB
nb_param_grid={
    "gaussiannb__var_smoothing": [1e-9, 1e-8, 1e-7, 1e-6],
    'gaussiannb__priors': [None, [0.5, 0.5], [0.6, 0.4]],
}
new_df_scores(GaussianNB(),nb_param_grid, X_train,
    y_train, X_test, y_test, model_scores_df=model_scores_df)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


'done'

# ***8. GradientBoostingClassifier***

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
gb_param_grid={
    "gradientboostingclassifier__n_estimators": [50,100],
    'gradientboostingclassifier__learning_rate': [0.01, 0.1],
    'gradientboostingclassifier__max_depth': [3, 5, None],
    'gradientboostingclassifier__min_samples_split': [2, 5, 7],
    'gradientboostingclassifier__min_samples_leaf': [1, 2, 5 ,7],
}
new_df_scores(GradientBoostingClassifier(random_state=123),gb_param_grid, X_train,
    y_train, X_test, y_test, model_scores_df=model_scores_df)

Fitting 5 folds for each of 144 candidates, totalling 720 fits


KeyboardInterrupt: 

# ***9. AdaBoostClassifier***

In [None]:
from sklearn.ensemble import AdaBoostClassifier
ab_param_grid={
    "adaboostclassifier__n_estimators": [50, 100],
    'adaboostclassifier__learning_rate': [0.01, 0.1],
    'adaboostclassifier__algorithm': ['SAMME', 'SAMME.R'],
    'adaboostclassifier__estimator': [None, LogisticRegression(), RandomForestClassifier()]



}
new_df_scores(AdaBoostClassifier(random_state=123),ab_param_grid, X_train,
    y_train, X_test, y_test, model_scores_df=model_scores_df)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


60 fits failed out of a total of 120.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/pipeline.py", line 662, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 1382, in

'done'

# ***10. VotingClassifier***

In [None]:
from sklearn.ensemble import VotingClassifier
vc_param_grid = {
    # Try hard vs soft voting
    "votingclassifier__voting": ["hard", "soft"],

    # Candidate sets of estimators
    "votingclassifier__estimators": [
        [   # 3-model ensemble
            ("lr",  LogisticRegression(max_iter=1000)),
            ("rf",  RandomForestClassifier(n_estimators=100)),
            ("svc", SVC(probability=True))
        ]
    ],

    # Optional: different weightings (only used if voting='soft' or you want weighted hard voting)
    "votingclassifier__weights": [
        None,             # equal weights
        [2, 1, 1],        # give first estimator twice the vote
        [1, 2, 1],        # give second estimator more influence
        [1, 1, 2],        # give third estimator more influence
    ],
}
estimators = vc_param_grid.get('votingclassifier__estimators', [[]])[0]
new_df_scores(VotingClassifier(estimators=estimators),vc_param_grid, X_train,
    y_train, X_test, y_test, model_scores_df=model_scores_df)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


'done'

In [None]:
model_scores_df

Unnamed: 0,Model,Accuracy,Recall,Precision,Specificity,F1 Score,Balanced Accuracy,Cohen's Kappa
0,Decision Tree,0.931507,0.642857,0.84375,0.98,0.72973,0.811429,0.691332
1,KNN,0.955479,0.833333,0.853659,0.976,0.843373,0.904667,0.81743
2,LogisticRegression(),0.952055,0.785714,0.868421,0.98,0.825,0.882857,0.797303
3,RandomForestClassifier(),0.958904,0.761905,0.941176,0.992,0.842105,0.876952,0.818784
4,SVC(),0.965753,0.857143,0.9,0.984,0.878049,0.920571,0.858142
5,GaussianNB(),0.541096,0.904762,0.22619,0.48,0.361905,0.692381,0.171157
6,GradientBoostingClassifier(),0.94863,0.809524,0.829268,0.972,0.819277,0.890762,0.789342
7,AdaBoostClassifier(random_state=123),0.94863,0.809524,0.829268,0.972,0.819277,0.890762,0.789342
8,"VotingClassifier(estimators=[('lr', LogisticRe...",0.958904,0.785714,0.916667,0.988,0.846154,0.886857,0.8226


# ***11. XGBClassifier***

In [None]:
from xgboost import XGBClassifier
xgb_param_grid={
    "xgbclassifier__n_estimators": [100,200],
    'xgbclassifier__learning_rate': [0.01, 0.1],
    'xgbclassifier__max_depth': [3, 5, None],
}
new_df_scores(XGBClassifier(),xgb_param_grid, X_train,
    y_train, X_test, y_test, model_scores_df=model_scores_df)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


KeyboardInterrupt: 

# ***Testing Data***

In [None]:
url = "https://drive.google.com/file/d/1oPcZi_0Aea-oBLQlWcVBYu7UBPHI2zCD/view?usp=drive_link"
download_url = 'https://drive.google.com/uc?export=download&id=' + url.split('/')[-2]
test_data = pd.read_csv(download_url)

In [None]:
test=test_data.copy()

In [None]:
test.set_index('Id',inplace=True)

In [None]:
svc_search= run_grid_search(SVC(random_state=123),sv_param_grid, X_train, y_train, preprocessor, cv=5, verbose=0)

In [None]:
y_pred = svc_search.predict(test)

In [None]:
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [None]:
test['Expensive'] = svc_search.predict(test)

In [None]:
test

Unnamed: 0_level_0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,MSZoning,...,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition,Expensive
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1461,11622,80.0,882.0,2,0,0,1.0,140,120,RH,...,Unf,TA,TA,Y,,MnPrv,,WD,Normal,0
1462,14267,81.0,1329.0,3,0,0,1.0,393,0,RL,...,Unf,TA,TA,Y,,,Gar2,WD,Normal,0
1463,13830,74.0,928.0,3,1,0,2.0,212,0,RL,...,Fin,TA,TA,Y,,MnPrv,,WD,Normal,0
1464,9978,78.0,926.0,3,1,0,2.0,360,0,RL,...,Fin,TA,TA,Y,,,,WD,Normal,0
1465,5005,43.0,1280.0,2,0,0,2.0,0,144,RL,...,RFn,TA,TA,Y,,,,WD,Normal,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2915,1936,21.0,546.0,3,0,0,0.0,0,0,RM,...,,,,Y,,,,WD,Normal,0
2916,1894,21.0,546.0,3,0,0,1.0,0,0,RM,...,Unf,TA,TA,Y,,,,WD,Abnorml,0
2917,20000,160.0,1224.0,4,1,0,2.0,474,0,RL,...,Unf,TA,TA,Y,,,,WD,Abnorml,0
2918,10441,62.0,912.0,3,0,0,0.0,80,0,RL,...,,,,Y,,MnPrv,Shed,WD,Normal,0


In [None]:
test['Expensive'].to_csv('./submission.csv')

In [None]:
# Colab only
from google.colab import files
files.download('./submission.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>