In [1]:
# Optional environment setup: uncomment and run once if the installed scikit-learn
# is too old for the `set_output(transform='pandas')` call used by the imputer
# later in this notebook (that API was added in scikit-learn 1.2).
# %pip install scikit-learn --upgrade

In [2]:
import pandas as pd


# Load the housing classification dataset from the project's data folder.
# The bare name on the last line renders the frame as the cell output.
data = pd.read_csv("data/housing-classification-iter-0-2.csv")
data

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,Expensive
0,8450,65.0,856,3,0,0,2,0,0,0
1,9600,80.0,1262,3,1,0,2,298,0,0
2,11250,68.0,920,3,1,0,2,0,0,0
3,9550,60.0,756,3,1,0,3,0,0,0
4,14260,84.0,1145,4,1,0,3,192,0,0
...,...,...,...,...,...,...,...,...,...,...
1455,7917,62.0,953,3,1,0,2,0,0,0
1456,13175,85.0,1542,3,2,0,2,349,0,0
1457,9042,66.0,1152,4,2,0,1,0,0,1
1458,9717,68.0,1078,2,0,0,1,366,0,0


Preparing the data (manual steps before creating a pipeline)

In [3]:
# Separate the features (X) from the target column (y) WITHOUT mutating `data`.
# Popping the column from an alias of `data` would strip 'Expensive' from `data`
# itself, so re-running the cell would fail with a KeyError and the frame shown
# by the loading cell would no longer match the object in memory.
X = data.drop(columns='Expensive')  # new frame holding every column except the target
y = data['Expensive']               # target labels as a Series named 'Expensive'

In [4]:
# Feature selection: keep only the numeric columns of X,
# then display the resulting frame as the cell output.
X_num = X.select_dtypes(include=['number'])
X_num

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch
0,8450,65.0,856,3,0,0,2,0,0
1,9600,80.0,1262,3,1,0,2,298,0
2,11250,68.0,920,3,1,0,2,0,0
3,9550,60.0,756,3,1,0,3,0,0
4,14260,84.0,1145,4,1,0,3,192,0
...,...,...,...,...,...,...,...,...,...
1455,7917,62.0,953,3,1,0,2,0,0
1456,13175,85.0,1542,3,2,0,2,349,0
1457,9042,66.0,1152,4,2,0,1,0,0
1458,9717,68.0,1078,2,0,0,1,366,0


In [5]:
# split the data
from sklearn.model_selection import train_test_split

In [6]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# same rows land in train/test on every run.
X_num_train, X_num_test, y_train, y_test = train_test_split(
    X_num,
    y,
    test_size=0.2,
    random_state=123,
)

In [7]:
# Print a summary of the feature frame: column names, non-null counts and dtypes.
# A non-null count below the number of entries reveals columns with missing values.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   LotArea       1460 non-null   int64  
 1   LotFrontage   1201 non-null   float64
 2   TotalBsmtSF   1460 non-null   int64  
 3   BedroomAbvGr  1460 non-null   int64  
 4   Fireplaces    1460 non-null   int64  
 5   PoolArea      1460 non-null   int64  
 6   GarageCars    1460 non-null   int64  
 7   WoodDeckSF    1460 non-null   int64  
 8   ScreenPorch   1460 non-null   int64  
dtypes: float64(1), int64(8)
memory usage: 102.8 KB


In [8]:
# Count the missing values in each feature column.
X.isnull().sum()

LotArea           0
LotFrontage     259
TotalBsmtSF       0
BedroomAbvGr      0
Fireplaces        0
PoolArea          0
GarageCars        0
WoodDeckSF        0
ScreenPorch       0
dtype: int64

In [10]:
# Impute missing values
from sklearn.impute import SimpleImputer

# Imputer with default settings. Requesting pandas output makes the transform
# methods return DataFrames; without it they would return NumPy arrays.
my_imputer = SimpleImputer()
my_imputer.set_output(transform="pandas")

# Learn the fill values from the training rows only (nothing may be learned
# from the test data) and fill the training set in the same step.
X_num_imputed_train = my_imputer.fit_transform(X_num_train)

# Apply the fill values learned on the training data to the test set.
X_num_imputed_test = my_imputer.transform(X_num_test)

Modelling: Decision Tree

In [11]:
from sklearn.tree import DecisionTreeClassifier

In [12]:
# Initialize the model: a shallow tree (at most 4 levels, at least 10 samples
# per leaf) to limit overfitting.
# random_state is fixed (same seed as the train/test split) because scikit-learn
# permutes the features at every split; without a seed, tied candidate splits
# can yield a different tree -- and different scores -- from run to run.
my_tree = DecisionTreeClassifier(max_depth=4, min_samples_leaf=10, random_state=123)

In [16]:
# Train the decision tree on the imputed training features and their labels.
my_tree.fit(X_num_imputed_train, y_train)

Check accuracy on the train set

In [14]:
from sklearn.metrics import accuracy_score

In [17]:
# Use the fitted model to predict the labels of the training rows,
# then display the predictions as the cell output.
y_pred_tree_train = my_tree.predict(X=X_num_imputed_train)
y_pred_tree_train

array([1, 0, 0, ..., 0, 0, 0])

In [18]:
# Share of training rows whose predicted label matches the true label.
accuracy_score(y_train, y_pred_tree_train)

0.9238013698630136

Check accuracy on the test set

In [19]:
# A model may only be good at predicting the rows it was trained on (overfitting).
# To find out whether it is also useful on new data, predict the labels of the
# preprocessed (imputed) test set, which the model has never seen.
y_pred_tree_test = my_tree.predict(X=X_num_imputed_test)


In [20]:
# Compare the test-set predictions with the true labels in y_test.
# Ideally the train and test accuracies are close to each other; a much lower
# test accuracy would point to overfitting.
accuracy_score(y_test, y_pred_tree_test)

0.9212328767123288

Creating pipeline