# Training

## Import data and libraries

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn import set_config
# Configure scikit-learn so that transformers output pandas DataFrames
set_config(transform_output="pandas")

# Turn the Google Drive share link into a direct-download link: the file id is
# the second-to-last '/'-separated segment of the share URL.
url = "https://drive.google.com/file/d/1L23SxwgqjdUeTKikW-L246yOcI12q0_D/view?usp=drive_link"
file_id = url.split('/')[-2]
path = 'https://drive.google.com/uc?export=download&id=' + file_id
training_data = pd.read_csv(path)

The column 'Id' does not contain any information that is useful for training, so we do not want it to be part of the data fed into our pipeline.

At the same time, it will be needed in the competition submission. We will therefore use it as the index.

In [None]:
# Move 'Id' out of the feature columns and into the index.
# The check keeps this cell safe to re-run: once 'Id' is the index it is no
# longer a column, and calling set_index('Id') again would raise a KeyError.
if 'Id' in training_data.columns:
    training_data = training_data.set_index('Id')
training_data

Unnamed: 0_level_0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,Expensive,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,8450,65.0,856,3,0,0,2,0,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
2,9600,80.0,1262,3,1,0,2,298,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
3,11250,68.0,920,3,1,0,2,0,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
4,9550,60.0,756,3,1,0,3,0,0,0,...,Detchd,Unf,TA,TA,Y,,,,WD,Abnorml
5,14260,84.0,1145,4,1,0,3,192,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1456,7917,62.0,953,3,1,0,2,0,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
1457,13175,85.0,1542,3,2,0,2,349,0,0,...,Attchd,Unf,TA,TA,Y,,MnPrv,,WD,Normal
1458,9042,66.0,1152,4,2,0,1,0,0,1,...,Attchd,RFn,TA,TA,Y,,GdPrv,Shed,WD,Normal
1459,9717,68.0,1078,2,0,0,1,366,0,0,...,Attchd,Unf,TA,TA,Y,,,,WD,Normal


## Split data and train model

In [None]:
# Pull the 'Expensive' label apart from the features, then hold out 20% of the
# rows for evaluation (fixed seed so the split is reproducible)
X = training_data.drop(columns=['Expensive']).copy()
y = training_data['Expensive'].copy()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

In [None]:
# Separate the training columns into numerical and text (non-numerical) groups.
# Only the column labels are kept; they are used later to route columns.
X_num, X_cat = (
    X_train.select_dtypes(include='number').columns,
    X_train.select_dtypes(exclude='number').columns,
)

In [None]:
# Numerical pipeline: a single step that fills missing values with the column mean
mean_imputer = SimpleImputer(strategy='mean')
num_pipe = make_pipeline(mean_imputer)

In [None]:
# Text pipeline: fill missing values with the placeholder 'N_A', then one-hot
# encode. Dense output is requested because scikit-learn is configured above
# to return pandas DataFrames.
placeholder_imputer = SimpleImputer(strategy='constant', fill_value='N_A')
one_hot_encoder = OneHotEncoder(handle_unknown='infrequent_if_exist', sparse_output=False)
cat_pipe = make_pipeline(placeholder_imputer, one_hot_encoder)

In [None]:
# Assemble pipelines into preprocessor and full pipeline:
# numerical columns go through num_pipe, text columns through cat_pipe.
preprocessor = make_column_transformer((num_pipe, X_num),
                                       (cat_pipe, X_cat))

# Seed the forest so that re-running the notebook gives the same fitted model
# and the same scores (same seed value as the train/test split).
random_forest_pipe = make_pipeline(preprocessor, RandomForestClassifier(random_state=123))

In [None]:
# Fit preprocessing and classifier together on the training split only
random_forest_pipe.fit(X=X_train, y=y_train)

## Assess performance and further train model(s)

In [None]:
# Accuracy on the same rows the model was fitted on
train_predictions = random_forest_pipe.predict(X_train)
accuracy_score(y_true=y_train, y_pred=train_predictions)

1.0

In [None]:
# Collect train and test accuracy for each model in one comparison table.
# Building the row from a dict gives float columns; filling an empty frame
# cell by cell would leave the columns as object dtype.
baseline_scores = {
    'train': accuracy_score(y_true=y_train, y_pred=random_forest_pipe.predict(X_train)),
    'test': accuracy_score(y_true=y_test, y_pred=random_forest_pipe.predict(X_test)),
}
assessment_df = pd.DataFrame(baseline_scores, index=['random_forest_baseline'])
assessment_df

Unnamed: 0,train,test
random_forest_baseline,1.0,0.965753


In [None]:
# Tune hyperparameters, change up your preprocessor and try other models!
# Use cross-validation to get the most out of your training data!

## Fit one last time on full training data

Once you have chosen your best model, fit it one last time on the whole training data.

In [None]:
# Refit on every labelled row (train and hold-out splits combined)
random_forest_pipe.fit(X=X, y=y)

# Predict on testing data

## Import testing data
Remember, everything done up until now has been performed on your training data. The *real* testing data has no labels (visible to you).

In [None]:
# Convert the Google Drive share link into a direct-download link (the file id
# is the second-to-last '/'-separated segment), then read the CSV and move
# 'Id' into the index in one step.
url = "https://drive.google.com/file/d/15PfmTxmavQCT-f7iY9tgwWxm9t4GRees/view?usp=drive_link"
file_id = url.split('/')[-2]
path = 'https://drive.google.com/uc?export=download&id=' + file_id
testing_data = pd.read_csv(path).set_index('Id')

Use your model's `.predict()` method to create predictions.

In [None]:
# Preview the predicted labels for the unlabelled testing data
random_forest_pipe.predict(X=testing_data)

array([0, 0, 0, ..., 0, 0, 0])

The output of `.predict()` is an array. Save this as a column on `testing_data`.

In [None]:
# Display the testing data as loaded, before the prediction column is added
testing_data

Unnamed: 0_level_0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,MSZoning,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1461,11622,80.0,882.0,2,0,0,1.0,140,120,RH,...,Attchd,Unf,TA,TA,Y,,MnPrv,,WD,Normal
1462,14267,81.0,1329.0,3,0,0,1.0,393,0,RL,...,Attchd,Unf,TA,TA,Y,,,Gar2,WD,Normal
1463,13830,74.0,928.0,3,1,0,2.0,212,0,RL,...,Attchd,Fin,TA,TA,Y,,MnPrv,,WD,Normal
1464,9978,78.0,926.0,3,1,0,2.0,360,0,RL,...,Attchd,Fin,TA,TA,Y,,,,WD,Normal
1465,5005,43.0,1280.0,2,0,0,2.0,0,144,RL,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2915,1936,21.0,546.0,3,0,0,0.0,0,0,RM,...,,,,,Y,,,,WD,Normal
2916,1894,21.0,546.0,3,0,0,1.0,0,0,RM,...,CarPort,Unf,TA,TA,Y,,,,WD,Abnorml
2917,20000,160.0,1224.0,4,1,0,2.0,474,0,RL,...,Detchd,Unf,TA,TA,Y,,,,WD,Abnorml
2918,10441,62.0,912.0,3,0,0,0.0,80,0,RL,...,,,,,Y,,MnPrv,Shed,WD,Normal


In [None]:
# Predict once, then store the resulting array as the 'Expensive' column
predictions = random_forest_pipe.predict(testing_data)
testing_data['Expensive'] = predictions

In [None]:
# Write the 'Expensive' column together with its index to the submission file
submission = testing_data['Expensive']
submission.to_csv('./submission.csv')

In [None]:
# Colab only: trigger a browser download of the submission file.
# Outside Colab the google.colab package is not installed, so the download is
# skipped instead of raising; the CSV written by the previous cell remains at
# ./submission.csv.
try:
    from google.colab import files
except ImportError:
    print('Not running in Colab - the file is available at ./submission.csv')
else:
    files.download('./submission.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>