In [None]:
# data manipulation and plotting
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# for saving the pipeline
import joblib

# from Scikit-learn
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, Binarizer

# from feature-engine
from feature_engine.imputation import (
    AddMissingIndicator,
    MeanMedianImputer,
    CategoricalImputer,
)

from feature_engine.encoding import (
    RareLabelEncoder,
    OrdinalEncoder,
)

from feature_engine.transformation import LogTransformer

from feature_engine.selection import DropFeatures
from feature_engine.wrappers import SklearnTransformerWrapper

import preprocessors as pp

: 

In [None]:
# load dataset
data = pd.read_csv('data.csv')

# rows and columns of the data
print(data.shape)

# preview only the first rows: rendering the whole frame bloats the saved
# notebook output and buries the narrative
data.head()

: 

In [None]:
# list the dtype pandas inferred for every column of the loaded frame
print(data.dtypes)


: 

In [None]:
# Hold out 30% of the rows as a test set.
# The 'Id' and 'Rent' columns are removed from the predictors, 'Rent' is the
# target, and the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['Id', 'Rent']),  # predictive variables
    data['Rent'],                       # target
    test_size=0.3,
    random_state=0,
)

X_train.shape, X_test.shape

: 

In [None]:
# Model the target on the natural-log scale.
# NOTE: this cell overwrites its own inputs, so running it a second time
# applies the log twice.
y_train, y_test = np.log(y_train), np.log(y_test)

: 

In [None]:
# numerical variables with NA in train set
NUMERICAL_VARS_WITH_NA = ["KitchenSize", "BathrmSize", "BalcnySize"]

# variables encoded with SOME_MAPPINGS
SOME_VARS = [
    "Cheaperflrs", "Electricity", "RefDisposal", "RunWater",
    "SecPost", "Cleaners", "LodgeGen", "Solar",
    "GenHouse", "Parking", "Wdrobe", "Balcony",
]

# categorical variables to encode, each with its own mapping
LOCATION_VARS = ["Location"]
FINISHING_VARS = ["Finishing"]

# label -> integer code dictionaries used by the mappers
SOME_MAPPINGS = {
    "No": 1,
    "I don't know": 2,
    "Yes": 3,
}

LOCATION_MAPPINGS = {
    "Near Eziobodo Gate": 1,
    "Near Sekani": 2,
    "Around John Paul's Kitchen": 3,
    "Dombolo": 4,
    "Eziobodo Elu": 5,
}

FINISHING_MAPPINGS = {
    "Tiles": 1,
    "Cement": 2,
}

# the selected variables, in the column order fed to the pipeline
FEATURES = [
    "Storeys", "Cheaperflrs", "Electricity", "RefDisposal", "RunWater",
    "SecPost", "Cleaners", "LodgeGen", "Solar", "LgCond",
    "Age", "GenHouse", "Parking", "Distance", "Location",
    "RdCond", "SecurityLvl", "RmSize", "RmCond", "Wdrobe",
    "Finishing", "Balcony", "KitchenSize", "BathrmSize", "BalcnySize",
]





: 

In [None]:
# Restrict both splits to the selected FEATURES, in that column order.
X_train, X_test = X_train[FEATURES], X_test[FEATURES]

X_train.shape, X_test.shape

: 

In [None]:
# Preprocessing and model chained in one Pipeline, so the same steps run at
# fit time and at predict time.
# NOTE(review): NUMERICAL_VARS_WITH_NA is described as containing NA, yet no
# imputation step is present and Lasso rejects NaN input -- confirm the data
# reaching this pipeline is complete.
price_pipe = Pipeline(steps=[
    # === mappers (project-local pp.Mapper, configured with the variable
    # lists and label -> integer dictionaries defined earlier) ===
    ('mapper_SOME',
     pp.Mapper(variables=SOME_VARS, mappings=SOME_MAPPINGS)),
    ('mapper_LOCATION',
     pp.Mapper(variables=LOCATION_VARS, mappings=LOCATION_MAPPINGS)),
    ('mapper_FINISHING',
     pp.Mapper(variables=FINISHING_VARS, mappings=FINISHING_MAPPINGS)),

    # rescale the features with MinMaxScaler before the linear model
    ('scaler', MinMaxScaler()),

    # Lasso regression; random_state fixed for reproducibility
    ('Lasso', Lasso(alpha=0.1, random_state=0)),
])

: 

In [None]:
# train the pipeline: fits every step of price_pipe on the training features
# against y_train, which an earlier cell replaced with its natural log
price_pipe.fit(X_train, y_train)

: 

In [None]:
# evaluate the model:
# ====================

def _print_metrics(split_name, y_true_log, y_pred_log):
    """Print MSE, RMSE and R2 for one data split on the original price scale.

    Parameters
    ----------
    split_name : str
        Prefix used in the printed lines (e.g. 'train' or 'test').
    y_true_log : array-like
        True target values on the log scale.
    y_pred_log : array-like
        Predicted target values on the log scale.

    Both inputs are exponentiated before scoring, because the target was
    log-transformed before the pipeline was fitted.
    """
    y_true = np.exp(y_true_log)
    y_pred = np.exp(y_pred_log)

    mse = mean_squared_error(y_true, y_pred)
    # RMSE is taken as sqrt(MSE) instead of passing `squared=False`: that
    # argument is deprecated since scikit-learn 1.4 and rejected by newer
    # releases, while np.sqrt works on every version.
    rmse = np.sqrt(mse)

    print('{} mse: {}'.format(split_name, int(mse)))
    print('{} rmse: {}'.format(split_name, int(rmse)))
    print('{} r2: {}'.format(split_name, r2_score(y_true, y_pred)))
    print()


# make predictions for train set
pred = price_pipe.predict(X_train)
_print_metrics('train', y_train, pred)

# make predictions for test set
pred = price_pipe.predict(X_test)
_print_metrics('test', y_test, pred)

# the statistic computed here is the median, so the label says so
print('Median lodge price: ', int(np.exp(y_train).median()))

: 

In [None]:
# compare the predictions with the true lodge prices.
# Both y_test and the pipeline output are on the log scale, so they are
# exponentiated here to match what the axis labels claim to show.
plt.scatter(np.exp(y_test), np.exp(price_pipe.predict(X_test)))
plt.xlabel('True Lodge Price')
plt.ylabel('Predicted Lodge Price')
plt.title('Evaluation of Lasso Predictions')
plt.show()

: 

In [None]:
# let's evaluate the distribution of the errors (on the log scale):
# they should be fairly normally distributed

preds = pd.Series(price_pipe.predict(X_test))

# subtract by position using a re-indexed copy; y_test itself is left
# untouched so it keeps the index it shares with X_test
errors = y_test.reset_index(drop=True) - preds
errors.hist(bins=30)
plt.show()

: 

In [None]:
# persist the fitted pipeline (the mappers, the scaler and the Lasso model)
# to disk
joblib.dump(value=price_pipe, filename='price_pipe.joblib')

: 

In [None]:
# load the unseen / new dataset
# (the name `data` is rebound here because the prediction cell reads it)
data = pd.read_csv('test.csv')

# keep only the model inputs, in training column order. 'Id' is not part of
# FEATURES, so this selection already discards it; a separate in-place drop
# is unnecessary and would raise KeyError on a file without an 'Id' column
data = data[FEATURES]

print(data.shape)

: 

In [None]:
# score the new data with the trained pipeline; the target was log-transformed
# before fitting, so these predictions are on the log scale
new_preds = price_pipe.predict(data)

: 

In [None]:
# distribution of the predicted prices, converted back from the log scale
predicted_prices = pd.Series(np.exp(new_preds))
predicted_prices.hist(bins=50)

: 