# Introduction: Business Problem 

In this project we will try to predict the monthly rental price for a condominium. Specifically, this report will be targeted to stakeholders interested in finding the best value in renting a condominium in Singapore.

We will use data science techniques to estimate the optimal rental price and to recommend to stakeholders both the best-value units and units similar to the ones they are interested in.

# Similar Listing Recommender
Create a tool that recommends similar listings according to a user's preferences, using an unsupervised machine learning method.

# Imports

In [69]:
#!pip install dill

In [70]:
import pandas as pd
import numpy as np


import pickle
import dill

from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin

In [71]:
# Load the pickled train and test DataFrames from the Data/ folder
df_train = pd.read_pickle('Data/df_train')
df_test = pd.read_pickle('Data/df_test')

In [72]:
# Inspect the row at position 524 of df_test (index label 7442 in the output below).
# It was identified as an anomaly in "03 Feature Modelling" and is removed in the next cell.
df_test.iloc[524:525]

Unnamed: 0,detailed_address,lat,long,bedrooms,bathrooms,sqft,built_year,amenities,mrt_name,building_name,tenure,link,picture_url,price_month,walking_time,distance,studio,district_number
7442,10 Martin Place,1.293351,103.838452,2.0,4.0,7646,2021.0,"[Water feature, Gym, Pool deck, Lounge, Jacuzz...",Somerset MRT,Martin Modern,0,https://www.99.co/singapore/rent/property/mart...,https://financialtribune.com/sites/default/fil...,6500,25.0,2454.0,0,9


In [73]:
# drop() works on index labels, so the anomaly is removed by its label 7442 (not by its position 524)
df_test = df_test.drop(index=7442)

In [74]:
# Merge df_train and df_test: unsupervised learning does not require a train/test split.
# ignore_index=True renumbers the combined rows from 0.
df_recommender = pd.concat([df_train, df_test], ignore_index=True)

In [75]:
# remove all amenities in listing except for swimming & gym as it is a common feature that user might be interested in
def relevantamenities(x: list):
    """Return the entries of ``x`` that are 'Gym' or 'Swimming pool'.

    The input order is kept and duplicates are not collapsed; every other
    amenity is discarded.
    """
    wanted = ('Gym', 'Swimming pool')

    kept = []
    for item in x:
        if item in wanted:
            kept.append(item)
    return kept

In [76]:
# keep only the 'Gym' / 'Swimming pool' entries in each listing's amenities list
df_recommender['amenities'] = df_recommender.amenities.apply(relevantamenities)

In [77]:
#transform the amenities into features
ct = make_column_transformer(
    # each 'amenities' cell is already a list of tokens, so the analyzer is an
    # identity function that hands the list to CountVectorizer unchanged
    (CountVectorizer(analyzer=lambda x: x), 'amenities'),
    # all remaining columns are passed through untouched
    remainder = 'passthrough',
)

In [78]:
# Fit and apply the transformer, then rebuild a DataFrame labelled with the generated feature names.
# NOTE(review): get_feature_names() is deprecated since scikit-learn 1.0 (replaced by get_feature_names_out());
# the rename in the next cell relies on the exact names produced here - confirm before upgrading.
df_recommender = pd.DataFrame(ct.fit_transform(df_recommender), columns=ct.get_feature_names())

In [79]:
#user input ['district', 'mrt', 'built_year', 'walking_time_to_mrt', 'sqft', 'pool', 'gym']

# rename the vectorizer output and the original columns to the user-input field names listed above
df_recommender = df_recommender.rename(columns={'countvectorizer__Gym':'gym', 
                                                'countvectorizer__Swimming pool':'pool', 
                                                'mrt_name':'mrt', 
                                                "walking_time":'walking_time_to_mrt', 
                                                'district_number':'district'})

In [80]:
# model_df holds only the columns used to find similar listings from the user input;
# df_recommender keeps every column so the matched listings can be displayed on the web app.
model_df = df_recommender.copy()
model_df = model_df[['district', 'mrt', 'built_year', 'walking_time_to_mrt', 'sqft', 'pool', 'gym', 'price_month']]

In [81]:
model_df.head(5)

Unnamed: 0,district,mrt,built_year,walking_time_to_mrt,sqft,pool,gym,price_month
0,9,Somerset MRT,2017.0,10.0,1044,1,0,7000
1,2,Tanjong Pagar MRT,2014.0,5.0,603,1,1,4500
2,10,Dover MRT,2005.0,30.0,1346,1,0,4800
3,9,Newton MRT,2016.0,10.0,624,0,1,3500
4,5,Pasir Panjang MRT,2018.0,5.0,603,0,0,3500


In [82]:
#changing datatypes
# Convert every column that can be parsed to a numeric dtype; columns that
# cannot (e.g. the 'mrt' station names) are left exactly as they are.
# An explicit try/except replaces errors='ignore', which is deprecated in
# recent pandas releases, while keeping the same outcome.
for col in model_df:
    try:
        model_df[col] = pd.to_numeric(model_df[col])
    except (ValueError, TypeError):
        # not numeric: keep the original column untouched
        continue

## Applying OneHotEncoder to MRT & MinMaxScaler to the numeric features

In [83]:
#one hot encode mrt column and min-max scale the numeric columns
ct = make_column_transformer(
    # handle_unknown='ignore': a station not seen during fit does not raise at transform time
    (OneHotEncoder(handle_unknown='ignore'), ['mrt']),
    (MinMaxScaler(), ['built_year', 'walking_time_to_mrt', 'sqft', 'price_month', 'district']),
    # the remaining columns of model_df ('pool', 'gym') are passed through unchanged
    remainder = 'passthrough',
)

In [84]:
# fit the encoder and scaler on model_df; the trailing ';' suppresses the notebook's repr output
ct.fit(model_df);

In [85]:
#note sklearn 1.0.0 has implemented methods that make it easier to extract the feature names from a column transformer
# NOTE(review): get_feature_names() used below is the pre-1.0 API (deprecated in favour of get_feature_names_out()).

#feature of OHE
feature1 = ct.named_transformers_.onehotencoder.get_feature_names().tolist()
#feature of MinMaxScaler (same order as the column list given to the transformer)
feature2 = ['built_year', 'walking_time_to_mrt', 'sqft', 'price_month', 'district']

#feature of passthrough
# the last entry of transformers_ is the remainder; its last element is used here to index model_df.columns
indices = ct.transformers_[-1][-1]
feature3 = model_df.columns[indices].tolist()

#combine 
all_feature = feature1 + feature2 + feature3

## Feature Weighting

We will write a class that allows us to adjust the weight of each feature according to how important that feature is to the user.

In [86]:
# the transformer output is converted with .toarray() and labelled with the combined feature names
model_df_tr = pd.DataFrame(ct.transform(model_df).toarray(), columns=all_feature)
model_df_tr;

In [87]:
class FeatureImportanceScale(BaseEstimator, TransformerMixin):
    """Stateless transformer that multiplies selected columns by user weights.

    ``importance`` maps a column name to the factor that column is
    multiplied by, eg::

        {'price_month': 100,
         'district': 100}

    Note: the current implementation does not work on a feature that is
    one-hot encoded.
    """

    def __init__(self, importance: dict):
        self.importance = importance

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with every weighted column rescaled.

        Columns not named in ``self.importance`` are carried over unchanged,
        and the scaling is applied to the copy rather than to ``X``.
        """
        weighted = X.copy()

        for column in self.importance:
            weighted[column] = weighted[column] * self.importance[column]

        return weighted
            
            
    

In [88]:
# weight price and district 100x so they dominate the distance between listings
# (the other features stay at their transformed scale)
importance = {'price_month' : 100,
              'district' : 100}
feature_importance = FeatureImportanceScale(importance)
model_df_tr = feature_importance.fit_transform(model_df_tr)

# NearestNeighbour

In [89]:
# fit a nearest-neighbour index on the weighted features, using scikit-learn's default settings
neigh = NearestNeighbors()
neigh.fit(model_df_tr);

In [90]:
# sanity check: query with an existing listing (row 15); the 15:16 slice keeps it as a one-row DataFrame
index = neigh.kneighbors(model_df_tr.iloc[15:16],  return_distance=False)

In [91]:
# flatten the returned index array to 1-D so it can be used as row positions
selected_index = np.reshape(index, -1)

# look the neighbours up in the full listing table (all display columns)
df_recommender.iloc[selected_index]

Unnamed: 0,gym,pool,detailed_address,lat,long,bedrooms,bathrooms,sqft,built_year,mrt,building_name,tenure,link,picture_url,price_month,walking_time_to_mrt,distance,studio,district
15,1,1,21 Claymore Road,1.30792,103.829688,3.0,5.0,1840,2010.0,Orchard MRT,The Tate Residences,1,https://www.99.co/singapore/rent/property/the-...,https://www.99.co/static/v3/icons/clubhouse@3x...,9800,10.0,487.0,0,9
4687,1,1,23 Claymore Road,1.30792,103.829688,3.0,3.0,1850,2010.0,Orchard MRT,The Tate Residences,1,https://www.99.co/singapore/rent/property/the-...,https://www.99.co/static/v3/icons/clubhouse@3x...,9600,10.0,487.0,0,9
5994,1,1,23 Claymore Road,1.30792,103.829688,3.0,2.0,1894,2010.0,Orchard MRT,The Tate Residences,1,https://www.99.co/singapore/rent/property/the-...,https://www.99.co/static/v3/icons/clubhouse@3x...,9500,10.0,487.0,0,9
2601,1,1,27 Claymore Road,1.30792,103.829688,3.0,3.0,2680,1985.0,Orchard MRT,The Claymore,1,https://www.99.co/singapore/rent/property/the-...,https://www.99.co/static/v3/icons/gym@3x.png,10000,10.0,489.0,0,9
1603,1,1,53 Cairnhill Road,1.302997,103.837785,3.0,4.0,2852,1978.0,Orchard MRT,Cairnhill Plaza,1,https://www.99.co/singapore/rent/property/cair...,https://www.99.co/static/v3/icons/gym@3x.png,9500,15.0,866.0,0,9


# Testing User Input

In [92]:
#test user input
# keys match the model_df column names expected by the fitted column transformer
user_input = {
    'district' : 7,
    'mrt' : 'Tampines MRT',
    'built_year' : 1980,
    'walking_time_to_mrt' : 22,
    'sqft' : 123,
    'pool' : 1,
    'gym': 1,
    'price_month' : 1489
}

# orient='index' gives one row per key; transposing turns that into a single row with one column per key
testing2 = pd.DataFrame.from_dict(user_input, orient='index').T

In [93]:
# Load the column transformer that was saved for the web app.
# Forward slashes replace the original back-slashes: "\s", "\m" and "\o" are
# invalid escape sequences in a normal string literal, and a back-slash path
# only resolves on Windows, whereas "/" works on Windows and POSIX alike.
# NOTE: pickle.load can execute arbitrary code - only load files produced by this project.
with open("WebApp/static/models/onehotencoder_Transformer.pkl", "rb") as to_load:
    wth = pickle.load(to_load)

In [94]:
# transform the user input with the transformer loaded from disk (overwritten by the next cell, which uses the in-memory ct)
user = pd.DataFrame(wth.transform(testing2).toarray(), columns=all_feature)

In [95]:
# encode and scale the user input with the fitted column transformer, labelled with the combined feature names
user = pd.DataFrame(ct.transform(testing2).toarray(), columns=all_feature)

In [96]:
# apply the same feature weights that were applied to the listings before fitting the neighbour index
user = feature_importance.transform(user)

In [97]:
# find the listings closest to the weighted user row and show their full records
index = neigh.kneighbors(user,  return_distance=False)
selected_index = np.reshape(index, -1)
df_recommender.iloc[selected_index]

Unnamed: 0,gym,pool,detailed_address,lat,long,bedrooms,bathrooms,sqft,built_year,mrt,building_name,tenure,link,picture_url,price_month,walking_time_to_mrt,distance,studio,district
6089,1,1,7500A Beach Road,1.298468,103.8581,1.0,1.0,350,1979.0,Nicoll Highway MRT,The Plaza,0,https://www.99.co/singapore/rent/property/the-...,https://www.99.co/static/v3/icons/aircon@3x.png,1299,5.0,300.0,1,7
2133,0,0,200 Jalan Sultan,1.302965,103.861447,1.0,1.0,250,1977.0,Lavender MRT,Textile Centre,0,https://www.99.co/singapore/rent/property/text...,https://pic.99.co/v3/fW3brnGdM5bMwyQDgBFhiW?wi...,1500,10.0,388.0,1,7
2848,0,0,200 Jalan Sultan,1.302965,103.861447,1.0,1.0,250,1977.0,Lavender MRT,Textile Centre,0,https://www.99.co/singapore/rent/property/text...,https://financialtribune.com/sites/default/fil...,1800,10.0,388.0,0,7
1578,0,0,463 Crawford Lane,1.304996,103.862496,1.0,1.0,300,1981.0,Lavender MRT,Crawford Court,0,https://www.99.co/singapore/rent/property/craw...,https://financialtribune.com/sites/default/fil...,1750,5.0,254.0,1,7
4844,0,0,1 Mcnally Street,1.302413,103.851247,1.0,1.0,250,1994.0,Rochor MRT,Lasalle College Of The Arts,0,https://www.99.co/singapore/rent/property/lasa...,https://financialtribune.com/sites/default/fil...,1750,5.0,167.0,1,7


Comparing the user input with the recommended units, the recommendations seem reasonable and follow the weightings set by the feature importance. Our content-based recommender can now be put into production.

# Setting Up for FlaskAPP

In [98]:
# #save transformer
# with open("WebApp\static\models\onehotencoder_Transformer.pkl", "wb") as to_save:
#     pickle.dump(ct, to_save)

# with open("WebApp\static\models\FeatureImportanceScale.joblib", "wb") as to_save:
#     dill.dump(feature_importance, to_save)

# #save model
# with open("WebApp\static\models\Kneighbour.pkl", "wb") as to_save:
#     pickle.dump(neigh, to_save)

# #save recommender database
# with open("WebApp\static\database\df_recommender.pkl", "wb") as to_save:
#     pickle.dump(df_recommender, to_save)



In [99]:
# Reload the saved artifacts the Flask app works with.
# Forward slashes replace the original back-slashes: sequences such as "\s",
# "\m", "\d" and "\K" are invalid escapes in a normal string literal, and a
# back-slash path only resolves on Windows, whereas "/" works on Windows and
# POSIX alike.
# NOTE: pickle.load / dill.load can execute arbitrary code - only load files
# produced by this project.

# fitted nearest-neighbour model
with open("WebApp/static/models/Kneighbour.pkl", "rb") as to_load:
    neigh = pickle.load(to_load)

# fitted column transformer (one-hot encoder + min-max scaler)
with open("WebApp/static/models/onehotencoder_Transformer.pkl", "rb") as to_load:
    ct = pickle.load(to_load)

# feature-weighting transformer; loaded with dill rather than pickle
with open("WebApp/static/models/FeatureImportanceScale.joblib", "rb") as to_load:
    feature_importance = dill.load(to_load)

# listing table used to display the recommended units
with open("WebApp/static/database/df_recommender.pkl", "rb") as to_load:
    df_recommender = pickle.load(to_load)
       

In [100]:
#create a function to combine above transformation steps for Flask app

def customtransformation(X):
    """
    Turn one raw user-input mapping into a weighted feature row.

    Chains the steps below, using the module-level ``ct`` and
    ``feature_importance`` objects:

    1.OneHotEncoder
    2.MinMaxScaler
    3.FeatureWeighting

    Returns the single-row DataFrame produced by ``feature_importance``.
    """
    # column labels: one-hot columns first, then the fixed list of remaining features
    ohe_columns = ct.named_transformers_.onehotencoder.get_feature_names().tolist()
    other_columns = ['built_year', 'walking_time_to_mrt', 'sqft', 'price_month', 'district', 'pool', 'gym']
    column_names = ohe_columns + other_columns

    # one row per key, transposed into a single row with one column per key
    raw_row = pd.DataFrame.from_dict(X, orient='index').T

    # a labelled DataFrame is needed because the weighting step selects columns by name
    encoded = ct.transform(raw_row).toarray()
    labelled = pd.DataFrame(encoded, columns=column_names)

    return feature_importance.transform(labelled)

In [101]:
def recommendlisting(user_input):
    """Return the listings most similar to ``user_input``.

    ``user_input`` is a mapping in the format accepted by
    ``customtransformation``. It is transformed into a weighted feature row,
    the fitted ``neigh`` model is queried with that row, and the matching
    rows of ``df_recommender`` are returned.
    """
    X = customtransformation(user_input)

    # Query with the row built from this call's input. The original passed the
    # notebook-level variable ``user`` here, so the function ignored its
    # argument and depended on a global that does not exist in the web app.
    index = neigh.kneighbors(X, return_distance=False)
    selected_index = np.reshape(index, -1)

    sim = df_recommender.iloc[selected_index]

    return sim

In [102]:
recommendlisting(user_input)

Unnamed: 0,gym,pool,detailed_address,lat,long,bedrooms,bathrooms,sqft,built_year,mrt,building_name,tenure,link,picture_url,price_month,walking_time_to_mrt,distance,studio,district
6089,1,1,7500A Beach Road,1.298468,103.8581,1.0,1.0,350,1979.0,Nicoll Highway MRT,The Plaza,0,https://www.99.co/singapore/rent/property/the-...,https://www.99.co/static/v3/icons/aircon@3x.png,1299,5.0,300.0,1,7
2133,0,0,200 Jalan Sultan,1.302965,103.861447,1.0,1.0,250,1977.0,Lavender MRT,Textile Centre,0,https://www.99.co/singapore/rent/property/text...,https://pic.99.co/v3/fW3brnGdM5bMwyQDgBFhiW?wi...,1500,10.0,388.0,1,7
2848,0,0,200 Jalan Sultan,1.302965,103.861447,1.0,1.0,250,1977.0,Lavender MRT,Textile Centre,0,https://www.99.co/singapore/rent/property/text...,https://financialtribune.com/sites/default/fil...,1800,10.0,388.0,0,7
1578,0,0,463 Crawford Lane,1.304996,103.862496,1.0,1.0,300,1981.0,Lavender MRT,Crawford Court,0,https://www.99.co/singapore/rent/property/craw...,https://financialtribune.com/sites/default/fil...,1750,5.0,254.0,1,7
4844,0,0,1 Mcnally Street,1.302413,103.851247,1.0,1.0,250,1994.0,Rochor MRT,Lasalle College Of The Arts,0,https://www.99.co/singapore/rent/property/lasa...,https://financialtribune.com/sites/default/fil...,1750,5.0,167.0,1,7
