# Recommendation system similarity matrix generator

For this project, given an apartment name, 3 types of recommendations would be generated:

1. Similar apartments based on nearby locations
2. Similar apartments based on pricing
3. Similar apartments based on facilities provided

In [1]:
import pandas as pd 
import numpy as np
import pickle as pkl
import ast

In [2]:
apartments = pd.read_csv('/home/siddesh/Desktop/Git_Repositories/Property_Price_Analysis_and_Prediction/data/appartments.csv')
apartments.head(5)

Unnamed: 0,PropertyName,PropertySubName,NearbyLocations,LocationAdvantages,Link,PriceDetails,TopFacilities
0,Smartworld One DXP,"2, 3, 4 BHK Apartment in Sector 113, Gurgaon","['Bajghera Road', 'Palam Vihar Halt', 'DPSG Pa...","{'Bajghera Road': '800 Meter', 'Palam Vihar Ha...",https://www.99acres.com/smartworld-one-dxp-sec...,"{'2 BHK': {'building_type': 'Apartment', 'area...","['Swimming Pool', 'Salon', 'Restaurant', 'Spa'..."
1,M3M Crown,"3, 4 BHK Apartment in Sector 111, Gurgaon","['DPSG Palam Vihar Gurugram', 'The NorthCap Un...","{'DPSG Palam Vihar Gurugram': '1.4 Km', 'The N...",https://www.99acres.com/m3m-crown-sector-111-g...,"{'3 BHK': {'building_type': 'Apartment', 'area...","['Bowling Alley', 'Mini Theatre', 'Manicured G..."
2,Adani Brahma Samsara Vilasa,"Land, 3, 4 BHK Independent Floor in Sector 63,...","['AIPL Business Club Sector 62', 'Heritage Xpe...","{'AIPL Business Club Sector 62': '2.7 Km', 'He...",https://www.99acres.com/adani-brahma-samsara-v...,{'3 BHK': {'building_type': 'Independent Floor...,"['Terrace Garden', 'Gazebo', 'Fountain', 'Amph..."
3,Sobha City,"2, 3, 4 BHK Apartment in Sector 108, Gurgaon","['The Shikshiyan School', 'WTC Plaza', 'Luxus ...","{'The Shikshiyan School': '2.9 KM', 'WTC Plaza...",https://www.99acres.com/sobha-city-sector-108-...,"{'2 BHK': {'building_type': 'Apartment', 'area...","['Swimming Pool', 'Volley Ball Court', 'Aerobi..."
4,Signature Global City 93,"2, 3 BHK Independent Floor in Sector 93 Gurgaon","['Pranavananda Int. School', 'DLF Site central...","{'Pranavananda Int. School': '450 m', 'DLF Sit...",https://www.99acres.com/signature-global-city-...,{'2 BHK': {'building_type': 'Independent Floor...,"['Mini Theatre', 'Doctor on Call', 'Concierge ..."


In [3]:
apartments.drop(22, inplace=True)

### 1. Apartment similarity based on Nearby Locations

Upon observation, it can be seen that the 'LocationAdvantages' column is a superset of 'NearbyLocations' column, hence it would be used for generating the similarities.

In [4]:
loc_advantage = apartments[['PropertyName', 'LocationAdvantages']]

In [5]:
loc_advantage.head(5)

Unnamed: 0,PropertyName,LocationAdvantages
0,Smartworld One DXP,"{'Bajghera Road': '800 Meter', 'Palam Vihar Ha..."
1,M3M Crown,"{'DPSG Palam Vihar Gurugram': '1.4 Km', 'The N..."
2,Adani Brahma Samsara Vilasa,"{'AIPL Business Club Sector 62': '2.7 Km', 'He..."
3,Sobha City,"{'The Shikshiyan School': '2.9 KM', 'WTC Plaza..."
4,Signature Global City 93,"{'Pranavananda Int. School': '450 m', 'DLF Sit..."


In [6]:
loc_advantage['LocationAdvantages'][0]

"{'Bajghera Road': '800 Meter', 'Palam Vihar Halt': '2.5 KM', 'DPSG Palam Vihar': '3.1 KM', 'Park Hospital': '3.1 KM', 'Gurgaon Railway Station': '4.9 KM', 'The NorthCap University': '5.4 KM', 'Dwarka Expy': '1.2 KM', 'Hyatt Place Gurgaon Udyog Vihar': '7.7 KM', 'Dwarka Sector 21, Metro Station': '7.2 KM', 'Pacific D21 Mall': '7.4 KM', 'Indira Gandhi International Airport': '14.7 KM', 'Hamoni Golf Camp': '6.2 KM', 'Fun N Food Waterpark': '8.8 KM', 'Accenture DDC5': '9 KM'}"

In [7]:
type(ast.literal_eval(loc_advantage['LocationAdvantages'][0]))

dict

Since the location advantages data is a dict in string format, the following steps would be performed to generate the required similarity matrix:

1. Parse the data as a dict and store with respective apartment name
2. Generate a List of all the Locations from the dict and populate the distances from the respective apartments
3. Use this as the vectorized representation for the Location advantages data for each apartment
4. Perform cosine distance based similarity for each apartment with others and generate similarity matrix

In [8]:
nearby_locations = {}
all_locations = set()

def get_locations(row):
    nearby_locations[row['PropertyName']] = ast.literal_eval(row['LocationAdvantages'])


    #Dealine with the input value cases
    for key, value in nearby_locations[row['PropertyName']].items():

        #eg : Within 900m
        if 'within' in value.lower():
            nearby_locations[row['PropertyName']][key] = value.split(' ')[1]
            value = value.split(' ')[1]

        #eg : Close proximity, in proximity, closeby, nearby, 5min, 10 min --> Setting it to default 500 m
        if 'close proximity' in value.lower() or 'in proximity' in value.lower() or 'closeby' in value.lower() or 'nearby' in value.lower() or 'min' in value.lower():
            nearby_locations[row['PropertyName']][key] = '500 m'
            value = '500 m'

        #Convert any distance in meters to kms.
        #Store only numeric part of the values
        if 'meter' in value.lower() or ('m' in value.lower() and 'km' not in value.lower()):
            if ' ' in value:
                nearby_locations[row['PropertyName']][key] = float(value.split(' ')[0])/1000
            else:
                nearby_locations[row['PropertyName']][key] = float(value.lower().replace('m','').replace('meter',''))
        else:
            if 'km' in value.lower():
                if ' ' in value:
                    nearby_locations[row['PropertyName']][key] = float(value.split(' ')[0])
                else:
                    nearby_locations[row['PropertyName']][key] = float(value.lower().replace('km',''))
            else:
                print(f'Neither KM nor Meter seen. Key is {key}, Value is {value}')
                if value.replace(' ', '').isnumeric():
                    print(f"Storing as {value}")
                    nearby_locations[row['PropertyName']][key] = float(value)
                else:
                    print(f"Storing as 100000")
                    nearby_locations[row['PropertyName']][key] = 100000


                
    all_locations.update(list(ast.literal_eval(row['LocationAdvantages']).keys()))

loc_advantage.apply(get_locations, axis=1)

print(f'Total apartments processed : {len(nearby_locations)}')
print(f'Total locations : {len(all_locations)}')

all_locations

Neither KM nor Meter seen. Key is Gurugram Road, Value is 1.4
Storing as 100000
Neither KM nor Meter seen. Key is INXT High Street, Value is reach
Storing as 100000
Total apartments processed : 246
Total locations : 1070


{'Delhi Public School Sector 84',
 'IGIA Airport',
 'INXT High Street',
 'Peer Baba Ki Mazar',
 'American Express',
 'Nora Solomon Medicenter',
 'Radisson Hotel',
 'Golf Course Extn Road',
 'Minda Industries  Corporate Office',
 'Kheri Railway station',
 'Ramada by Wyndham Gurgaon',
 'Polaris Hospital',
 'Shivai Hospital',
 'Golf Course Ext Road',
 'V-Square Sohna New Residential',
 'ESIC Dispensary',
 'Sector 42-43 Metro station',
 'Surajgarh Gurgaon',
 'Anya Gurgaon',
 'DPS Manesar',
 'Skylark Cricket Academy',
 'BSF Golf Course',
 'Sushant University',
 'NH 352W, Pataudi',
 'Muincipal Corporation of Gurugram',
 'Taxila cricket ground',
 "V'Lante Mall",
 'Leopard hills',
 'KIIT College',
 'The Hive Shopping Mall',
 'Sohna Rd',
 'Gems International School',
 'Gyan Bharti Public School',
 'Hyatt Regency Gurugram',
 'Info Technology Park Phase 2',
 'Faridabad Gurgaon Road',
 'IFFCO Chowk Metro Station',
 'Vallores Pre School',
 'Ektaa Hospital',
 'Pushpanjali Hospital',
 'Shalom Preside

In [9]:
#rows = apartments
#columns = size of the set with all locations
#create a list of lists, each list having the appartment name as first entry followed by the distance to the locations

location_distances = []

for apartment_name in nearby_locations.keys():
    entry = [apartment_name]
    for location in all_locations:
        if location in nearby_locations[apartment_name].keys():
            entry.append(nearby_locations[apartment_name][location])
        else:
            entry.append(100000)
    location_distances.append(entry)



In [10]:
print(f"Computed {len(location_distances)} rows, {len(location_distances[0])} columns")

Computed 246 rows, 1071 columns


In [11]:
location_computed_values = pd.DataFrame(data=location_distances, columns=['PropertyName']+list(all_locations))

In [12]:
location_computed_values.head(5)

Unnamed: 0,PropertyName,Delhi Public School Sector 84,IGIA Airport,INXT High Street,Peer Baba Ki Mazar,American Express,Nora Solomon Medicenter,Radisson Hotel,Golf Course Extn Road,Minda Industries Corporate Office,...,"Metro Hospital, Palam Vihar",MKD Hospital,Sapphire Mall,"Badshahpur Sohna Rd Hwy, Raghav Vatika",Holiday Inn,Yonex Badminton Stadium,Hyatt Regency Hotel,ASF Insignia SEZ,Northern Peripheral Road,Savoy Suites
0,Smartworld One DXP,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,...,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
1,M3M Crown,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,...,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
2,Adani Brahma Samsara Vilasa,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,...,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
3,Sobha City,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,...,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
4,Signature Global City 93,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,...,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0


In [14]:
with open('location_distances.pkl', 'wb') as location_save_file:
    pkl.dump(location_computed_values, location_save_file)

In [13]:
location_computed_values.shape

(246, 1071)

Standard scaling the values since distance based similarity metric is going to be used

In [14]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

location_computed_values.iloc[:,1:] = sc.fit_transform(location_computed_values.iloc[:,1:])

In [15]:
location_computed_values

Unnamed: 0,PropertyName,Vidya Niketan Sr Sec School,PNB ATM,Vasant Kunj,Bharat Petroleum Petrol Pump,Sushant University,Hong Kong Bazaar,NH-8 IMT Manesar,De Adventure Amusement Park,"Park Hospital, Palam Vihar",...,Marigold Secondary School,Golf Course Extn Road,Shree Deep Petrol Pump,Aarvy Healthcare Hospital,IndusInd Bank ATM,The Westin Hotel,The Executive Centre,Discount Department Store,Cricket Academy,"The Phoenix Project, Sohna - Gurgaon Rd"
0,Smartworld One DXP,0.063888,0.063888,0.063888,0.063888,0.358386,0.111111,0.063888,0.063888,0.090536,...,0.063888,0.063888,0.063888,0.144038,0.090536,0.063888,0.090536,0.063888,0.090536,0.063888
1,M3M Crown,0.063888,0.063888,0.063888,0.063888,0.358386,0.111111,0.063888,0.063888,-11.045406,...,0.063888,0.063888,0.063888,0.144038,0.090536,0.063888,0.090536,0.063888,0.090536,0.063888
2,Adani Brahma Samsara Vilasa,0.063888,0.063888,0.063888,0.063888,0.358386,0.111111,0.063888,0.063888,0.090536,...,0.063888,0.063888,0.063888,0.144038,0.090536,0.063888,0.090536,0.063888,0.090536,0.063888
3,Sobha City,0.063888,0.063888,-15.652476,0.063888,0.358386,0.111111,0.063888,0.063888,0.090536,...,0.063888,0.063888,0.063888,0.144038,0.090536,0.063888,0.090536,0.063888,0.090536,0.063888
4,Signature Global City 93,0.063888,0.063888,0.063888,0.063888,0.358386,0.111111,0.063888,0.063888,0.090536,...,0.063888,0.063888,0.063888,0.144038,0.090536,0.063888,0.090536,0.063888,0.090536,0.063888
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
241,DLF Princeton Estate,0.063888,0.063888,0.063888,0.063888,0.358386,0.111111,0.063888,0.063888,0.090536,...,0.063888,0.063888,0.063888,0.144038,0.090536,0.063888,0.090536,0.063888,0.090536,0.063888
242,Pyramid Urban Homes 2,0.063888,0.063888,0.063888,0.063888,0.358386,0.111111,0.063888,0.063888,0.090536,...,0.063888,0.063888,0.063888,0.144038,0.090536,0.063888,0.090536,0.063888,0.090536,0.063888
243,Satya The Hermitage,0.063888,0.063888,0.063888,0.063888,0.358386,0.111111,0.063888,0.063888,0.090536,...,0.063888,0.063888,0.063888,0.144038,0.090536,0.063888,0.090536,0.063888,0.090536,0.063888
244,BPTP Spacio,0.063888,0.063888,0.063888,0.063888,0.358386,0.111111,0.063888,0.063888,0.090536,...,0.063888,0.063888,0.063888,0.144038,0.090536,0.063888,0.090536,0.063888,0.090536,0.063888


Compute cosine distance to get similarity scores

In [16]:
from sklearn.metrics.pairwise import cosine_similarity

similarity_location_advantage = cosine_similarity(location_computed_values.iloc[:,1:])

In [17]:
#Pairwise similarity generated for each property with every other property based on nearby locations data

similarity_location_advantage.shape

(246, 246)

### 2. Apartment similarity based on Pricing details

In [18]:
price_details_df =apartments[['PropertyName','PriceDetails']]

apartments[['PropertyName','PriceDetails']].head(5)

Unnamed: 0,PropertyName,PriceDetails
0,Smartworld One DXP,"{'2 BHK': {'building_type': 'Apartment', 'area..."
1,M3M Crown,"{'3 BHK': {'building_type': 'Apartment', 'area..."
2,Adani Brahma Samsara Vilasa,{'3 BHK': {'building_type': 'Independent Floor...
3,Sobha City,"{'2 BHK': {'building_type': 'Apartment', 'area..."
4,Signature Global City 93,{'2 BHK': {'building_type': 'Independent Floor...


In [19]:
apartments['PriceDetails'][0]

"{'2 BHK': {'building_type': 'Apartment', 'area_type': 'Carpet Area', 'area': '1,370 sq.ft.', 'price-range': '₹ 2 - 2.4 Cr'}, '3 BHK': {'building_type': 'Apartment', 'area_type': 'Carpet Area', 'area': '1,850 - 2,050 sq.ft.', 'price-range': '₹ 2.25 - 3.59 Cr'}, '4 BHK': {'building_type': 'Apartment', 'area_type': 'Carpet Area', 'area': '2,600 sq.ft.', 'price-range': '₹ 3.24 - 4.56 Cr'}}"

The price details column again is a dict with 1 BHK, 2BHK, ... as keys and the information of 'building_type', 'area_type'. 'Carpet area', 'area', 'price-range' for each BHK. This string needs to be parsed as a dict and a Dataframe of these details needs to be created to be able to obtain an embedding.

In [20]:
print(ast.literal_eval(apartments['PriceDetails'][0]))
type(ast.literal_eval(apartments['PriceDetails'][0]))

{'2 BHK': {'building_type': 'Apartment', 'area_type': 'Carpet Area', 'area': '1,370 sq.ft.', 'price-range': '₹ 2 - 2.4 Cr'}, '3 BHK': {'building_type': 'Apartment', 'area_type': 'Carpet Area', 'area': '1,850 - 2,050 sq.ft.', 'price-range': '₹ 2.25 - 3.59 Cr'}, '4 BHK': {'building_type': 'Apartment', 'area_type': 'Carpet Area', 'area': '2,600 sq.ft.', 'price-range': '₹ 3.24 - 4.56 Cr'}}


dict

In [21]:
price_details_keys = ['building_type', 'area_low', 'area_high', 'price_low', 'price_high']

all_keys = set()
apartments_price_data = {}

def parse_price_details(row):
    #For each row, check the following:
    #First check the available number of BHKs or related keys in the dict (1 BHK, 2 BHK, ...)
    #For each available BHK, obtain the details of keys from 'price_details_keys' and create a new key by concatenating the 2 keys, update in all_keys set
    #eg: building_type_1_BHK, ...
    #Populate the apartments_price_data dict with the created key and value obtained from the row parsed

    apartments_price_data[row['PropertyName']] = {}

    #Validation checks : 
    #1. Area data if present must be in sq.ft.
    #2. Price data if present must be in cr
    #3. If range not present, then price low and price high will be marked with same value. Same for area.

    current_dict = ast.literal_eval(row['PriceDetails'])
    for key in current_dict.keys():
        if 'building_type' in current_dict[key].keys():
            all_keys.add('building_type'+key)
            apartments_price_data[row['PropertyName']]['building_type'+key] = current_dict[key]['building_type']
        if 'area' in current_dict[key].keys():
            if not 'sq.ft.' in current_dict[key]['area'].lower():
                print(f"Error, the area value is {current_dict[key]['area'].lower()}")
            else:
                if '-' in current_dict[key]['area']:
                    #Range values present
                    area_values = current_dict[key]['area'].lower().split('-')
                    all_keys.add('area_low'+key)
                    apartments_price_data[row['PropertyName']]['area_low'+key] = float(area_values[0].replace(',','').replace(' ','').replace('sq.ft.',''))
                    all_keys.add('area_high'+key)
                    apartments_price_data[row['PropertyName']]['area_high'+key] = float(area_values[1].replace(',','').replace(' ','').replace('sq.ft.',''))
                else:
                    area_value = current_dict[key]['area'].lower()
                    all_keys.add('area_low'+key)
                    apartments_price_data[row['PropertyName']]['area_low'+key] = float(area_value.replace(',','').replace(' ','').replace('sq.ft.',''))
                    all_keys.add('area_high'+key)
                    apartments_price_data[row['PropertyName']]['area_high'+key] = float(area_value.replace(',','').replace(' ','').replace('sq.ft.',''))
        if 'price-range' in current_dict[key].keys():
            if 'price on request' in current_dict[key]['price-range'].lower():
                print(f"Error, the price value is {current_dict[key]['price-range'].lower()}")
            #if not 'cr' in current_dict[key]['price-range'].lower():
            #    print(f"Error, the price value is {current_dict[key]['price-range'].lower()}")
            else:
                if '-' in current_dict[key]['price-range']:
                    #Range values present
                    price_values = current_dict[key]['price-range'].lower().split('-')
                    all_keys.add('price_low'+key)
                    if 'l' in price_values[0].lower():
                        apartments_price_data[row['PropertyName']]['price_low'+key] = (float(price_values[0].lower().replace('l','').replace(',','').replace(' ','').replace('cr','').replace('₹','')))/100
                    else:
                        apartments_price_data[row['PropertyName']]['price_low'+key] = float(price_values[0].replace(',','').replace(' ','').replace('cr','').replace('₹',''))
                    all_keys.add('price_high'+key)
                    if 'l' in price_values[1].lower():
                        apartments_price_data[row['PropertyName']]['price_high'+key] = (float(price_values[1].lower().replace('l','').replace(',','').replace(' ','').replace('cr','').replace('₹','')))/100
                    else:
                        apartments_price_data[row['PropertyName']]['price_high'+key] = float(price_values[1].replace(',','').replace(' ','').replace('cr','').replace('₹',''))
                else:
                    price_value = current_dict[key]['price-range'].lower()
                    if 'l' in price_value.lower():
                        all_keys.add('price_low'+key)
                        all_keys.add('price_high'+key)
                        apartments_price_data[row['PropertyName']]['price_low'+key] =(float(price_value.lower().replace('l','').replace(',','').replace(' ','').replace('cr','').replace('₹','')))/100
                        apartments_price_data[row['PropertyName']]['price_high'+key] =(float(price_value.lower().replace('l','').replace(',','').replace(' ','').replace('cr','').replace('₹','')))/100
                    else:
                        all_keys.add('price_low'+key)
                        apartments_price_data[row['PropertyName']]['price_low'+key] = float(price_value.replace(',','').replace(' ','').replace('cr','').replace('₹',''))
                        all_keys.add('price_high'+key)
                        apartments_price_data[row['PropertyName']]['price_high'+key] = float(price_value.replace(',','').replace(' ','').replace('cr','').replace('₹',''))
                
                
                    

In [22]:
price_details_df.apply(parse_price_details, axis=1)

Error, the price value is price on request
Error, the price value is price on request
Error, the price value is price on request
Error, the price value is price on request
Error, the price value is price on request
Error, the price value is price on request
Error, the price value is price on request
Error, the price value is price on request
Error, the price value is price on request
Error, the price value is price on request
Error, the price value is price on request
Error, the price value is price on request
Error, the price value is price on request
Error, the price value is price on request
Error, the price value is price on request
Error, the price value is price on request
Error, the price value is price on request
Error, the price value is price on request
Error, the price value is price on request
Error, the price value is price on request
Error, the price value is price on request
Error, the price value is price on request
Error, the price value is price on request
Error, the 

0      None
1      None
2      None
3      None
4      None
       ... 
242    None
243    None
244    None
245    None
246    None
Length: 246, dtype: object

In [23]:
apartments_price_data

{'Smartworld One DXP': {'building_type2 BHK': 'Apartment',
  'area_low2 BHK': 1370.0,
  'area_high2 BHK': 1370.0,
  'price_low2 BHK': 2.0,
  'price_high2 BHK': 2.4,
  'building_type3 BHK': 'Apartment',
  'area_low3 BHK': 1850.0,
  'area_high3 BHK': 2050.0,
  'price_low3 BHK': 2.25,
  'price_high3 BHK': 3.59,
  'building_type4 BHK': 'Apartment',
  'area_low4 BHK': 2600.0,
  'area_high4 BHK': 2600.0,
  'price_low4 BHK': 3.24,
  'price_high4 BHK': 4.56},
 'M3M Crown': {'building_type3 BHK': 'Apartment',
  'area_low3 BHK': 1605.0,
  'area_high3 BHK': 2170.0,
  'price_low3 BHK': 2.2,
  'price_high3 BHK': 3.03,
  'building_type4 BHK': 'Apartment',
  'area_low4 BHK': 2248.0,
  'area_high4 BHK': 2670.0,
  'price_low4 BHK': 3.08,
  'price_high4 BHK': 3.73},
 'Adani Brahma Samsara Vilasa': {'building_type3 BHK': 'Independent Floor',
  'area_low3 BHK': 1800.0,
  'area_high3 BHK': 3150.0,
  'price_low3 BHK': 2.43,
  'price_high3 BHK': 15.75,
  'building_type4 BHK': 'Independent Floor',
  'area_low

In [24]:
len(apartments_price_data)

246

In [25]:
print(len(all_keys))
all_keys

40


{'area_high1 BHK',
 'area_high1 RK',
 'area_high2 BHK',
 'area_high3 BHK',
 'area_high4 BHK',
 'area_high5 BHK',
 'area_high6 BHK',
 'area_highLand',
 'area_low1 BHK',
 'area_low1 RK',
 'area_low2 BHK',
 'area_low3 BHK',
 'area_low4 BHK',
 'area_low5 BHK',
 'area_low6 BHK',
 'area_lowLand',
 'building_type1 BHK',
 'building_type1 RK',
 'building_type2 BHK',
 'building_type3 BHK',
 'building_type4 BHK',
 'building_type5 BHK',
 'building_type6 BHK',
 'building_typeLand',
 'price_high1 BHK',
 'price_high1 RK',
 'price_high2 BHK',
 'price_high3 BHK',
 'price_high4 BHK',
 'price_high5 BHK',
 'price_high6 BHK',
 'price_highLand',
 'price_low1 BHK',
 'price_low1 RK',
 'price_low2 BHK',
 'price_low3 BHK',
 'price_low4 BHK',
 'price_low5 BHK',
 'price_low6 BHK',
 'price_lowLand'}

In [26]:
pricing_data_combined = []

for property, price_data in apartments_price_data.items():
    current_data = [property]
    for key in all_keys:
        if key in price_data.keys():
            current_data.append(price_data[key])
        else:
            current_data.append(0.0)

    pricing_data_combined.append(current_data)


In [27]:
print(f"Rows : {len(pricing_data_combined)} Columns : {len(pricing_data_combined[0])}")

Rows : 246 Columns : 41


In [28]:
pricing_data_combined[0]

['Smartworld One DXP',
 0.0,
 1850.0,
 2600.0,
 'Apartment',
 0.0,
 3.59,
 0.0,
 0.0,
 0.0,
 0.0,
 2.4,
 2050.0,
 3.24,
 0.0,
 1370.0,
 0.0,
 0.0,
 0.0,
 0.0,
 2.25,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 'Apartment',
 2600.0,
 0.0,
 0.0,
 0.0,
 2.0,
 0.0,
 1370.0,
 'Apartment',
 4.56,
 0.0]

Convert above data into dataframe

In [29]:
price_info_df = pd.DataFrame(data=pricing_data_combined, columns = ['PropertyName']+list(all_keys))

In [30]:
price_info_df.head(5)

Unnamed: 0,PropertyName,price_high6 BHK,area_low3 BHK,area_low4 BHK,building_type3 BHK,price_high1 RK,price_high3 BHK,building_typeLand,area_high1 BHK,price_low6 BHK,...,area_high4 BHK,area_highLand,area_high6 BHK,price_high1 BHK,price_low2 BHK,price_low5 BHK,area_high2 BHK,building_type4 BHK,price_high4 BHK,price_highLand
0,Smartworld One DXP,0.0,1850.0,2600.0,Apartment,0.0,3.59,0.0,0.0,0.0,...,2600.0,0.0,0.0,0.0,2.0,0.0,1370.0,Apartment,4.56,0.0
1,M3M Crown,0.0,1605.0,2248.0,Apartment,0.0,3.03,0.0,0.0,0.0,...,2670.0,0.0,0.0,0.0,0.0,0.0,0.0,Apartment,3.73,0.0
2,Adani Brahma Samsara Vilasa,0.0,1800.0,2750.0,Independent Floor,0.0,15.75,,0.0,0.0,...,4500.0,4329.0,0.0,0.0,0.0,0.0,0.0,Independent Floor,22.5,41.13
3,Sobha City,0.0,1711.0,2423.0,Apartment,0.0,4.79,0.0,0.0,0.0,...,2963.0,0.0,0.0,0.0,1.55,0.0,1692.0,Apartment,6.06,0.0
4,Signature Global City 93,0.0,1235.0,0.0,Independent Floor,0.0,1.45,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.9301,0.0,1118.0,0.0,0.0,0.0


In [31]:
price_info_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 246 entries, 0 to 245
Data columns (total 41 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   PropertyName        246 non-null    object 
 1   price_high6 BHK     246 non-null    float64
 2   area_low3 BHK       246 non-null    float64
 3   area_low4 BHK       246 non-null    float64
 4   building_type3 BHK  246 non-null    object 
 5   price_high1 RK      246 non-null    float64
 6   price_high3 BHK     246 non-null    float64
 7   building_typeLand   246 non-null    object 
 8   area_high1 BHK      246 non-null    float64
 9   price_low6 BHK      246 non-null    float64
 10  price_lowLand       246 non-null    float64
 11  price_high2 BHK     246 non-null    float64
 12  area_high3 BHK      246 non-null    float64
 13  price_low4 BHK      246 non-null    float64
 14  area_high1 RK       246 non-null    float64
 15  area_low2 BHK       246 non-null    float64
 16  building

All of the columns with 'building_type' prefix are String columns. Hence they need to be one hot encoded to convert from categorical to numeric.

In [32]:
cat_columns = [col for col in price_info_df.columns if price_info_df[col].dtype ==object]

In [33]:
cat_columns

['PropertyName',
 'building_type3 BHK',
 'building_typeLand',
 'building_type6 BHK',
 'building_type1 RK',
 'building_type5 BHK',
 'building_type1 BHK',
 'building_type2 BHK',
 'building_type4 BHK']

In [34]:
cat_columns = cat_columns[1:]

In [35]:
price_info_df_final = pd.get_dummies(price_info_df, columns=cat_columns, drop_first=True, dtype=float)

In [36]:
price_info_df_final.head(5)

Unnamed: 0,PropertyName,price_high6 BHK,area_low3 BHK,area_low4 BHK,price_high1 RK,price_high3 BHK,area_high1 BHK,price_low6 BHK,price_lowLand,price_high2 BHK,...,building_type5 BHK_Independent Floor,building_type5 BHK_Villa,building_type1 BHK_Apartment,building_type1 BHK_Service Apartment,building_type2 BHK_Apartment,building_type2 BHK_Independent Floor,building_type2 BHK_Service Apartment,building_type4 BHK_Apartment,building_type4 BHK_Independent Floor,building_type4 BHK_Villa
0,Smartworld One DXP,0.0,1850.0,2600.0,0.0,3.59,0.0,0.0,0.0,2.4,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,M3M Crown,0.0,1605.0,2248.0,0.0,3.03,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,Adani Brahma Samsara Vilasa,0.0,1800.0,2750.0,0.0,15.75,0.0,0.0,2.05,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,Sobha City,0.0,1711.0,2423.0,0.0,4.79,0.0,0.0,0.0,3.21,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,Signature Global City 93,0.0,1235.0,0.0,0.0,1.45,0.0,0.0,0.0,1.06,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


Standard scaling the values and computing similarity matrix between apartments

In [37]:
sc = StandardScaler()

price_info_df_final.iloc[:,1:] = sc.fit_transform(price_info_df_final.iloc[:,1:])

In [38]:
price_info_df_final.head(5)

Unnamed: 0,PropertyName,price_high6 BHK,area_low3 BHK,area_low4 BHK,price_high1 RK,price_high3 BHK,area_high1 BHK,price_low6 BHK,price_lowLand,price_high2 BHK,...,building_type5 BHK_Independent Floor,building_type5 BHK_Villa,building_type1 BHK_Apartment,building_type1 BHK_Service Apartment,building_type2 BHK_Apartment,building_type2 BHK_Independent Floor,building_type2 BHK_Service Apartment,building_type4 BHK_Apartment,building_type4 BHK_Independent Floor,building_type4 BHK_Villa
0,Smartworld One DXP,-0.073387,0.553787,0.602838,-0.082725,0.406577,-0.169584,-0.077649,-0.216398,1.044465,...,-0.111111,-0.216353,-0.254824,-0.111111,1.198183,-0.28931,-0.063888,1.058626,-0.254824,-0.236208
1,M3M Crown,-0.073387,0.293086,0.382746,-0.082725,0.238159,-0.169584,-0.077649,-0.216398,-0.458823,...,-0.111111,-0.216353,-0.254824,-0.111111,-0.834597,-0.28931,-0.063888,1.058626,-0.254824,-0.236208
2,Adani Brahma Samsara Vilasa,-0.073387,0.500583,0.696627,-0.082725,4.063663,-0.169584,-0.077649,0.613415,-0.458823,...,-0.111111,-0.216353,-0.254824,-0.111111,-0.834597,-0.28931,-0.063888,-0.94462,3.924283,-0.236208
3,Sobha City,-0.073387,0.405879,0.492166,-0.082725,0.767474,-0.169584,-0.077649,-0.216398,1.551825,...,-0.111111,-0.216353,-0.254824,-0.111111,1.198183,-0.28931,-0.063888,1.058626,-0.254824,-0.236208
4,Signature Global City 93,-0.073387,-0.100626,-1.022842,-0.082725,-0.237022,-0.169584,-0.077649,-0.216398,0.20513,...,-0.111111,-0.216353,-0.254824,-0.111111,-0.834597,3.456497,-0.063888,-0.94462,-0.254824,-0.236208


In [39]:
similarity_price_details = cosine_similarity(price_info_df_final.iloc[:,1:])

In [40]:
similarity_price_details.shape

(246, 246)

### 3. Apartment similarity based on Amneties provided

Here, we would create an embedding for each apartment based on the amneties provided, and use this to get a similarity matrix between apartments. The embedding would be a TfIdf based embedding, where each string is the concatenation of the words from the list of facilities.

In [41]:
apartment_facilities = apartments[['PropertyName', 'TopFacilities']]

In [42]:
apartment_facilities.head(5)

Unnamed: 0,PropertyName,TopFacilities
0,Smartworld One DXP,"['Swimming Pool', 'Salon', 'Restaurant', 'Spa'..."
1,M3M Crown,"['Bowling Alley', 'Mini Theatre', 'Manicured G..."
2,Adani Brahma Samsara Vilasa,"['Terrace Garden', 'Gazebo', 'Fountain', 'Amph..."
3,Sobha City,"['Swimming Pool', 'Volley Ball Court', 'Aerobi..."
4,Signature Global City 93,"['Mini Theatre', 'Doctor on Call', 'Concierge ..."


In [43]:
facilities_mapper = {}
all_facilities_sent = []


def parse_facilities(row):
    current_data = ast.literal_eval(row['TopFacilities'])
    if type(current_data) is not list:
        print(f"Error!.. incorrect dtype. value is {current_data}")
    facilities_mapper[row['PropertyName']] = ' '.join(current_data)
    all_facilities_sent.append(' '.join(current_data))

apartment_facilities.apply(parse_facilities, axis=1)

0      None
1      None
2      None
3      None
4      None
       ... 
242    None
243    None
244    None
245    None
246    None
Length: 246, dtype: object

In [44]:
print(f"Total apartments : {len(facilities_mapper)}")

print(f"Total number of unique facilities : {len(all_facilities_sent)}")

Total apartments : 246
Total number of unique facilities : 246


In [45]:
facilities_mapper

{'Smartworld One DXP': 'Swimming Pool Salon Restaurant Spa Cafeteria Sun Deck 24x7 Security Club House Gated Community',
 'M3M Crown': 'Bowling Alley Mini Theatre Manicured Garden Swimming Pool Flower Garden Reading Lounge Golf Course Barbecue Sauna',
 'Adani Brahma Samsara Vilasa': 'Terrace Garden Gazebo Fountain Amphitheatre Party Lawn Basketball Court Badminton Court Yoga/Meditation Area Indoor Games',
 'Sobha City': 'Swimming Pool Volley Ball Court Aerobics Centre Card Room Barbecue Sauna Steam Room Creche/Day care Skating Rink',
 'Signature Global City 93': 'Mini Theatre Doctor on Call Concierge Service Swimming Pool Bar/Chill-Out Lounge Laundry Flower Garden Reflexology Park Salon',
 'Whiteland The Aspen': 'Reflexology Park Card Room High Speed Elevators Sauna Jacuzzi Spa Entrance Lobby Yoga/Meditation Area Club House',
 'Bestech Altura': 'Swimming Pool Football Flower Garden Reading Lounge School Pergola Skating Rink Grocery Shop Squash Court',
 'Elan The Presidential': 'Swimmin

Apply Tf-Idf vectorization to convert the string for each apartment into an embedding vector

In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer


tfidf_vec = TfidfVectorizer(stop_words = 'english', ngram_range=(1,2))
tfidf_result = tfidf_vec.fit_transform(all_facilities_sent)

In [47]:
print(tfidf_vec.get_feature_names_out().shape)
tfidf_vec.get_feature_names_out()

(973,)


array(['24', '24 power', '24 water', '24x7', '24x7 security',
       'acupressure', 'acupressure park', 'aerobics', 'aerobics centre',
       'air', 'air hockey', 'alley', 'alley concierge', 'alley mini',
       'alley swimming', 'amphitheatre', 'amphitheatre badminton',
       'amphitheatre basketball', 'amphitheatre community',
       'amphitheatre earthquake', 'amphitheatre jogging',
       'amphitheatre party', 'amphitheatre power', 'amphitheatre sewage',
       'amphitheatre toddler', 'amphitheatre yoga', 'area', 'area 24',
       'area 24x7', 'area aerobics', 'area atm', 'area car', 'area cctv',
       'area club', 'area garbage', 'area gated', 'area gazebo',
       'area gymnasium', 'area indoor', 'area internal', 'area jogging',
       'area lift', 'area rain', 'area ro', 'area salon', 'area sauna',
       'area school', 'area sewage', 'area swimming', 'atm',
       'atm aerobics', 'atm cafeteria', 'atm card', 'atm creche',
       'atm high', 'atm jacuzzi', 'atm piped', 'atm re

In [48]:
#How many entries for first sentence (row 0)
tfidf_result[0,:].toarray()[0].shape

(973,)

In [49]:
similarity_facilities = cosine_similarity(tfidf_result, tfidf_result)

In [50]:
similarity_facilities.shape

(246, 246)

### Save the similarity matrices to file

In [54]:
import pickle as pkl


#similarity_location_advantage
#similarity_price_details
#similarity_facilities

with open('similarity_facilities.pkl', 'wb') as file:
    pkl.dump(similarity_facilities, file)

with open('similarity_price_details.pkl', 'wb') as file:
    pkl.dump(similarity_price_details, file)

with open('similarity_location_advantage.pkl', 'wb') as file:
    pkl.dump(similarity_location_advantage, file)

with open('apartment_names.pkl', 'wb') as file:
    pkl.dump(apartments['PropertyName'], file)