In [1]:
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
df = pd.read_csv('appartments.csv')

In [3]:
df.iloc[22,:]
# this row is wrong so we will remove it

PropertyName                PropertyName
PropertySubName          PropertySubName
NearbyLocations          NearbyLocations
LocationAdvantages    LocationAdvantages
Link                                Link
PriceDetails                PriceDetails
TopFacilities              TopFacilities
Name: 22, dtype: object

In [4]:
df = df.drop(22)

In [5]:
df.shape

(246, 7)

In [6]:
df.head()

Unnamed: 0,PropertyName,PropertySubName,NearbyLocations,LocationAdvantages,Link,PriceDetails,TopFacilities
0,Smartworld One DXP,"2, 3, 4 BHK Apartment in Sector 113, Gurgaon","['Bajghera Road', 'Palam Vihar Halt', 'DPSG Pa...","{'Bajghera Road': '800 Meter', 'Palam Vihar Ha...",https://www.99acres.com/smartworld-one-dxp-sec...,"{'2 BHK': {'building_type': 'Apartment', 'area...","['Swimming Pool', 'Salon', 'Restaurant', 'Spa'..."
1,M3M Crown,"3, 4 BHK Apartment in Sector 111, Gurgaon","['DPSG Palam Vihar Gurugram', 'The NorthCap Un...","{'DPSG Palam Vihar Gurugram': '1.4 Km', 'The N...",https://www.99acres.com/m3m-crown-sector-111-g...,"{'3 BHK': {'building_type': 'Apartment', 'area...","['Bowling Alley', 'Mini Theatre', 'Manicured G..."
2,Adani Brahma Samsara Vilasa,"Land, 3, 4 BHK Independent Floor in Sector 63,...","['AIPL Business Club Sector 62', 'Heritage Xpe...","{'AIPL Business Club Sector 62': '2.7 Km', 'He...",https://www.99acres.com/adani-brahma-samsara-v...,{'3 BHK': {'building_type': 'Independent Floor...,"['Terrace Garden', 'Gazebo', 'Fountain', 'Amph..."
3,Sobha City,"2, 3, 4 BHK Apartment in Sector 108, Gurgaon","['The Shikshiyan School', 'WTC Plaza', 'Luxus ...","{'The Shikshiyan School': '2.9 KM', 'WTC Plaza...",https://www.99acres.com/sobha-city-sector-108-...,"{'2 BHK': {'building_type': 'Apartment', 'area...","['Swimming Pool', 'Volley Ball Court', 'Aerobi..."
4,Signature Global City 93,"2, 3 BHK Independent Floor in Sector 93 Gurgaon","['Pranavananda Int. School', 'DLF Site central...","{'Pranavananda Int. School': '450 m', 'DLF Sit...",https://www.99acres.com/signature-global-city-...,{'2 BHK': {'building_type': 'Independent Floor...,"['Mini Theatre', 'Doctor on Call', 'Concierge ..."


### There are two types of information we are getting from PropertySubName: 
1. Types of flats
2. Which sector it belongs to

This column seems important

### NearbyLocations. This column seems to be a subset of LocationAdvantages column, so we can drop it

In [7]:
df.iloc[2].NearbyLocations

"['AIPL Business Club Sector 62', 'Heritage Xperiential Learning School', 'CK Birla Hospital', 'Paras Trinity Mall Sector 63', 'Rapid Metro Station Sector 56']"

In [8]:
df.iloc[2].LocationAdvantages

"{'AIPL Business Club Sector 62': '2.7 Km', 'Heritage Xperiential Learning School': '2 Km', 'CK Birla Hospital': '2.5 Km', 'Paras Trinity Mall Sector 63': '3.5 Km', 'Rapid Metro Station Sector 56': '3.8 Km', 'De Adventure Park': '6.8 Km', 'Golf Course Ext Rd': '99 Meter', 'DoubleTree by Hilton Hotel Gurgaon': '3.6 Km', 'KIIT College of Engineering Sohna Road': '8.4 Km', 'Mehrauli-Gurgaon Road': '11.8 Km', 'Indira Gandhi International Airport': '21.1 Km', 'Nirvana Rd': '160 Meter', 'TERI Golf Course': '8.7 Km'}"

In [9]:
df.iloc[1].PriceDetails

# This is giving detailed information from the column PropertySubName

"{'3 BHK': {'building_type': 'Apartment', 'area_type': 'Super Built-up Area', 'area': '1,605 - 2,170 sq.ft.', 'price-range': '₹ 2.2 - 3.03 Cr'}, '4 BHK': {'building_type': 'Apartment', 'area_type': 'Super Built-up Area', 'area': '2,248 - 2,670 sq.ft.', 'price-range': '₹ 3.08 - 3.73 Cr'}}"

In [10]:
# We won't use the PropertySubName and NearbyLocations column in our recommender system, 
# as they are provided in more detailed way in LocationAdvantages and PriceDetails column


In [11]:
df.iloc[2].TopFacilities

# tells the amenties in that society

"['Terrace Garden', 'Gazebo', 'Fountain', 'Amphitheatre', 'Party Lawn', 'Basketball Court', 'Badminton Court', 'Yoga/Meditation Area', 'Indoor Games']"

### We will make 3 recommender systems
1. Based on LocationAdvantages
2. Based on PriceDetails
3. Based on TopFacilities

Suppose now we want to recommend the top 5 similar apartments to a given input.
Then as the user will input a society name, then we will ask all the 3 recommendation systems. Then all 3 of them will give some results.
Then at the end we will merge their results and give final 5 recommendations as output

### We could have made only one single recommender system also. But we want to assign weights to all these 3 recommender systems. We can change their weights anytime we want to modify the result accordingly. This introduces additional flexibility in the system.

In [12]:
df[['PropertyName','TopFacilities']]

Unnamed: 0,PropertyName,TopFacilities
0,Smartworld One DXP,"['Swimming Pool', 'Salon', 'Restaurant', 'Spa'..."
1,M3M Crown,"['Bowling Alley', 'Mini Theatre', 'Manicured G..."
2,Adani Brahma Samsara Vilasa,"['Terrace Garden', 'Gazebo', 'Fountain', 'Amph..."
3,Sobha City,"['Swimming Pool', 'Volley Ball Court', 'Aerobi..."
4,Signature Global City 93,"['Mini Theatre', 'Doctor on Call', 'Concierge ..."
...,...,...
242,DLF Princeton Estate,"['Swimming Pool', 'Medical Centre', 'Laundry',..."
243,Pyramid Urban Homes 2,"['Shopping Centre', 'Community Hall', '24x7 Se..."
244,Satya The Hermitage,"['Bus Shelter', 'Swimming Pool', 'Business Lou..."
245,BPTP Spacio,"['Swimming Pool', 'Card Room', 'Piped Gas', 'P..."


### We can see that each apartment has its TopFacilities mentioned in a list. So we will convert this list into a big string for each apartment. 

Then we will perform Text Vectorization ie the strings of 247 apartments will be converted into 247 vectors
Then we will calculate the distance between one vector with all the vectors.
Now if someone will input a vector, then as we will have the top 5 closest vector for that vector, then we will output the top 5 vectors

ie 3 steps:
1. Vectorization
2. Distances Calculation
3. Recommendation

In [13]:
df[['PropertyName','TopFacilities']]['TopFacilities'][0]

# it is a list inside a string, so we will extract the list from this string

"['Swimming Pool', 'Salon', 'Restaurant', 'Spa', 'Cafeteria', 'Sun Deck', '24x7 Security', 'Club House', 'Gated Community']"

In [14]:
def extract_list(s):
    return re.findall(r"'(.*?)'", s)

df['TopFacilities'] = df['TopFacilities'].apply(extract_list)

In [15]:
df[['PropertyName','TopFacilities']]['TopFacilities'][0]

# now we have got the list from the string

['Swimming Pool',
 'Salon',
 'Restaurant',
 'Spa',
 'Cafeteria',
 'Sun Deck',
 '24x7 Security',
 'Club House',
 'Gated Community']

In [16]:
df.head()

Unnamed: 0,PropertyName,PropertySubName,NearbyLocations,LocationAdvantages,Link,PriceDetails,TopFacilities
0,Smartworld One DXP,"2, 3, 4 BHK Apartment in Sector 113, Gurgaon","['Bajghera Road', 'Palam Vihar Halt', 'DPSG Pa...","{'Bajghera Road': '800 Meter', 'Palam Vihar Ha...",https://www.99acres.com/smartworld-one-dxp-sec...,"{'2 BHK': {'building_type': 'Apartment', 'area...","[Swimming Pool, Salon, Restaurant, Spa, Cafete..."
1,M3M Crown,"3, 4 BHK Apartment in Sector 111, Gurgaon","['DPSG Palam Vihar Gurugram', 'The NorthCap Un...","{'DPSG Palam Vihar Gurugram': '1.4 Km', 'The N...",https://www.99acres.com/m3m-crown-sector-111-g...,"{'3 BHK': {'building_type': 'Apartment', 'area...","[Bowling Alley, Mini Theatre, Manicured Garden..."
2,Adani Brahma Samsara Vilasa,"Land, 3, 4 BHK Independent Floor in Sector 63,...","['AIPL Business Club Sector 62', 'Heritage Xpe...","{'AIPL Business Club Sector 62': '2.7 Km', 'He...",https://www.99acres.com/adani-brahma-samsara-v...,{'3 BHK': {'building_type': 'Independent Floor...,"[Terrace Garden, Gazebo, Fountain, Amphitheatr..."
3,Sobha City,"2, 3, 4 BHK Apartment in Sector 108, Gurgaon","['The Shikshiyan School', 'WTC Plaza', 'Luxus ...","{'The Shikshiyan School': '2.9 KM', 'WTC Plaza...",https://www.99acres.com/sobha-city-sector-108-...,"{'2 BHK': {'building_type': 'Apartment', 'area...","[Swimming Pool, Volley Ball Court, Aerobics Ce..."
4,Signature Global City 93,"2, 3 BHK Independent Floor in Sector 93 Gurgaon","['Pranavananda Int. School', 'DLF Site central...","{'Pranavananda Int. School': '450 m', 'DLF Sit...",https://www.99acres.com/signature-global-city-...,{'2 BHK': {'building_type': 'Independent Floor...,"[Mini Theatre, Doctor on Call, Concierge Servi..."


In [17]:
# converting that list to a string using join function
df['FacilitiesStr'] = df['TopFacilities'].apply(' '.join)

In [18]:
df.head()

Unnamed: 0,PropertyName,PropertySubName,NearbyLocations,LocationAdvantages,Link,PriceDetails,TopFacilities,FacilitiesStr
0,Smartworld One DXP,"2, 3, 4 BHK Apartment in Sector 113, Gurgaon","['Bajghera Road', 'Palam Vihar Halt', 'DPSG Pa...","{'Bajghera Road': '800 Meter', 'Palam Vihar Ha...",https://www.99acres.com/smartworld-one-dxp-sec...,"{'2 BHK': {'building_type': 'Apartment', 'area...","[Swimming Pool, Salon, Restaurant, Spa, Cafete...",Swimming Pool Salon Restaurant Spa Cafeteria S...
1,M3M Crown,"3, 4 BHK Apartment in Sector 111, Gurgaon","['DPSG Palam Vihar Gurugram', 'The NorthCap Un...","{'DPSG Palam Vihar Gurugram': '1.4 Km', 'The N...",https://www.99acres.com/m3m-crown-sector-111-g...,"{'3 BHK': {'building_type': 'Apartment', 'area...","[Bowling Alley, Mini Theatre, Manicured Garden...",Bowling Alley Mini Theatre Manicured Garden Sw...
2,Adani Brahma Samsara Vilasa,"Land, 3, 4 BHK Independent Floor in Sector 63,...","['AIPL Business Club Sector 62', 'Heritage Xpe...","{'AIPL Business Club Sector 62': '2.7 Km', 'He...",https://www.99acres.com/adani-brahma-samsara-v...,{'3 BHK': {'building_type': 'Independent Floor...,"[Terrace Garden, Gazebo, Fountain, Amphitheatr...",Terrace Garden Gazebo Fountain Amphitheatre Pa...
3,Sobha City,"2, 3, 4 BHK Apartment in Sector 108, Gurgaon","['The Shikshiyan School', 'WTC Plaza', 'Luxus ...","{'The Shikshiyan School': '2.9 KM', 'WTC Plaza...",https://www.99acres.com/sobha-city-sector-108-...,"{'2 BHK': {'building_type': 'Apartment', 'area...","[Swimming Pool, Volley Ball Court, Aerobics Ce...",Swimming Pool Volley Ball Court Aerobics Centr...
4,Signature Global City 93,"2, 3 BHK Independent Floor in Sector 93 Gurgaon","['Pranavananda Int. School', 'DLF Site central...","{'Pranavananda Int. School': '450 m', 'DLF Sit...",https://www.99acres.com/signature-global-city-...,{'2 BHK': {'building_type': 'Independent Floor...,"[Mini Theatre, Doctor on Call, Concierge Servi...",Mini Theatre Doctor on Call Concierge Service ...


In [19]:
df['FacilitiesStr'][0]

'Swimming Pool Salon Restaurant Spa Cafeteria Sun Deck 24x7 Security Club House Gated Community'

In [20]:
# now we want to convert these strings into vectors, ie you can use Bag of words, TFIDF, or word2vec. We have used TF-IDF vectorizer

In [21]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))

In [22]:
tfidf_matrix = tfidf_vectorizer.fit_transform(df['FacilitiesStr'])

In [23]:
tfidf_matrix.toarray().shape

(246, 953)

### here all the 247 apartments have been converted into vectors of 953 dimensions each. Now we will calculate the distances. If we have high dimensional data, then Euclidean distance is little ineffective. But angular distance gives good results, so will use cosine similarity

In [24]:
cosine_sim1 = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [25]:
cosine_sim1.shape

# (247, 247) means the distance from each vector with all the 247 vectors

(246, 246)

In [26]:
# this function recommends the top 5 nearest property_names according to the cosine similarity matrix 

def recommend_properties(property_name, cosine_sim=cosine_sim1):
    # Get the index of the property that matches the name
    idx = df.index[df['PropertyName'] == property_name].tolist()[0]

    # Get the pairwise similarity scores with that property
    sim_scores = list(enumerate(cosine_sim1[idx]))

    # Sort the properties based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar properties
    sim_scores = sim_scores[1:6]

    # Get the property indices
    property_indices = [i[0] for i in sim_scores]
    
    recommendations_df = pd.DataFrame({
        'PropertyName': df['PropertyName'].iloc[property_indices],
        'SimilarityScore': sim_scores
    })

    # Return the top 10 most similar properties
    return recommendations_df

In [27]:
recommend_properties("DLF The Arbour")

# this recommendation is shown on the basis of TopFacilities

Unnamed: 0,PropertyName,SimilarityScore
64,Ace Palm Floors,"(63, 0.45293820624419556)"
217,Yashika 104,"(216, 0.41996063229267827)"
93,JMS The Nation,"(92, 0.4166584649363288)"
154,India Rashtra,"(153, 0.39895423468019414)"
0,Smartworld One DXP,"(0, 0.388850461994329)"


### First recommendation system is done on the basis of TopFacilities. Now making on the basis of PriceDetails


In [28]:
df[['PropertyName','PriceDetails']]['PriceDetails'][1]

# it contains hierarchical data. So because of this structure we will do the same thing which is done for TopFacilities column

"{'3 BHK': {'building_type': 'Apartment', 'area_type': 'Super Built-up Area', 'area': '1,605 - 2,170 sq.ft.', 'price-range': '₹ 2.2 - 3.03 Cr'}, '4 BHK': {'building_type': 'Apartment', 'area_type': 'Super Built-up Area', 'area': '2,248 - 2,670 sq.ft.', 'price-range': '₹ 3.08 - 3.73 Cr'}}"

### to bring it into a simplified format, we will convert this information into a table and then convert into vectors. 
eg the table would have columns like building_type_3bhk, area_3bhk, price_3bhk, building_type_4bhk, area_4bhk, price_4bhk for each apartment 

In [29]:
# this is the function which will do this task

import pandas as pd
import json

# Load the dataset
df_appartments = pd.read_csv('appartments.csv').drop(22)

# Function to parse and extract the required features from the PriceDetails column
def refined_parse_modified_v2(detail_str):
    try:
        details = json.loads(detail_str.replace("'", "\""))
    except:
        return {}

    extracted = {}
    for bhk, detail in details.items():
        # Extract building type
        extracted[f'building type_{bhk}'] = detail.get('building_type')

        # Parsing area details
        area = detail.get('area', '')
        area_parts = area.split('-')
        if len(area_parts) == 1:
            try:
                value = float(area_parts[0].replace(',', '').replace(' sq.ft.', '').strip())
                extracted[f'area low {bhk}'] = value
                extracted[f'area high {bhk}'] = value
            except:
                extracted[f'area low {bhk}'] = None
                extracted[f'area high {bhk}'] = None
        elif len(area_parts) == 2:
            try:
                extracted[f'area low {bhk}'] = float(area_parts[0].replace(',', '').replace(' sq.ft.', '').strip())
                extracted[f'area high {bhk}'] = float(area_parts[1].replace(',', '').replace(' sq.ft.', '').strip())
            except:
                extracted[f'area low {bhk}'] = None
                extracted[f'area high {bhk}'] = None

        # Parsing price details
        price_range = detail.get('price-range', '')
        price_parts = price_range.split('-')
        if len(price_parts) == 2:
            try:
                extracted[f'price low {bhk}'] = float(price_parts[0].replace('₹', '').replace(' Cr', '').replace(' L', '').strip())
                extracted[f'price high {bhk}'] = float(price_parts[1].replace('₹', '').replace(' Cr', '').replace(' L', '').strip())
                if 'L' in price_parts[0]:
                    extracted[f'price low {bhk}'] /= 100
                if 'L' in price_parts[1]:
                    extracted[f'price high {bhk}'] /= 100
            except:
                extracted[f'price low {bhk}'] = None
                extracted[f'price high {bhk}'] = None

    return extracted
# Apply the refined parsing and generate the new DataFrame structure
data_refined = []

for _, row in df_appartments.iterrows():
    features = refined_parse_modified_v2(row['PriceDetails'])
    
    # Construct a new row for the transformed dataframe
    new_row = {'PropertyName': row['PropertyName']}
    
    # Populate the new row with extracted features
    for config in ['1 BHK', '2 BHK', '3 BHK', '4 BHK', '5 BHK', '6 BHK', '1 RK', 'Land']:
        new_row[f'building type_{config}'] = features.get(f'building type_{config}')
        new_row[f'area low {config}'] = features.get(f'area low {config}')
        new_row[f'area high {config}'] = features.get(f'area high {config}')
        new_row[f'price low {config}'] = features.get(f'price low {config}')
        new_row[f'price high {config}'] = features.get(f'price high {config}')
    
    data_refined.append(new_row)

df_final_refined_v2 = pd.DataFrame(data_refined).set_index('PropertyName')


In [30]:
df_final_refined_v2['building type_Land'] = df_final_refined_v2['building type_Land'].replace({'':'Land'})

In [31]:
df['PriceDetails'][10]

"{'2 BHK': {'building_type': 'Independent Floor', 'area_type': 'Carpet Area', 'area': '1,055 sq.ft.', 'price-range': '₹ 1.05 - 1.5 Cr'}, '3 BHK': {'building_type': 'Independent Floor', 'area_type': 'Carpet Area', 'area': '1,325 - 1,525 sq.ft.', 'price-range': '₹ 1.35 - 1.84 Cr'}}"

In [32]:
df_final_refined_v2

Unnamed: 0_level_0,building type_1 BHK,area low 1 BHK,area high 1 BHK,price low 1 BHK,price high 1 BHK,building type_2 BHK,area low 2 BHK,area high 2 BHK,price low 2 BHK,price high 2 BHK,...,building type_1 RK,area low 1 RK,area high 1 RK,price low 1 RK,price high 1 RK,building type_Land,area low Land,area high Land,price low Land,price high Land
PropertyName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Smartworld One DXP,,,,,,Apartment,1370.0,1370.0,2.0000,2.40,...,,,,,,,,,,
M3M Crown,,,,,,,,,,,...,,,,,,,,,,
Adani Brahma Samsara Vilasa,,,,,,,,,,,...,,,,,,Land,500.0,4329.0,2.05,41.13
Sobha City,,,,,,Apartment,1381.0,1692.0,1.5500,3.21,...,,,,,,,,,,
Signature Global City 93,,,,,,Independent Floor,981.0,1118.0,0.9301,1.06,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DLF Princeton Estate,,,,,,Apartment,964.0,964.0,,,...,,,,,,,,,,
Pyramid Urban Homes 2,Apartment,335.0,398.0,23.45,0.2786,Apartment,500.0,625.0,,,...,,,,,,,,,,
Satya The Hermitage,,,,,,Apartment,1450.0,1450.0,,,...,,,,,,,,,,
BPTP Spacio,,,,,,Apartment,1000.0,1079.0,,,...,,,,,,,,,,


In [33]:
df_final_refined_v2.columns

Index(['building type_1 BHK', 'area low 1 BHK', 'area high 1 BHK',
       'price low 1 BHK', 'price high 1 BHK', 'building type_2 BHK',
       'area low 2 BHK', 'area high 2 BHK', 'price low 2 BHK',
       'price high 2 BHK', 'building type_3 BHK', 'area low 3 BHK',
       'area high 3 BHK', 'price low 3 BHK', 'price high 3 BHK',
       'building type_4 BHK', 'area low 4 BHK', 'area high 4 BHK',
       'price low 4 BHK', 'price high 4 BHK', 'building type_5 BHK',
       'area low 5 BHK', 'area high 5 BHK', 'price low 5 BHK',
       'price high 5 BHK', 'building type_6 BHK', 'area low 6 BHK',
       'area high 6 BHK', 'price low 6 BHK', 'price high 6 BHK',
       'building type_1 RK', 'area low 1 RK', 'area high 1 RK',
       'price low 1 RK', 'price high 1 RK', 'building type_Land',
       'area low Land', 'area high Land', 'price low Land', 'price high Land'],
      dtype='object')

In [34]:
df['PriceDetails'][0]

"{'2 BHK': {'building_type': 'Apartment', 'area_type': 'Carpet Area', 'area': '1,370 sq.ft.', 'price-range': '₹ 2 - 2.4 Cr'}, '3 BHK': {'building_type': 'Apartment', 'area_type': 'Carpet Area', 'area': '1,850 - 2,050 sq.ft.', 'price-range': '₹ 2.25 - 3.59 Cr'}, '4 BHK': {'building_type': 'Apartment', 'area_type': 'Carpet Area', 'area': '2,600 sq.ft.', 'price-range': '₹ 3.24 - 4.56 Cr'}}"

In [35]:
df_final_refined_v2.iloc[0]

building type_1 BHK         None
area low 1 BHK               NaN
area high 1 BHK              NaN
price low 1 BHK              NaN
price high 1 BHK             NaN
building type_2 BHK    Apartment
area low 2 BHK            1370.0
area high 2 BHK           1370.0
price low 2 BHK              2.0
price high 2 BHK             2.4
building type_3 BHK    Apartment
area low 3 BHK            1850.0
area high 3 BHK           2050.0
price low 3 BHK             2.25
price high 3 BHK            3.59
building type_4 BHK    Apartment
area low 4 BHK            2600.0
area high 4 BHK           2600.0
price low 4 BHK             3.24
price high 4 BHK            4.56
building type_5 BHK         None
area low 5 BHK               NaN
area high 5 BHK              NaN
price low 5 BHK              NaN
price high 5 BHK             NaN
building type_6 BHK         None
area low 6 BHK               NaN
area high 6 BHK              NaN
price low 6 BHK              NaN
price high 6 BHK             NaN
building t

In [36]:
categorical_columns = df_final_refined_v2.select_dtypes(include=['object']).columns.tolist()

In [37]:
categorical_columns

['building type_1 BHK',
 'building type_2 BHK',
 'building type_3 BHK',
 'building type_4 BHK',
 'building type_5 BHK',
 'building type_6 BHK',
 'building type_1 RK',
 'building type_Land']

### We are applying one hot encoding on categorical columns

In [38]:
ohe_df = pd.get_dummies(df_final_refined_v2, columns=categorical_columns, drop_first=True)

In [39]:
ohe_df.fillna(0,inplace=True)

In [40]:
ohe_df

Unnamed: 0_level_0,area low 1 BHK,area high 1 BHK,price low 1 BHK,price high 1 BHK,area low 2 BHK,area high 2 BHK,price low 2 BHK,price high 2 BHK,area low 3 BHK,area high 3 BHK,...,building type_2 BHK_Independent Floor,building type_2 BHK_Service Apartment,building type_3 BHK_Independent Floor,building type_3 BHK_Service Apartment,building type_3 BHK_Villa,building type_4 BHK_Independent Floor,building type_4 BHK_Villa,building type_5 BHK_Independent Floor,building type_5 BHK_Villa,building type_6 BHK_Villa
PropertyName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Smartworld One DXP,0.0,0.0,0.00,0.0000,1370.0,1370.0,2.0000,2.40,1850.0,2050.0,...,False,False,False,False,False,False,False,False,False,False
M3M Crown,0.0,0.0,0.00,0.0000,0.0,0.0,0.0000,0.00,1605.0,2170.0,...,False,False,False,False,False,False,False,False,False,False
Adani Brahma Samsara Vilasa,0.0,0.0,0.00,0.0000,0.0,0.0,0.0000,0.00,1800.0,3150.0,...,False,False,True,False,False,True,False,False,False,False
Sobha City,0.0,0.0,0.00,0.0000,1381.0,1692.0,1.5500,3.21,1711.0,2343.0,...,False,False,False,False,False,False,False,False,False,False
Signature Global City 93,0.0,0.0,0.00,0.0000,981.0,1118.0,0.9301,1.06,1235.0,1530.0,...,True,False,True,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DLF Princeton Estate,0.0,0.0,0.00,0.0000,964.0,964.0,0.0000,0.00,1127.0,1127.0,...,False,False,False,False,False,False,False,False,False,False
Pyramid Urban Homes 2,335.0,398.0,23.45,0.2786,500.0,625.0,0.0000,0.00,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False
Satya The Hermitage,0.0,0.0,0.00,0.0000,1450.0,1450.0,0.0000,0.00,1991.0,1991.0,...,False,False,False,False,False,False,False,False,False,False
BPTP Spacio,0.0,0.0,0.00,0.0000,1000.0,1079.0,0.0000,0.00,1225.0,1865.0,...,False,False,False,False,False,False,False,False,False,False


In [41]:
from sklearn.preprocessing import StandardScaler

# Initialize the scaler
scaler = StandardScaler()

# Apply the scaler to the entire dataframe
ohe_df_normalized = pd.DataFrame(scaler.fit_transform(ohe_df), columns=ohe_df.columns, index=ohe_df.index)

# data is scaled between -1 and 1 so that even if we do anything related to distance then we will have better results

In [42]:
ohe_df_normalized.head()

Unnamed: 0_level_0,area low 1 BHK,area high 1 BHK,price low 1 BHK,price high 1 BHK,area low 2 BHK,area high 2 BHK,price low 2 BHK,price high 2 BHK,area low 3 BHK,area high 3 BHK,...,building type_2 BHK_Independent Floor,building type_2 BHK_Service Apartment,building type_3 BHK_Independent Floor,building type_3 BHK_Service Apartment,building type_3 BHK_Villa,building type_4 BHK_Independent Floor,building type_4 BHK_Villa,building type_5 BHK_Independent Floor,building type_5 BHK_Villa,building type_6 BHK_Villa
PropertyName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Smartworld One DXP,-0.252266,-0.169584,-0.105197,-0.082332,1.223499,1.020101,-0.173712,1.158423,0.553787,0.370864,...,-0.28931,-0.063888,-0.372678,-0.063888,-0.171139,-0.254824,-0.236208,-0.111111,-0.216353,-0.063888
M3M Crown,-0.252266,-0.169584,-0.105197,-0.082332,-0.893541,-0.89666,-0.283546,-0.387986,0.293086,0.472749,...,-0.28931,-0.063888,-0.372678,-0.063888,-0.171139,-0.254824,-0.236208,-0.111111,-0.216353,-0.063888
Adani Brahma Samsara Vilasa,-0.252266,-0.169584,-0.105197,-0.082332,-0.893541,-0.89666,-0.283546,-0.387986,0.500583,1.304803,...,-0.28931,-0.063888,2.683282,-0.063888,-0.171139,3.924283,-0.236208,-0.111111,-0.216353,-0.063888
Sobha City,-0.252266,-0.169584,-0.105197,-0.082332,1.240497,1.47061,-0.198425,1.680336,0.405879,0.619632,...,-0.28931,-0.063888,-0.372678,-0.063888,-0.171139,-0.254824,-0.236208,-0.111111,-0.216353,-0.063888
Signature Global City 93,-0.252266,-0.169584,-0.105197,-0.082332,0.622383,0.667529,-0.232468,0.295011,-0.100626,-0.070634,...,3.456497,-0.063888,2.683282,-0.063888,-0.171139,-0.254824,-0.236208,-0.111111,-0.216353,-0.063888


In [43]:
from sklearn.metrics.pairwise import cosine_similarity
# calculating the cosine similarity of it
# Compute the cosine similarity matrix
cosine_sim2 = cosine_similarity(ohe_df_normalized)

In [44]:
cosine_sim2.shape

(246, 246)

In [45]:
def recommend_properties_with_scores(property_name, top_n=247):
    
    # Get the similarity scores for the property using its name as the index
    sim_scores = list(enumerate(cosine_sim2[ohe_df_normalized.index.get_loc(property_name)]))
    
    # Sort properties based on the similarity scores
    sorted_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the indices and scores of the top_n most similar properties
    top_indices = [i[0] for i in sorted_scores[1:top_n+1]]
    top_scores = [i[1] for i in sorted_scores[1:top_n+1]]
    
    # Retrieve the names of the top properties using the indices
    top_properties = ohe_df_normalized.index[top_indices].tolist()
    
    # Create a dataframe with the results
    recommendations_df = pd.DataFrame({
        'PropertyName': top_properties,
        'SimilarityScore': top_scores
    })
    
    return recommendations_df


In [46]:
# Test the recommender function using a property name
recommend_properties_with_scores('M3M Golf Hills')

Unnamed: 0,PropertyName,SimilarityScore
0,AIPL The Peaceful Homes,0.955462
1,Smartworld One DXP,0.954670
2,Unitech Escape,0.953092
3,M3M Capital,0.951156
4,BPTP Terra,0.943128
...,...,...
240,Golden Park,-0.522391
241,Satya Merano Greens,-0.523660
242,ROF Normanton Park,-0.525129
243,BPTP Green Oaks,-0.525286


### Recommendation system on the basis of PriceDetails is done. Now we will make on the basis of LocationAdvantages

In [47]:
df[['PropertyName','LocationAdvantages']]['LocationAdvantages'][0]

"{'Bajghera Road': '800 Meter', 'Palam Vihar Halt': '2.5 KM', 'DPSG Palam Vihar': '3.1 KM', 'Park Hospital': '3.1 KM', 'Gurgaon Railway Station': '4.9 KM', 'The NorthCap University': '5.4 KM', 'Dwarka Expy': '1.2 KM', 'Hyatt Place Gurgaon Udyog Vihar': '7.7 KM', 'Dwarka Sector 21, Metro Station': '7.2 KM', 'Pacific D21 Mall': '7.4 KM', 'Indira Gandhi International Airport': '14.7 KM', 'Hamoni Golf Camp': '6.2 KM', 'Fun N Food Waterpark': '8.8 KM', 'Accenture DDC5': '9 KM'}"

### We will make each unique landmark as a separate column
Then we will go to each apartment and ask that what is the distance between you and that landmark and then fill the table accordingly

In [50]:
def distance_to_meters(distance_str):
    try:
        if 'Km' in distance_str or 'KM' in distance_str:
            return float(distance_str.split()[0]) * 1000
        elif 'Meter' in distance_str or 'meter' in distance_str:
            return float(distance_str.split()[0])
        else:
            return None
    except:
        return None

In [97]:
# Extract distances for each location
import ast
location_matrix = {}
for index, row in df.iterrows():
    distances = {}
    for location, distance in ast.literal_eval(row['LocationAdvantages']).items():
        distances[location] = distance_to_meters(distance)
    location_matrix[index] = distances

# Convert the dictionary to a dataframe
location_df = pd.DataFrame.from_dict(location_matrix, orient='index')

# Display the first few rows
location_df.head()

Unnamed: 0,Bajghera Road,Palam Vihar Halt,DPSG Palam Vihar,Park Hospital,Gurgaon Railway Station,The NorthCap University,Dwarka Expy,Hyatt Place Gurgaon Udyog Vihar,"Dwarka Sector 21, Metro Station",Pacific D21 Mall,...,MCC Cricket Ground Dhankot,The Shri Ram School Aravali,Taj City Centre Gurugram,Minda Industries Corporate Office,"Rampura Flyover, Naurangpur Rd",Manesar toll plaza - Kherki Daula,"Imt Manesar, Gurugram",Holiday Inn,Sector 84 Road,Skyview Corporate Park
0,800.0,2500.0,3100.0,3100.0,4900.0,5400.0,1200.0,7700.0,7200.0,7400.0,...,,,,,,,,,,
25,550.0,,,,,6700.0,3800.0,,,7500.0,...,,,,,,,,,,
37,5300.0,,,,2500.0,8800.0,,,,,...,,,,,,,,,,
69,1500.0,,,,6500.0,6700.0,5100.0,,,8200.0,...,,,,,,,,,,
9,,,,5500.0,,,,,,,...,,,,,,,,,,


In [98]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [99]:
pd.set_option('display.max_rows', None)


In [100]:

# Reset max columns and max rows to default
pd.reset_option('display.max_columns')
pd.reset_option('display.max_rows')

In [101]:
# # location_df.columns
# # Convert the columns index to a list and display
# columns_list = location_df.columns.tolist()
# columns_list.sort()
# # Print all column names without truncation
# for col in columns_list:
#     print(col)


In [102]:
temp = location_df[['IGI Airport', 'IG International Airport','Delhi International Airport', 'IGIA Airport','Airport', 'International Airport', 'Indira Gandhi Int. Airport', 'Indira Gandhi Airport', 'Indira Gandhi International Airport', 'Indira Gandhi Intl Airport' ]]

In [103]:
temp

Unnamed: 0,IGI Airport,IG International Airport,Delhi International Airport,IGIA Airport,Airport,International Airport,Indira Gandhi Int. Airport,Indira Gandhi Airport,Indira Gandhi International Airport,Indira Gandhi Intl Airport
0,,,,,,,,,14700.0,
25,,,,,,,,,,15600.0
37,,,,,,,,,,20800.0
69,,,,,,,,,,16100.0
9,,,,,,,,,,24800.0
...,...,...,...,...,...,...,...,...,...,...
231,,,,,,,,,,
97,,,,,,,,,,
159,,,,,35100.0,,,,,
226,,,,,,,,,,


In [104]:
temp['final'] = temp.apply(lambda row: row.dropna().values[0] if row.dropna().values.size > 0 else np.nan, axis=1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp['final'] = temp.apply(lambda row: row.dropna().values[0] if row.dropna().values.size > 0 else np.nan, axis=1)


In [105]:
temp

Unnamed: 0,IGI Airport,IG International Airport,Delhi International Airport,IGIA Airport,Airport,International Airport,Indira Gandhi Int. Airport,Indira Gandhi Airport,Indira Gandhi International Airport,Indira Gandhi Intl Airport,final
0,,,,,,,,,14700.0,,14700.0
25,,,,,,,,,,15600.0,15600.0
37,,,,,,,,,,20800.0,20800.0
69,,,,,,,,,,16100.0,16100.0
9,,,,,,,,,,24800.0,24800.0
...,...,...,...,...,...,...,...,...,...,...,...
231,,,,,,,,,,,
97,,,,,,,,,,,
159,,,,,35100.0,,,,,,35100.0
226,,,,,,,,,,,


In [106]:
# temp = temp['final']

In [107]:
temp

Unnamed: 0,IGI Airport,IG International Airport,Delhi International Airport,IGIA Airport,Airport,International Airport,Indira Gandhi Int. Airport,Indira Gandhi Airport,Indira Gandhi International Airport,Indira Gandhi Intl Airport,final
0,,,,,,,,,14700.0,,14700.0
25,,,,,,,,,,15600.0,15600.0
37,,,,,,,,,,20800.0,20800.0
69,,,,,,,,,,16100.0,16100.0
9,,,,,,,,,,24800.0,24800.0
...,...,...,...,...,...,...,...,...,...,...,...
231,,,,,,,,,,,
97,,,,,,,,,,,
159,,,,,35100.0,,,,,,35100.0
226,,,,,,,,,,,


In [108]:
# List of columns to drop
columns_to_drop = ['IGI Airport', 'IG International Airport', 'Delhi International Airport', 
                   'IGIA Airport', 'Airport', 'International Airport', 
                   'Indira Gandhi Int. Airport', 'Indira Gandhi Airport', 
                   'Indira Gandhi International Airport', 'Indira Gandhi Intl Airport']

# Drop the specified columns from location_df
location_df = location_df.drop(columns=columns_to_drop, errors='ignore')

# Merge the 'final' column from temp into location_df and rename it
location_df['Indira Gandhi International Airport'] = temp['final']

In [110]:
location_df.shape

(246, 1061)

In [112]:
temp = location_df[['Dwarka', 'Dwarka Expressway', 'Dwarka Expressway Link Road' ,'Dwarka Expy', 'Dwarka expressway']]

In [113]:
temp

Unnamed: 0,Dwarka,Dwarka Expressway,Dwarka Expressway Link Road,Dwarka Expy,Dwarka expressway
0,,,,1200.0,
25,,,,3800.0,
37,,700.0,,,
69,,,,5100.0,
9,,,,,
...,...,...,...,...,...
231,,,,,
97,,,,,
159,,,,,
226,,,,,


In [114]:
temp['final'] = temp.apply(lambda row: row.dropna().values[0] if row.dropna().values.size > 0 else np.nan, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp['final'] = temp.apply(lambda row: row.dropna().values[0] if row.dropna().values.size > 0 else np.nan, axis=1)


In [115]:
temp

Unnamed: 0,Dwarka,Dwarka Expressway,Dwarka Expressway Link Road,Dwarka Expy,Dwarka expressway,final
0,,,,1200.0,,1200.0
25,,,,3800.0,,3800.0
37,,700.0,,,,700.0
69,,,,5100.0,,5100.0
9,,,,,,
...,...,...,...,...,...,...
231,,,,,,
97,,,,,,
159,,,,,,
226,,,,,,


In [116]:
# List of columns to drop
columns_to_drop = ['Dwarka', 'Dwarka Expressway', 'Dwarka Expressway Link Road' ,'Dwarka Expy', 'Dwarka expressway']

# Drop the specified columns from location_df
location_df = location_df.drop(columns=columns_to_drop, errors='ignore')

# Merge the 'final' column from temp into location_df and rename it
location_df['Dwarka Expressway'] = temp['final']

In [117]:
location_df

Unnamed: 0,Bajghera Road,Palam Vihar Halt,DPSG Palam Vihar,Park Hospital,Gurgaon Railway Station,The NorthCap University,Hyatt Place Gurgaon Udyog Vihar,"Dwarka Sector 21, Metro Station",Pacific D21 Mall,Hamoni Golf Camp,...,Taj City Centre Gurugram,Minda Industries Corporate Office,"Rampura Flyover, Naurangpur Rd",Manesar toll plaza - Kherki Daula,"Imt Manesar, Gurugram",Holiday Inn,Sector 84 Road,Skyview Corporate Park,Indira Gandhi International Airport,Dwarka Expressway
0,800.0,2500.0,3100.0,3100.0,4900.0,5400.0,7700.0,7200.0,7400.0,6200.0,...,,,,,,,,,14700.0,1200.0
25,550.0,,,,,6700.0,,,7500.0,,...,,,,,,,,,15600.0,3800.0
37,5300.0,,,,2500.0,8800.0,,,,,...,,,,,,,,,20800.0,700.0
69,1500.0,,,,6500.0,6700.0,,,8200.0,8000.0,...,,,,,,,,,16100.0,5100.0
9,,,,5500.0,,,,,,,...,,,,,,,,,24800.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
231,,,,,,,,,,,...,,,,,,,,,,
97,,,,,,,,,,,...,,,,,,,,,,
159,,,,,,,,,,,...,,,,,,,,,35100.0,
226,,,,,,,,,,,...,,,,,,,,,,


In [118]:
location_df.shape

(246, 1057)

In [119]:
temp = location_df[['Dwarka Sector 21', 'Dwarka Sector 21 Metro Station', 'Dwarka Sector 21 Metro station', 'Dwarka Sector 21, Metro Station', 'Dwarka sector 21 metro station']]

In [121]:
temp.head()

Unnamed: 0,Dwarka Sector 21,Dwarka Sector 21 Metro Station,Dwarka Sector 21 Metro station,"Dwarka Sector 21, Metro Station",Dwarka sector 21 metro station
0,,,,7200.0,
25,7400.0,,,,
37,,,,,
69,,,8100.0,,
9,,,,,


In [122]:
temp['final'] = temp.apply(lambda row: row.dropna().values[0] if row.dropna().values.size > 0 else np.nan, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp['final'] = temp.apply(lambda row: row.dropna().values[0] if row.dropna().values.size > 0 else np.nan, axis=1)


In [123]:
temp.head()

Unnamed: 0,Dwarka Sector 21,Dwarka Sector 21 Metro Station,Dwarka Sector 21 Metro station,"Dwarka Sector 21, Metro Station",Dwarka sector 21 metro station,final
0,,,,7200.0,,7200.0
25,7400.0,,,,,7400.0
37,,,,,,
69,,,8100.0,,,8100.0
9,,,,,,


In [124]:
# List of columns to drop
columns_to_drop = ['Dwarka Sector 21', 'Dwarka Sector 21 Metro Station', 'Dwarka Sector 21 Metro station', 'Dwarka Sector 21, Metro Station', 'Dwarka sector 21 metro station']

# Drop the specified columns from location_df
location_df = location_df.drop(columns=columns_to_drop, errors='ignore')

# Merge the 'final' column from temp into location_df and rename it
location_df['Dwarka Sector 21'] = temp['final']

In [126]:
location_df

Unnamed: 0,Bajghera Road,Palam Vihar Halt,DPSG Palam Vihar,Park Hospital,Gurgaon Railway Station,The NorthCap University,Hyatt Place Gurgaon Udyog Vihar,Pacific D21 Mall,Hamoni Golf Camp,Fun N Food Waterpark,...,Minda Industries Corporate Office,"Rampura Flyover, Naurangpur Rd",Manesar toll plaza - Kherki Daula,"Imt Manesar, Gurugram",Holiday Inn,Sector 84 Road,Skyview Corporate Park,Indira Gandhi International Airport,Dwarka Expressway,Dwarka Sector 21
0,800.0,2500.0,3100.0,3100.0,4900.0,5400.0,7700.0,7400.0,6200.0,8800.0,...,,,,,,,,14700.0,1200.0,7200.0
25,550.0,,,,,6700.0,,7500.0,,,...,,,,,,,,15600.0,3800.0,7400.0
37,5300.0,,,,2500.0,8800.0,,,,,...,,,,,,,,20800.0,700.0,
69,1500.0,,,,6500.0,6700.0,,8200.0,8000.0,,...,,,,,,,,16100.0,5100.0,8100.0
9,,,,5500.0,,,,,,,...,,,,,,,,24800.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
231,,,,,,,,,,,...,,,,,,,,,,
97,,,,,,,,,,,...,,,,,,,,,,
159,,,,,,,,,,,...,,,,,,,,35100.0,,
226,,,,,,,,,,,...,,,,,,,,,,


In [127]:
# 'Dwarka', 'Dwarka Expressway', 'Dwarka Expressway Link Road' ,'Dwarka Expy', 'Dwarka expressway'

# 'Dwarka Sector 21', 'Dwarka Sector 21 Metro Station', 'Dwarka Sector 21 Metro station', 'Dwarka Sector 21, Metro Station', 'Dwarka sector 21 metro station'

### it is seen that there are 1070 unique landmarks, so now let's check the landmarks list

In [128]:
location_df.columns[10:50]

Index(['Accenture DDC5', 'DPSG Palam Vihar Gurugram',
       'Park Hospital, Palam Vihar', 'Palam Vihar Halt Railway Station',
       'Fun N Food Water Park', 'Tau DeviLal Sports Complex', 'Hyatt Place',
       'Altrade Business Centre', 'AIPL Business Club Sector 62',
       'Heritage Xperiential Learning School', 'CK Birla Hospital',
       'Paras Trinity Mall Sector 63', 'Rapid Metro Station Sector 56',
       'De Adventure Park', 'Golf Course Ext Rd',
       'DoubleTree by Hilton Hotel Gurgaon',
       'KIIT College of Engineering Sohna Road', 'Mehrauli-Gurgaon Road',
       'Nirvana Rd', 'TERI Golf Course', 'The Shikshiyan School', 'WTC Plaza',
       'Luxus Haritma Resort', 'BSF Golf Course', 'Rions Hospital', 'Gurgaon',
       'Nehru Stadium', 'Fun N Food WaterPark', 'Vasant Kunj',
       'Pranavananda Int. School', 'DLF Site central office',
       'Holiday Inn Gurugram Sector 90', 'Krishna Hospital',
       'Royal Institute Of Science', 'Sapphire 83 Mall', 'NH48',
       'Garh

In [129]:
# there are repetitions in the landmarks, but we will assume that there are no repetitions in this data

In [130]:
location_df.index = df.PropertyName

In [131]:
location_df.head()

Unnamed: 0_level_0,Bajghera Road,Palam Vihar Halt,DPSG Palam Vihar,Park Hospital,Gurgaon Railway Station,The NorthCap University,Hyatt Place Gurgaon Udyog Vihar,Pacific D21 Mall,Hamoni Golf Camp,Fun N Food Waterpark,...,Minda Industries Corporate Office,"Rampura Flyover, Naurangpur Rd",Manesar toll plaza - Kherki Daula,"Imt Manesar, Gurugram",Holiday Inn,Sector 84 Road,Skyview Corporate Park,Indira Gandhi International Airport,Dwarka Expressway,Dwarka Sector 21
PropertyName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Smartworld One DXP,800.0,2500.0,3100.0,3100.0,4900.0,5400.0,7700.0,7400.0,6200.0,8800.0,...,,,,,,,,14700.0,1200.0,7200.0
M3M Crown,550.0,,,,,6700.0,,7500.0,,,...,,,,,,,,15600.0,3800.0,7400.0
Adani Brahma Samsara Vilasa,5300.0,,,,2500.0,8800.0,,,,,...,,,,,,,,20800.0,700.0,
Sobha City,1500.0,,,,6500.0,6700.0,,8200.0,8000.0,,...,,,,,,,,16100.0,5100.0,8100.0
Signature Global City 93,,,,5500.0,,,,,,,...,,,,,,,,24800.0,,


#### here we have to update replace the NaN values with a number. But we can't replace it with 0 as if we do that then that landmark would be the nearest landmark from that apartment. So we can't replace NaN with 0. So we will replace these NaN values with a very big number so that our model understands that this landmark and Apartment are not nearby. 

#### We replace NaN values with the maximum value in this whole table, it is 54000

In [132]:
location_df.fillna(54000,inplace=True)

In [133]:
location_df

Unnamed: 0_level_0,Bajghera Road,Palam Vihar Halt,DPSG Palam Vihar,Park Hospital,Gurgaon Railway Station,The NorthCap University,Hyatt Place Gurgaon Udyog Vihar,Pacific D21 Mall,Hamoni Golf Camp,Fun N Food Waterpark,...,Minda Industries Corporate Office,"Rampura Flyover, Naurangpur Rd",Manesar toll plaza - Kherki Daula,"Imt Manesar, Gurugram",Holiday Inn,Sector 84 Road,Skyview Corporate Park,Indira Gandhi International Airport,Dwarka Expressway,Dwarka Sector 21
PropertyName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Smartworld One DXP,800.0,2500.0,3100.0,3100.0,4900.0,5400.0,7700.0,7400.0,6200.0,8800.0,...,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,14700.0,1200.0,7200.0
M3M Crown,550.0,54000.0,54000.0,54000.0,54000.0,6700.0,54000.0,7500.0,54000.0,54000.0,...,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,15600.0,3800.0,7400.0
Adani Brahma Samsara Vilasa,5300.0,54000.0,54000.0,54000.0,2500.0,8800.0,54000.0,54000.0,54000.0,54000.0,...,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,20800.0,700.0,54000.0
Sobha City,1500.0,54000.0,54000.0,54000.0,6500.0,6700.0,54000.0,8200.0,8000.0,54000.0,...,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,16100.0,5100.0,8100.0
Signature Global City 93,54000.0,54000.0,54000.0,5500.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,...,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,24800.0,54000.0,54000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DLF Princeton Estate,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,...,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0
Pyramid Urban Homes 2,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,...,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0
Satya The Hermitage,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,...,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,35100.0,54000.0,54000.0
BPTP Spacio,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,...,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0


In [134]:
# now scaling this data first and then applying cosine similarity

from sklearn.preprocessing import StandardScaler
# Initialize the scaler
scaler = StandardScaler()

# Apply the scaler to the entire dataframe
location_df_normalized = pd.DataFrame(scaler.fit_transform(location_df), columns=location_df.columns, index=location_df.index)

In [135]:
location_df_normalized

Unnamed: 0_level_0,Bajghera Road,Palam Vihar Halt,DPSG Palam Vihar,Park Hospital,Gurgaon Railway Station,The NorthCap University,Hyatt Place Gurgaon Udyog Vihar,Pacific D21 Mall,Hamoni Golf Camp,Fun N Food Waterpark,...,Minda Industries Corporate Office,"Rampura Flyover, Naurangpur Rd",Manesar toll plaza - Kherki Daula,"Imt Manesar, Gurugram",Holiday Inn,Sector 84 Road,Skyview Corporate Park,Indira Gandhi International Airport,Dwarka Expressway,Dwarka Sector 21
PropertyName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Smartworld One DXP,-7.960979,-15.652476,-15.652476,-3.149592,-2.966108,-3.147217,-10.231739,-6.023233,-5.664722,-15.652476,...,0.063888,0.063888,0.063888,0.063888,0.0,0.063888,0.063888,-1.317402,-1.848661,-4.846057
M3M Crown,-7.998993,0.063888,0.063888,0.328277,0.368941,-3.054053,0.090308,-6.009941,0.182906,0.063888,...,0.063888,0.063888,0.063888,0.063888,0.0,0.063888,0.063888,-1.254980,-1.729377,-4.824424
Adani Brahma Samsara Vilasa,-7.276720,0.063888,0.063888,0.328277,-3.129124,-2.903557,0.090308,0.171073,0.182906,0.063888,...,0.063888,0.063888,0.063888,0.063888,0.0,0.063888,0.063888,-0.894320,-1.871600,0.216250
Sobha City,-7.854539,0.063888,0.063888,0.328277,-2.857430,-3.054053,0.090308,-5.916893,-5.444518,0.063888,...,0.063888,0.063888,0.063888,0.063888,0.0,0.063888,0.063888,-1.220301,-1.669735,-4.748705
Signature Global City 93,0.128476,0.063888,0.063888,-2.985606,0.368941,0.335688,0.090308,0.171073,0.182906,0.063888,...,0.063888,0.063888,0.063888,0.063888,0.0,0.063888,0.063888,-0.616889,0.573724,0.216250
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DLF Princeton Estate,0.128476,0.063888,0.063888,0.328277,0.368941,0.335688,0.090308,0.171073,0.182906,0.063888,...,0.063888,0.063888,0.063888,0.063888,0.0,0.063888,0.063888,1.408356,0.573724,0.216250
Pyramid Urban Homes 2,0.128476,0.063888,0.063888,0.328277,0.368941,0.335688,0.090308,0.171073,0.182906,0.063888,...,0.063888,0.063888,0.063888,0.063888,0.0,0.063888,0.063888,1.408356,0.573724,0.216250
Satya The Hermitage,0.128476,0.063888,0.063888,0.328277,0.368941,0.335688,0.090308,0.171073,0.182906,0.063888,...,0.063888,0.063888,0.063888,0.063888,0.0,0.063888,0.063888,0.097496,0.573724,0.216250
BPTP Spacio,0.128476,0.063888,0.063888,0.328277,0.368941,0.335688,0.090308,0.171073,0.182906,0.063888,...,0.063888,0.063888,0.063888,0.063888,0.0,0.063888,0.063888,1.408356,0.573724,0.216250


In [136]:
cosine_sim3 = cosine_similarity(location_df_normalized)

In [137]:
cosine_sim3.shape

(246, 246)

In [138]:
def recommend_properties_with_scores(property_name, top_n=247):
    
    cosine_sim_matrix = 30*cosine_sim1 + 20*cosine_sim2 + 8*cosine_sim3
    # cosine_sim_matrix = cosine_sim3
    
    # Get the similarity scores for the property using its name as the index
    sim_scores = list(enumerate(cosine_sim_matrix[location_df_normalized.index.get_loc(property_name)]))
    
    # Sort properties based on the similarity scores
    sorted_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the indices and scores of the top_n most similar properties
    top_indices = [i[0] for i in sorted_scores[1:top_n+1]]
    top_scores = [i[1] for i in sorted_scores[1:top_n+1]]
    
    # Retrieve the names of the top properties using the indices
    top_properties = location_df_normalized.index[top_indices].tolist()
    
    # Create a dataframe with the results
    recommendations_df = pd.DataFrame({
        'PropertyName': top_properties,
        'SimilarityScore': top_scores
    })
    
    return recommendations_df

In [139]:
# Test the recommender function using a property name
recommend_properties_with_scores('Ireo Victory Valley')

Unnamed: 0,PropertyName,SimilarityScore
0,Pioneer Urban Presidia,28.005005
1,Ambience Creacions,27.827442
2,DLF The Crest,24.232360
3,Pioneer Araya,23.398127
4,Silverglades The Melia,21.043658
...,...,...
240,JMS The Nation,-14.698532
241,Shree Vardhman City,-14.884898
242,JMS Prime Land,-15.022991
243,Vatika Aspiration,-15.118421


### Now we have completed making all the 3 recommendation systems. After this we can use weights to give more importance to a specific feature as follows 

In [140]:
(3*cosine_sim3 + 5*cosine_sim2 + 6*cosine_sim1).shape

(246, 246)

In [141]:
# these weights can be given in the function of recommendation which is defined above

In [142]:
location_df

Unnamed: 0_level_0,Bajghera Road,Palam Vihar Halt,DPSG Palam Vihar,Park Hospital,Gurgaon Railway Station,The NorthCap University,Hyatt Place Gurgaon Udyog Vihar,Pacific D21 Mall,Hamoni Golf Camp,Fun N Food Waterpark,...,Minda Industries Corporate Office,"Rampura Flyover, Naurangpur Rd",Manesar toll plaza - Kherki Daula,"Imt Manesar, Gurugram",Holiday Inn,Sector 84 Road,Skyview Corporate Park,Indira Gandhi International Airport,Dwarka Expressway,Dwarka Sector 21
PropertyName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Smartworld One DXP,800.0,2500.0,3100.0,3100.0,4900.0,5400.0,7700.0,7400.0,6200.0,8800.0,...,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,14700.0,1200.0,7200.0
M3M Crown,550.0,54000.0,54000.0,54000.0,54000.0,6700.0,54000.0,7500.0,54000.0,54000.0,...,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,15600.0,3800.0,7400.0
Adani Brahma Samsara Vilasa,5300.0,54000.0,54000.0,54000.0,2500.0,8800.0,54000.0,54000.0,54000.0,54000.0,...,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,20800.0,700.0,54000.0
Sobha City,1500.0,54000.0,54000.0,54000.0,6500.0,6700.0,54000.0,8200.0,8000.0,54000.0,...,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,16100.0,5100.0,8100.0
Signature Global City 93,54000.0,54000.0,54000.0,5500.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,...,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,24800.0,54000.0,54000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DLF Princeton Estate,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,...,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0
Pyramid Urban Homes 2,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,...,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0
Satya The Hermitage,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,...,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,35100.0,54000.0,54000.0
BPTP Spacio,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,...,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0,54000.0


In [143]:
import pickle

In [144]:
pickle.dump(location_df, open('updated_mandar_location_distance.pkl', 'wb'))

In [74]:
pickle.dump(cosine_sim1, open('mandar_cosine_sim1.pkl', 'wb'))
pickle.dump(cosine_sim2, open('mandar_cosine_sim2.pkl', 'wb'))
pickle.dump(cosine_sim3, open('mandar_cosine_sim3.pkl', 'wb'))
