In [1]:
import numpy as np
import pandas as pd
import re, os, sys, json, pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Recommendor System based on Top Facilities

In [2]:
file_path = os.path.join(os.path.dirname(sys.path[0]),'data/real_estate_data.csv')

df_facilities = pd.read_csv(file_path)

df_facilities = df_facilities[~(df_facilities['PropertyName'] == 'PropertyName')]

df_facilities.head()

Unnamed: 0,PropertyName,PropertySubName,NearbyLocations,LocationAdvantages,Link,PriceDetails,TopFacilities
0,Smartworld One DXP,"2, 3, 4 BHK Apartment in Sector 113, Gurgaon","['Bajghera Road', 'Palam Vihar Halt', 'DPSG Pa...","{'Bajghera Road': '800 Meter', 'Palam Vihar Ha...",https://www.99acres.com/smartworld-one-dxp-sec...,"{'2 BHK': {'building_type': 'Apartment', 'area...","['Swimming Pool', 'Salon', 'Restaurant', 'Spa'..."
1,M3M Crown,"3, 4 BHK Apartment in Sector 111, Gurgaon","['DPSG Palam Vihar Gurugram', 'The NorthCap Un...","{'DPSG Palam Vihar Gurugram': '1.4 Km', 'The N...",https://www.99acres.com/m3m-crown-sector-111-g...,"{'3 BHK': {'building_type': 'Apartment', 'area...","['Bowling Alley', 'Mini Theatre', 'Manicured G..."
2,Adani Brahma Samsara Vilasa,"Land, 3, 4 BHK Independent Floor in Sector 63,...","['AIPL Business Club Sector 62', 'Heritage Xpe...","{'AIPL Business Club Sector 62': '2.7 Km', 'He...",https://www.99acres.com/adani-brahma-samsara-v...,{'3 BHK': {'building_type': 'Independent Floor...,"['Terrace Garden', 'Gazebo', 'Fountain', 'Amph..."
3,Sobha City,"2, 3, 4 BHK Apartment in Sector 108, Gurgaon","['The Shikshiyan School', 'WTC Plaza', 'Luxus ...","{'The Shikshiyan School': '2.9 KM', 'WTC Plaza...",https://www.99acres.com/sobha-city-sector-108-...,"{'2 BHK': {'building_type': 'Apartment', 'area...","['Swimming Pool', 'Volley Ball Court', 'Aerobi..."
4,Signature Global City 93,"2, 3 BHK Independent Floor in Sector 93 Gurgaon","['Pranavananda Int. School', 'DLF Site central...","{'Pranavananda Int. School': '450 m', 'DLF Sit...",https://www.99acres.com/signature-global-city-...,{'2 BHK': {'building_type': 'Independent Floor...,"['Mini Theatre', 'Doctor on Call', 'Concierge ..."


In [3]:
df_facilities[['PropertyName', 'TopFacilities']]

Unnamed: 0,PropertyName,TopFacilities
0,Smartworld One DXP,"['Swimming Pool', 'Salon', 'Restaurant', 'Spa'..."
1,M3M Crown,"['Bowling Alley', 'Mini Theatre', 'Manicured G..."
2,Adani Brahma Samsara Vilasa,"['Terrace Garden', 'Gazebo', 'Fountain', 'Amph..."
3,Sobha City,"['Swimming Pool', 'Volley Ball Court', 'Aerobi..."
4,Signature Global City 93,"['Mini Theatre', 'Doctor on Call', 'Concierge ..."
...,...,...
242,DLF Princeton Estate,"['Swimming Pool', 'Medical Centre', 'Laundry',..."
243,Pyramid Urban Homes 2,"['Shopping Centre', 'Community Hall', '24x7 Se..."
244,Satya The Hermitage,"['Bus Shelter', 'Swimming Pool', 'Business Lou..."
245,BPTP Spacio,"['Swimming Pool', 'Card Room', 'Piped Gas', 'P..."


In [4]:
df_facilities['TopFacilities'][0]

"['Swimming Pool', 'Salon', 'Restaurant', 'Spa', 'Cafeteria', 'Sun Deck', '24x7 Security', 'Club House', 'Gated Community']"

In [5]:
def extract_list(s):
    
    return re.findall(r"'(.*?)'", s)

df_facilities['TopFacilities_vector'] = df_facilities['TopFacilities'].apply(extract_list).apply(' '.join)

In [6]:
df_facilities['TopFacilities_vector'][0]

'Swimming Pool Salon Restaurant Spa Cafeteria Sun Deck 24x7 Security Club House Gated Community'

In [7]:
tfidfvectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
tfidfmatrix = tfidfvectorizer.fit_transform(df_facilities['TopFacilities_vector']).toarray()

In [8]:
tfidfmatrix.shape

(246, 953)

In [9]:
cosine_sim1 = cosine_similarity(tfidfmatrix, tfidfmatrix)

cosine_sim1.shape

(246, 246)

In [10]:
def recommend_properties(property_name, cosine_sim):
    
    #index with id of property name
    idx = df_facilities.index[df_facilities['PropertyName'] == property_name].tolist()[0]
    
    #similarity score with the property
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    #sort the properties based on similarity score
    sim_scores = sorted(sim_scores, key = lambda x: x[1], reverse=True)
    
    #Scores of 5 most similar properties
    sim_scores = sim_scores[1:6]
    
    #corresponding property indices
    property_indices = [i[0] for i in sim_scores]
    
    recommendation_df = pd.DataFrame({
        'Property Name': df_facilities['PropertyName'].iloc[property_indices],
        'Similarity Score': sim_scores
        
    })
    
    return recommendation_df

In [11]:
recommend_properties('DLF The Arbour', cosine_sim1)

Unnamed: 0,Property Name,Similarity Score
64,Ace Palm Floors,"(63, 0.45293820624419545)"
217,Yashika 104,"(216, 0.4199606322926783)"
93,JMS The Nation,"(92, 0.41665846493632885)"
154,India Rashtra,"(153, 0.3989542346801941)"
0,Smartworld One DXP,"(0, 0.388850461994329)"


# Recommendor System based on Price

In [12]:
df_price = pd.read_csv(file_path)

df_price = df_price[~(df_price['PropertyName'] == 'PropertyName')]


In [13]:
df_price[['PropertyName', 'PriceDetails']]

Unnamed: 0,PropertyName,PriceDetails
0,Smartworld One DXP,"{'2 BHK': {'building_type': 'Apartment', 'area..."
1,M3M Crown,"{'3 BHK': {'building_type': 'Apartment', 'area..."
2,Adani Brahma Samsara Vilasa,{'3 BHK': {'building_type': 'Independent Floor...
3,Sobha City,"{'2 BHK': {'building_type': 'Apartment', 'area..."
4,Signature Global City 93,{'2 BHK': {'building_type': 'Independent Floor...
...,...,...
242,DLF Princeton Estate,"{'2 BHK': {'building_type': 'Apartment', 'area..."
243,Pyramid Urban Homes 2,"{'1 BHK': {'building_type': 'Apartment', 'area..."
244,Satya The Hermitage,"{'2 BHK': {'building_type': 'Apartment', 'area..."
245,BPTP Spacio,"{'2 BHK': {'building_type': 'Apartment', 'area..."


In [14]:
df_price['PriceDetails'][0]

"{'2 BHK': {'building_type': 'Apartment', 'area_type': 'Carpet Area', 'area': '1,370 sq.ft.', 'price-range': '₹ 2 - 2.4 Cr'}, '3 BHK': {'building_type': 'Apartment', 'area_type': 'Carpet Area', 'area': '1,850 - 2,050 sq.ft.', 'price-range': '₹ 2.25 - 3.59 Cr'}, '4 BHK': {'building_type': 'Apartment', 'area_type': 'Carpet Area', 'area': '2,600 sq.ft.', 'price-range': '₹ 3.24 - 4.56 Cr'}}"

In [15]:
def refined_parse_modified(detail_str):
    try:
        details = json.loads(detail_str.replace("'", "\""))
    except:
        return {}

    extracted = {}
    for bhk, detail in details.items():
        # Extract building type
        extracted[f'building type_{bhk}'] = detail.get('building_type')

        # Parsing area details
        area = detail.get('area', '')
        area_parts = area.split('-')
        if len(area_parts) == 1:
            try:
                value = float(area_parts[0].replace(',', '').replace(' sq.ft.', '').strip())
                extracted[f'area low {bhk}'] = value
                extracted[f'area high {bhk}'] = value
            except:
                extracted[f'area low {bhk}'] = None
                extracted[f'area high {bhk}'] = None
        elif len(area_parts) == 2:
            try:
                extracted[f'area low {bhk}'] = float(area_parts[0].replace(',', '').replace(' sq.ft.', '').strip())
                extracted[f'area high {bhk}'] = float(area_parts[1].replace(',', '').replace(' sq.ft.', '').strip())
            except:
                extracted[f'area low {bhk}'] = None
                extracted[f'area high {bhk}'] = None

        # Parsing price details
        price_range = detail.get('price-range', '')
        price_parts = price_range.split('-')
        if len(price_parts) == 2:
            try:
                extracted[f'price low {bhk}'] = float(price_parts[0].replace('₹', '').replace(' Cr', '').replace(' L', '').strip())
                extracted[f'price high {bhk}'] = float(price_parts[1].replace('₹', '').replace(' Cr', '').replace(' L', '').strip())
                if 'L' in price_parts[0]:
                    extracted[f'price low {bhk}'] /= 100
                if 'L' in price_parts[1]:
                    extracted[f'price high {bhk}'] /= 100
            except:
                extracted[f'price low {bhk}'] = None
                extracted[f'price high {bhk}'] = None

    return extracted
# Apply the refined parsing and generate the new DataFrame structure
data_refined = []

for _, row in df_price.iterrows():
    features = refined_parse_modified(row['PriceDetails'])
    
    # Construct a new row for the transformed dataframe
    new_row = {'PropertyName': row['PropertyName']}
    
    # Populate the new row with extracted features
    for config in ['1 BHK', '2 BHK', '3 BHK', '4 BHK', '5 BHK', '6 BHK', '1 RK', 'Land']:
        new_row[f'building type_{config}'] = features.get(f'building type_{config}')
        new_row[f'area low {config}'] = features.get(f'area low {config}')
        new_row[f'area high {config}'] = features.get(f'area high {config}')
        new_row[f'price low {config}'] = features.get(f'price low {config}')
        new_row[f'price high {config}'] = features.get(f'price high {config}')
    
    data_refined.append(new_row)

df_price = pd.DataFrame(data_refined).set_index('PropertyName')

df_price['building type_Land'] = df_price['building type_Land'].replace({'':'Land'})



In [16]:
df_price.dtypes

building type_1 BHK     object
area low 1 BHK         float64
area high 1 BHK        float64
price low 1 BHK        float64
price high 1 BHK       float64
building type_2 BHK     object
area low 2 BHK         float64
area high 2 BHK        float64
price low 2 BHK        float64
price high 2 BHK       float64
building type_3 BHK     object
area low 3 BHK         float64
area high 3 BHK        float64
price low 3 BHK        float64
price high 3 BHK       float64
building type_4 BHK     object
area low 4 BHK         float64
area high 4 BHK        float64
price low 4 BHK        float64
price high 4 BHK       float64
building type_5 BHK     object
area low 5 BHK         float64
area high 5 BHK        float64
price low 5 BHK        float64
price high 5 BHK       float64
building type_6 BHK     object
area low 6 BHK         float64
area high 6 BHK        float64
price low 6 BHK        float64
price high 6 BHK       float64
building type_1 RK      object
area low 1 RK          float64
area hig

In [17]:
cat_col = df_price.select_dtypes(include=['object']).columns.tolist()

cat_col

['building type_1 BHK',
 'building type_2 BHK',
 'building type_3 BHK',
 'building type_4 BHK',
 'building type_5 BHK',
 'building type_6 BHK',
 'building type_1 RK',
 'building type_Land']

In [18]:
df_price = pd.get_dummies(df_price, columns=cat_col, drop_first=True).fillna(0)

In [19]:
df_price.sample()

Unnamed: 0_level_0,area low 1 BHK,area high 1 BHK,price low 1 BHK,price high 1 BHK,area low 2 BHK,area high 2 BHK,price low 2 BHK,price high 2 BHK,area low 3 BHK,area high 3 BHK,...,building type_2 BHK_Independent Floor,building type_2 BHK_Service Apartment,building type_3 BHK_Independent Floor,building type_3 BHK_Service Apartment,building type_3 BHK_Villa,building type_4 BHK_Independent Floor,building type_4 BHK_Villa,building type_5 BHK_Independent Floor,building type_5 BHK_Villa,building type_6 BHK_Villa
PropertyName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
International City by Sobha Phase 1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3153.0,3153.0,...,False,False,False,False,True,False,True,False,True,False


In [20]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

df_price = pd.DataFrame(scaler.fit_transform(df_price), columns=df_price.columns, index=df_price.index)

In [21]:
df_price.sample()

Unnamed: 0_level_0,area low 1 BHK,area high 1 BHK,price low 1 BHK,price high 1 BHK,area low 2 BHK,area high 2 BHK,price low 2 BHK,price high 2 BHK,area low 3 BHK,area high 3 BHK,...,building type_2 BHK_Independent Floor,building type_2 BHK_Service Apartment,building type_3 BHK_Independent Floor,building type_3 BHK_Service Apartment,building type_3 BHK_Villa,building type_4 BHK_Independent Floor,building type_4 BHK_Villa,building type_5 BHK_Independent Floor,building type_5 BHK_Villa,building type_6 BHK_Villa
PropertyName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Bestech Park View Ananda,-0.252266,-0.169584,-0.105197,-0.082332,1.208046,1.00611,-0.283546,-0.387986,0.335649,0.167096,...,-0.28931,-0.063888,-0.372678,-0.063888,-0.171139,-0.254824,-0.236208,-0.111111,4.622081,-0.063888


In [22]:
cosine_sim2 = cosine_similarity(df_price)

cosine_sim2.shape

(246, 246)

In [23]:
def recommend_properties_price(property_name, cosine_sim):
    
    #index with id of property name
    idx = df_price.index.get_loc(property_name)
    
    #similarity score with the property
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    #sort the properties based on similarity score
    sim_scores = sorted(sim_scores, key = lambda x: x[1], reverse=True)
    
    #Scores of 5 most similar properties
    sim_scores = sim_scores[1:6]
    
    #corresponding property indices
    property_indices = [i[0] for i in sim_scores]
    
    recommendation_df = pd.DataFrame({
        'Property Name': df_facilities['PropertyName'].iloc[property_indices],
        'Similarity Score': sim_scores
        
    })
    
    return recommendation_df

In [24]:
recommend_properties_price('M3M Golf Hills', cosine_sim2)

Unnamed: 0,Property Name,Similarity Score
81,AIPL The Peaceful Homes,"(80, 0.9554620840555186)"
0,Smartworld One DXP,"(0, 0.9546700864873817)"
234,Unitech Escape,"(233, 0.9530919579069698)"
23,M3M Capital,"(22, 0.9511561073306115)"
30,BPTP Terra,"(29, 0.9431284475619686)"


# Recommendation System Based on Location

In [25]:
df_loc = pd.read_csv(file_path)

df_loc = df_loc[~(df_loc['PropertyName'] == 'PropertyName')]

In [26]:
df_loc[['PropertyName', 'LocationAdvantages']].sample()

Unnamed: 0,PropertyName,LocationAdvantages
26,Birla Navya,"{'Capital Cyberscape': '900 Meter', 'Golf Cour..."


In [27]:
df_loc['LocationAdvantages'].sample()

100    {'Park Hospital': '4.3km', 'Omaxe Celebration ...
Name: LocationAdvantages, dtype: object

In [28]:
def distance_to_meters(distance_str):
    try:
        if 'Km' in distance_str or 'KM' in distance_str:
            return float(distance_str.split()[0]) * 1000
        elif 'Meter' in distance_str or 'meter' in distance_str:
            return float(distance_str.split()[0])
        else:
            return None
    except:
        return None

In [29]:
import ast

# Extract distances for each location
location_matrix = {}
for index, row in df_loc.iterrows():
    distances = {}
    for location, distance in ast.literal_eval(row['LocationAdvantages']).items():
        distances[location] = distance_to_meters(distance)
    location_matrix[index] = distances

# Convert the dictionary to a dataframe
location_df = pd.DataFrame.from_dict(location_matrix, orient='index')

location_df.index = df_loc.PropertyName

# Display the first few rows
location_df.head()

Unnamed: 0_level_0,Bajghera Road,Palam Vihar Halt,DPSG Palam Vihar,Park Hospital,Gurgaon Railway Station,The NorthCap University,Dwarka Expy,Hyatt Place Gurgaon Udyog Vihar,"Dwarka Sector 21, Metro Station",Pacific D21 Mall,...,MCC Cricket Ground Dhankot,The Shri Ram School Aravali,Taj City Centre Gurugram,Minda Industries Corporate Office,"Rampura Flyover, Naurangpur Rd",Manesar toll plaza - Kherki Daula,"Imt Manesar, Gurugram",Holiday Inn,Sector 84 Road,Skyview Corporate Park
PropertyName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Smartworld One DXP,800.0,2500.0,3100.0,3100.0,4900.0,5400.0,1200.0,7700.0,7200.0,7400.0,...,,,,,,,,,,
M3M Crown,550.0,,,,,6700.0,3800.0,,,7500.0,...,,,,,,,,,,
Adani Brahma Samsara Vilasa,5300.0,,,,2500.0,8800.0,,,,,...,,,,,,,,,,
Sobha City,1500.0,,,,6500.0,6700.0,5100.0,,,8200.0,...,,,,,,,,,,
Signature Global City 93,,,,5500.0,,,,,,,...,,,,,,,,,,


In [30]:
location_df.max().max()

54500.0

In [31]:
location_df.fillna(location_df.max().max(), inplace=True) #not filling with 0 as it shows distance

In [32]:
location_df.shape

(246, 1070)

In [33]:
cosine_sim3 = cosine_similarity(location_df)

In [34]:
def recommend_properties_location(property_name, cosine_sim):
    
    #index with id of property name
    idx = df_loc.index[df_loc['PropertyName'] == property_name].tolist()[0]
    
    #similarity score with the property
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    #sort the properties based on similarity score
    sim_scores = sorted(sim_scores, key = lambda x: x[1], reverse=True)
    
    #Scores of 5 most similar properties
    sim_scores = sim_scores[1:6]
    
    #corresponding property indices
    property_indices = [i[0] for i in sim_scores]
    
    recommendation_df = pd.DataFrame({
        'Property Name': df_facilities['PropertyName'].iloc[property_indices],
        'Similarity Score': sim_scores
        
    })
    
    return recommendation_df

In [35]:
recommend_properties_location('BPTP Spacio', cosine_sim3)

Unnamed: 0,Property Name,Similarity Score
17,Suncity Vatsal Valley,"(17, 1.0000000000000033)"
19,Trump Tower,"(19, 1.0000000000000033)"
25,La Vida by Tata Housing,"(24, 1.0000000000000033)"
30,BPTP Terra,"(29, 1.0000000000000033)"
32,MRG The Crown,"(31, 1.0000000000000033)"


In [36]:
final_cosine = (cosine_sim1 + 2 * cosine_sim2 + cosine_sim3) / 4

recommend_properties('M3M Golf Hills', final_cosine)

Unnamed: 0,Property Name,Similarity Score
234,Unitech Escape,"(233, 0.7516066829454944)"
131,Corona Optus,"(130, 0.7512869714184284)"
74,Puri Emerald Bay,"(73, 0.749307166664787)"
30,BPTP Terra,"(29, 0.7465302376579062)"
235,Unitech Harmony,"(234, 0.7402644949520827)"


In [37]:
file_path = os.path.join(os.path.dirname(sys.path[0]), 'final_model_data/')

with open(file_path + 'recommend_loc.pkl', 'wb') as file:
    pickle.dump(location_df, file)

with open(file_path + 'cosine_sim_facilities.pkl', 'wb') as file:
    pickle.dump(cosine_sim1, file)
    
with open(file_path + 'cosine_sim_price.pkl', 'wb') as file:
    pickle.dump(cosine_sim2, file)
    
with open(file_path + 'cosine_sim_loc.pkl', 'wb') as file:
    pickle.dump(cosine_sim3, file)