In [1]:
import torch
import pandas as pd
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import math


In [41]:
from torch.utils.data import DataLoader

### basic preprocessing

In [2]:
df = pd.read_csv("/content/real_estate_data - real_estate_data.csv")
df = df[['PropertyName','TopFacilities']]
df.drop(index=22,inplace=True)
df.drop(index = 185,inplace = True)
df = df.reset_index(drop = True)
df

Unnamed: 0,PropertyName,TopFacilities
0,Smartworld One DXP,"['Swimming Pool', 'Salon', 'Restaurant', 'Spa'..."
1,M3M Crown,"['Bowling Alley', 'Mini Theatre', 'Manicured G..."
2,Adani Brahma Samsara Vilasa,"['Terrace Garden', 'Gazebo', 'Fountain', 'Amph..."
3,Sobha City,"['Swimming Pool', 'Volley Ball Court', 'Aerobi..."
4,Signature Global City 93,"['Mini Theatre', 'Doctor on Call', 'Concierge ..."
...,...,...
240,DLF Princeton Estate,"['Swimming Pool', 'Medical Centre', 'Laundry',..."
241,Pyramid Urban Homes 2,"['Shopping Centre', 'Community Hall', '24x7 Se..."
242,Satya The Hermitage,"['Bus Shelter', 'Swimming Pool', 'Business Lou..."
243,BPTP Spacio,"['Swimming Pool', 'Card Room', 'Piped Gas', 'P..."


In [3]:
import ast
df['TopFacilities'] = df['TopFacilities'].apply(lambda row:ast.literal_eval(row))
df['TopFacilities']

Unnamed: 0,TopFacilities
0,"[Swimming Pool, Salon, Restaurant, Spa, Cafete..."
1,"[Bowling Alley, Mini Theatre, Manicured Garden..."
2,"[Terrace Garden, Gazebo, Fountain, Amphitheatr..."
3,"[Swimming Pool, Volley Ball Court, Aerobics Ce..."
4,"[Mini Theatre, Doctor on Call, Concierge Servi..."
...,...
240,"[Swimming Pool, Medical Centre, Laundry, Salon..."
241,"[Shopping Centre, Community Hall, 24x7 Securit..."
242,"[Bus Shelter, Swimming Pool, Business Lounge, ..."
243,"[Swimming Pool, Card Room, Piped Gas, Pool Tab..."


padding , truncation and attention mask concepts from hugging face tokenizers..

## I will not use HUGGING FACE for this project

In [5]:
df['TopFacilities'].apply(lambda row:len(row)).value_counts

Unnamed: 0_level_0,count
TopFacilities,Unnamed: 1_level_1
9,234
8,3
3,3
5,3
1,1
7,1


In [7]:
df.drop(index=184,inplace=True)
df = df.reset_index(drop = True)

In [11]:
df['TopFacilities'].apply(lambda row:len(row)).value_counts()

Unnamed: 0_level_0,count
TopFacilities,Unnamed: 1_level_1
9,234
8,3
3,3
5,3
7,1


In [9]:
df[df['TopFacilities'].apply(lambda row:len(row) ==1)]

Unnamed: 0,PropertyName,TopFacilities


###lets drop the rows whose length is not 9 for now. later I will do padding and attention masking

In [12]:
final_df = df[df['TopFacilities'].apply(lambda row:len(row)==9)]
final_df= final_df.reset_index(drop = True)
final_df

Unnamed: 0,PropertyName,TopFacilities
0,Smartworld One DXP,"[Swimming Pool, Salon, Restaurant, Spa, Cafete..."
1,M3M Crown,"[Bowling Alley, Mini Theatre, Manicured Garden..."
2,Adani Brahma Samsara Vilasa,"[Terrace Garden, Gazebo, Fountain, Amphitheatr..."
3,Sobha City,"[Swimming Pool, Volley Ball Court, Aerobics Ce..."
4,Signature Global City 93,"[Mini Theatre, Doctor on Call, Concierge Servi..."
...,...,...
229,Suncity Avenue 102,"[Creche/Day care, Property Staff, Multipurpose..."
230,DLF Princeton Estate,"[Swimming Pool, Medical Centre, Laundry, Salon..."
231,Satya The Hermitage,"[Bus Shelter, Swimming Pool, Business Lounge, ..."
232,BPTP Spacio,"[Swimming Pool, Card Room, Piped Gas, Pool Tab..."


### Capturing relationship between the facilities by attention mechanism

The idea here is:

* [Swimming Pool, Salon, Restaurant, Spa, Cafeteria] , Each token will attend to different tokens thus capturing a information that all these tokens occur together..

* swimming pool will capture relation with salon, restaurant ,spa,cafeteria.. similarly salon will capture relation with swimming pool,restaurant ,spa,cafeteria...
so , each token will capture relation with all the tokens present in a row of the dataset.

* some facilities/tokens are also common in between different apartments here..So, those similar facilities/tokens will capture two types of relationship -> one with the tokens/facilities for on apartment and the other relation with tokens of other apartment due to common facilities... thus some simliarity between apartments will also exist..

### lowercase

In [18]:
final_df['TopFacilities'] = final_df['TopFacilities'].apply(lambda row:[row[i].lower() for i in range(len(row))])

final_df

Unnamed: 0,PropertyName,TopFacilities
0,Smartworld One DXP,"[swimming pool, salon, restaurant, spa, cafete..."
1,M3M Crown,"[bowling alley, mini theatre, manicured garden..."
2,Adani Brahma Samsara Vilasa,"[terrace garden, gazebo, fountain, amphitheatr..."
3,Sobha City,"[swimming pool, volley ball court, aerobics ce..."
4,Signature Global City 93,"[mini theatre, doctor on call, concierge servi..."
...,...,...
229,Suncity Avenue 102,"[creche/day care, property staff, multipurpose..."
230,DLF Princeton Estate,"[swimming pool, medical centre, laundry, salon..."
231,Satya The Hermitage,"[bus shelter, swimming pool, business lounge, ..."
232,BPTP Spacio,"[swimming pool, card room, piped gas, pool tab..."


In [19]:
temp = []

for row in final_df['TopFacilities']:
  for s in row:
    temp.append(s)

vocab = set(temp)


w2idx = {word:idx for idx,word in enumerate(vocab)}
idx2w = {idx:word for idx,word in enumerate(vocab)}



In [23]:
dataset = final_df['TopFacilities']

In [26]:
dataset

Unnamed: 0,TopFacilities
0,"[swimming pool, salon, restaurant, spa, cafete..."
1,"[bowling alley, mini theatre, manicured garden..."
2,"[terrace garden, gazebo, fountain, amphitheatr..."
3,"[swimming pool, volley ball court, aerobics ce..."
4,"[mini theatre, doctor on call, concierge servi..."
...,...
229,"[creche/day care, property staff, multipurpose..."
230,"[swimming pool, medical centre, laundry, salon..."
231,"[bus shelter, swimming pool, business lounge, ..."
232,"[swimming pool, card room, piped gas, pool tab..."


In [39]:
def dataset_maker(dataset):
  outer = []
  for i in range(len(dataset)):
    inner = []
    for j in range(9):
      inner.append(torch.LongTensor([w2idx[dataset[i][j]]]))
    outer.append(inner)

  for k in range(len(outer)):
    outer[k] = torch.stack(outer[k])


  outer = torch.stack(outer)
  return outer

In [42]:
dataset_tensor = dataset_maker(dataset)
dataset_tensor

tensor([[[ 19],
         [ 20],
         [ 17],
         ...,
         [ 98],
         [ 82],
         [ 10]],

        [[ 92],
         [  6],
         [ 51],
         ...,
         [ 88],
         [ 48],
         [ 40]],

        [[ 91],
         [  1],
         [ 93],
         ...,
         [ 14],
         [ 46],
         [ 95]],

        ...,

        [[ 37],
         [ 19],
         [ 54],
         ...,
         [ 93],
         [ 43],
         [ 99]],

        [[ 19],
         [ 97],
         [ 58],
         ...,
         [101],
         [ 50],
         [ 64]],

        [[ 19],
         [ 28],
         [ 35],
         ...,
         [ 84],
         [ 31],
         [ 93]]])

### I am not using batches as I am just trying the model to learn the representation..

### INPUT REPRESENTATION

In [51]:
class input_representation(nn.Module):

  def __init__(self,vocab_size,dmodel,T = None):
    super().__init__()
    self.E = nn.Embedding(vocab_size,dmodel)
    self.T = T
    if self.T != None:
      self.posit  = self.Positional_Encoding(self.T,dmodel)

  def Positional_Encoding(T,dmodel):
  # context length == T
    pos_encoding = torch.zeros(T,dmodel)
    pos = torch.arange(0,T).unsqueeze(1)

    log_den = torch.arange(0,dmodel,2)*(-math.log(10000)/dmodel)
    den_ = torch.exp(log_den)

    pos_encoding[:,0::2] = torch.sin(pos*den_)
    pos_encoding[:,1::2] = torch.cos(pos*den_)

    return pos_encoding  ## shape (T X dmodel)

  def forward(self,input_idx):
    # input_idx shape -> batch X context length
    if self.T != None:
      return self.E.weight[input_idx] + self.posit

    return self.E.weight[input_idx]
    # shape -> batch X context length X dmodel


In [49]:
len(vocab)

106

In [52]:
vocab_size = len(vocab)
dmodel = 32
input_rep = input_representation(vocab_size,dmodel)
# no requirement of positional encoding as here order doesn't matters

encoded_inputs = input_rep(dataset_tensor)
encoded_inputs

tensor([[[[ 0.0058, -0.0842, -0.0243,  ..., -1.8584, -0.1664,  1.6636]],

         [[-0.5435, -0.4527, -0.4413,  ...,  1.3122, -0.8474, -0.8239]],

         [[-0.4497,  0.2820, -1.8252,  ..., -0.8159,  0.4377,  1.4789]],

         ...,

         [[-0.0169,  0.1200, -1.5048,  ...,  1.3158,  0.4631,  0.3543]],

         [[ 1.7722, -0.6837, -1.1856,  ...,  1.3323,  1.6829,  0.9221]],

         [[ 0.9087, -0.1401,  0.1708,  ..., -0.0912,  0.0554,  1.1227]]],


        [[[-1.3217,  0.1384,  0.3282,  ..., -0.5869,  0.1854, -0.8928]],

         [[-0.9699,  0.5387, -0.1714,  ..., -0.1417, -0.3648,  1.2954]],

         [[-0.3031,  1.1663,  1.8754,  ...,  0.1452,  0.4386, -0.4806]],

         ...,

         [[-0.0519,  0.5768,  0.7036,  ...,  1.6784,  0.2226,  0.1788]],

         [[ 0.1702,  1.1543,  1.3371,  ...,  0.2971, -0.4265,  1.1840]],

         [[-0.9501, -0.6142, -2.3156,  ...,  0.6665, -0.4170,  0.6924]]],


        [[[ 0.5029, -0.3123, -1.0446,  ..., -2.0379,  0.9495, -1.1329]],

    

In [53]:
encoded_inputs.shape

torch.Size([234, 9, 1, 32])

In [56]:
required_input = encoded_inputs.squeeze()
required_input.shape

torch.Size([234, 9, 32])

### batch(234) X context(9) X dmodel(32)

### Encoder block

In [74]:
class Encoder(nn.Module):

  def __init__(self,dmodel:int,dk:int,
               dv:int,dff:int,n_heads = n_heads,masking = False):
    super().__init__()
    self.dmodel = dmodel
    self.dk = dk
    self.dv = dv
    self.n_heads = n_heads
    self.shape_changer = nn.Linear(n_heads*dv,dmodel,bias = False)
    self.masking = masking

    # feeforward network
    self.f1 = nn.Linear(dmodel,dff)
    self.f2 = nn.Linear(dff,dmodel)

    # layer normalise
    self.first_norm = nn.LayerNorm(dmodel)
    self.second_norm = nn.LayerNorm(dmodel)


  def MultiHead(self,input:torch.tensor):
    self.heads = nn.ModuleList([])
    for i in range(n_heads):
      self.heads.append(self.Single_Head_Attention(self.dmodel,self.dk,self.dv))

    multi_head = []
    for head in self.heads:
      multi_head.append(head.forward(input))
    concat_multi = torch.cat(multi_head,dim = -1)

    return self.shape_changer(concat_multi)  ## linear layer at the end of multihead attention...

  def forward(self,input:torch.tensor):
    # first normalise then send it to attention
    norm1 = self.first_norm(input)
    multihead_out = self.MultiHead(norm1)
    norm2 = self.second_norm(norm1+multihead_out)

    # feedforward
    ffd_out = self.f2(F.relu(self.f1(norm2)))

    return (ffd_out + norm2)


  class Single_Head_Attention(nn.Module):
    def __init__(self,dmodel:int,dk:int,dv:int):
      super().__init__()
      self.W_q = nn.Linear(dmodel,dk,bias = False)
      self.W_k = nn.Linear(dmodel,dk,bias = False)
      self.W_v = nn.Linear(dmodel,dv,bias = False)
      self.dk = dk

    def masked_matrix(self,sim_mat):
      mask = torch.triu(sim_mat,diagonal = 1).bool()

      sim_mat.masked_fill_(mask,-torch.inf)
      # inplace

    def forward(self,input:torch.tensor,masking = False):
      Q = self.W_q(input)
      K = self.W_k(input)
      V = self.W_v(input)
      self.masking = False

      sim_mat = torch.matmul(Q,torch.transpose(Q,1,2))/math.sqrt(self.dk)

      if self.masking:
        self.masked_matrix(sim_mat)
      # call and inplace masking occured sim_mat gets modified inplace

      self.attention_wts = F.softmax(sim_mat,dim = 2) # shape -> (batch size,len(list),len(list))

      head = torch.matmul(self.attention_wts,V)
      return head
      # shape -> (batch size,context length,dv)



In [65]:
import copy

In [75]:
class MyModel(nn.Module):
  def __init__(self,N:int,dmodel:int,dk:int,
               dv:int,dff:int,n_heads = n_heads,masking = False):
    super().__init__()

    self.encoder_stack = nn.Sequential()
    for i in range(N):
      self.encoder_stack.append(copy.deepcopy(Encoder(dmodel,dk,dv,dff,n_heads,masking = False)))

    self.final_norm = nn.LayerNorm(dmodel)

  def forward(self,input:torch.tensor):
    encoder_stack_out = self.encoder_stack(input)
    return self.final_norm(encoder_stack_out)



original transformer had:
* dmodel = 512
* dk = 64 =dv
* n_heads = 8

In [76]:
N = 4 # 4layers of encoder
dmodel = 32
n_heads = 4
dk = int(dmodel/n_heads)
dv = int(dmodel/n_heads)
dff = 4*dmodel

model = MyModel(N,dmodel,dk,dv,dff,n_heads,masking = False)

encoder_blocks_out = model(required_input)
encoder_blocks_out

tensor([[[-4.9691e-01, -1.8758e-01,  6.1168e-01,  ..., -8.7113e-01,
          -2.0125e-01,  1.3912e+00],
         [-5.8492e-01,  2.1534e-01, -2.5333e-01,  ...,  1.6777e+00,
          -4.3604e-01,  3.4496e-01],
         [-3.7410e-01,  1.2472e+00, -1.5694e+00,  ..., -8.2578e-01,
           9.7338e-01,  1.9691e+00],
         ...,
         [-2.7807e-01,  5.0758e-01, -1.4721e+00,  ...,  2.1062e+00,
           3.6413e-01,  1.3482e+00],
         [ 1.7666e+00, -7.8795e-01, -1.0661e+00,  ...,  1.2256e+00,
           1.9554e+00,  7.9742e-01],
         [ 1.6399e+00,  4.4375e-01,  1.1460e-01,  ..., -6.8436e-02,
           7.3093e-01,  1.4926e+00]],

        [[-2.2614e+00,  1.6521e-01,  4.3216e-01,  ..., -2.2800e-01,
           7.5495e-01, -5.7063e-01],
         [-9.8947e-01,  2.5561e-01, -2.1507e-01,  ...,  5.3219e-01,
          -8.1707e-01,  1.6872e+00],
         [-2.1789e-01,  1.0611e+00,  1.5725e+00,  ...,  1.2795e-01,
           8.6768e-02, -4.3761e-01],
         ...,
         [-8.2298e-01, -1

In [77]:
encoder_blocks_out.shape

torch.Size([234, 9, 32])

In [78]:
apartment_rep = torch.mean(encoder_blocks_out,dim = 1)
apartment_rep.shape

torch.Size([234, 32])

In [84]:
apartment_rep[233]

tensor([-0.1057, -0.2865,  0.3137,  0.0053, -0.2072,  0.5996,  0.5415, -0.4741,
        -0.0345, -0.3993, -0.1314,  0.0623,  0.2690, -0.3727, -0.2438,  0.2739,
         0.1424, -0.7215,  0.2645,  0.2097,  0.9283, -0.7289, -0.3330,  0.6282,
         0.0966, -1.1329,  0.0814,  0.0270,  0.4413, -0.5740,  0.4010,  0.4597],
       grad_fn=<SelectBackward0>)

In [85]:
final_df

Unnamed: 0,PropertyName,TopFacilities
0,Smartworld One DXP,"[swimming pool, salon, restaurant, spa, cafete..."
1,M3M Crown,"[bowling alley, mini theatre, manicured garden..."
2,Adani Brahma Samsara Vilasa,"[terrace garden, gazebo, fountain, amphitheatr..."
3,Sobha City,"[swimming pool, volley ball court, aerobics ce..."
4,Signature Global City 93,"[mini theatre, doctor on call, concierge servi..."
...,...,...
229,Suncity Avenue 102,"[creche/day care, property staff, multipurpose..."
230,DLF Princeton Estate,"[swimming pool, medical centre, laundry, salon..."
231,Satya The Hermitage,"[bus shelter, swimming pool, business lounge, ..."
232,BPTP Spacio,"[swimming pool, card room, piped gas, pool tab..."


In [88]:
def recommender(df,final_rep,k):
  user_view = input("Which apartment is user viewing ?")
  curr_vec = final_rep[df[df['PropertyName'] == user_view].index.item()].view(1,-1)

  cos = nn.CosineSimilarity(dim = 1,eps =1e-08)
  sim_scores = cos(curr_vec,final_rep)

  topkvals,topkidx = torch.topk(sim_scores,k = k)

  top_idx = topkidx.detach().numpy()
  # iterate I think is better instead of returning a dataframe
  for i in range(k):
    print(df.iloc[top_idx[i],0],"similarity score:" ,topkvals[i])

In [89]:
recommender(final_df,apartment_rep,6)

Which apartment is user viewing ?SS The Leaf
SS The Leaf similarity score: tensor(1.0000, grad_fn=<SelectBackward0>)
Alpha Corp GurgaonOne 84 similarity score: tensor(0.7774, grad_fn=<SelectBackward0>)
Godrej Aria similarity score: tensor(0.7199, grad_fn=<SelectBackward0>)
Mapsko The Icon 79 similarity score: tensor(0.7120, grad_fn=<SelectBackward0>)
Ramprastha The Edge Towers similarity score: tensor(0.7110, grad_fn=<SelectBackward0>)
Ambience Creacions similarity score: tensor(0.6988, grad_fn=<SelectBackward0>)


In [91]:
pd.set_option("display.max_rows",None)
pd.set_option("display.max_rows",None)
pd.set_option("display.max_colwidth", None)

In [92]:
final_df[final_df['PropertyName'] == 'SS The Leaf']['TopFacilities']

Unnamed: 0,TopFacilities
33,"[valet parking, swimming pool, football, business lounge, medical centre, flower garden, school, squash court, wi-fi connectivity]"


In [93]:
final_df[final_df['PropertyName'] == 'Alpha Corp GurgaonOne 84']['TopFacilities']

Unnamed: 0,TopFacilities
209,"[swimming pool, medical centre, flower garden, aerobics centre, squash court, gazebo, property staff, billiards, sun deck]"


In [94]:
final_df[final_df['PropertyName'] == 'Godrej Aria']['TopFacilities']

Unnamed: 0,TopFacilities
103,"[swimming pool, school, spa, squash court, property staff, library, cricket pitch, lawn tennis court, multipurpose court]"


In [95]:
final_df[final_df['PropertyName'] == 'Mapsko The Icon 79']['TopFacilities']

Unnamed: 0,TopFacilities
135,"[mini theatre, swimming pool, bar/chill-out lounge, business lounge, spa, steam room, squash court, cafeteria, billiards]"


In [96]:
final_df[final_df['PropertyName'] == 'Ramprastha The Edge Towers']['TopFacilities']

Unnamed: 0,TopFacilities
184,"[temple, swimming pool, bar/chill-out lounge, school, restaurant, sauna, piped gas, spa, squash court]"


### This was the basic way to build a basic recommendation program, I will improve it later to capture semantic relationship properly  and may be combine it  with other fields such as finance or something else..For now this was it...