In [1]:
import os
import sklearn
import requests
import json
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.preprocessing import StandardScaler
import itertools



In [2]:
# Load the raw listing payload from disk.
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's default locale encoding.
with open("home_data.json", "r", encoding="utf-8") as data_file:
    home_data = json.load(data_file)

# Sample user preferences (ranges are strings and are parsed later on)
user_preferences = {
    "state": "District of Columbia",
    "baths": 2,
    "beds": 2,
    "cats": True,
    "dogs": True,
    "price_range": "$1751 - $4555",
    "sqft_range": "500+",
    "type": "Apartment",
    "city": "Washington",
    "postal_code": "20024",
}

# Sample property ids of houses the user liked
user_liked_houses = ['6786270504', '6541999527']

In [3]:
# NOTE: this cell was a verbatim copy of the previous cell: it re-read
# home_data.json and re-declared the same sample preferences and liked houses.
# The redundant code has been removed; `home_data`, `user_preferences` and
# `user_liked_houses` are defined once, by the cell directly above.

In [4]:
# Extract, per listing, only the fields the recommender needs.
# At most MAX_RESULTS listings are read (the original sample size); slicing
# instead of indexing 0..99 avoids an IndexError when fewer results exist.
MAX_RESULTS = 100

data_needed = {}
for i, listing in enumerate(home_data["data"]["results"][:MAX_RESULTS]):
    description = listing["description"]
    address = listing["location"]["address"]
    # NOTE(review): assumes pet_policy can be null for some listings; such
    # rows get None for cats/dogs instead of raising a TypeError here.
    pet_policy = listing["pet_policy"] or {}

    # The three values below are not stored yet; they back the commented-out
    # fields at the bottom of the record and are kept for later use.
    details = []
    if listing["details"]:  # skips both None and an empty list
        details = listing["details"][0]["text"]

    if listing["list_price_min"] is not None and listing["list_price_max"] is not None:
        price_range = [listing["list_price_min"], listing["list_price_max"]]
    else:
        price_range = listing["list_price"]

    if description["sqft_min"] is not None and description["sqft_max"] is not None:
        sqft_range = [description["sqft_min"], description["sqft_max"]]
    else:
        sqft_range = description["sqft"]

    # Keyed by the listing's position (as a string) in the results list
    data_needed[str(i)] = {
        "property_id": listing["property_id"],
        "list_price_min": listing["list_price_min"],
        "list_price_max": listing["list_price_max"],
        "baths": description["baths_max"],
        "beds": description["beds_max"],
        "sqft_max": description["sqft_max"],
        "type": description["type"],
        # "street_name": address["street_name"],
        "city": address["city"],
        "postal_code": address["postal_code"],
        "state": address["state"],
        "cats": pet_policy.get("cats"),
        "dogs": pet_policy.get("dogs"),
        # NOTE: THE COMMENTED INFO WILL BE NEEDED FOR ACCURACY LATER ON
        # "list_price": listing["list_price"],
        # "sqft": description["sqft"],
        # "line": address["line"],
        # "price_range": price_range,
        # "sqft_range": sqft_range,
        # "other_details": details,
        # "postal_code": address["neighborhoods"],
    }

# Persist the extracted records so they can be inspected outside the notebook
with open("ml_data.json", "w", encoding="utf-8") as outfile:
    json.dump(data_needed, outfile, indent=3)

In [5]:
# Convert the price and sqft range strings to integer columns
def process_ranges(df):
    """Parse the textual ``price_range`` / ``sqft_range`` columns into integers.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``price_range`` column formatted like ``"$1751 - $4555"``
        and a ``sqft_range`` column formatted like ``"500+"``.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``list_price_min``, ``list_price_max`` and ``sqft``
        integer columns appended and the two range columns removed. The input
        frame is left untouched.

    Raises
    ------
    ValueError
        If a range cannot be parsed into integers (e.g. the price range has
        no upper bound).
    """
    result = df.copy()  # never add columns to the caller's frame

    # regex=False is essential: as a regular expression "$" is the
    # end-of-string anchor, so on pandas versions where str.replace defaults
    # to regex=True the currency symbol would silently survive.
    cleaned_price = (
        result['price_range']
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)  # tolerate thousands separators
    )
    price_bounds = cleaned_price.str.split('-')
    result['list_price_min'] = price_bounds.str[0].str.strip().astype(int)
    result['list_price_max'] = price_bounds.str[1].str.strip().astype(int)

    # "500+" -> 500: keep only what precedes the plus sign
    result['sqft'] = result['sqft_range'].str.split('+').str[0].str.strip().astype(int)

    return result.drop(['price_range', 'sqft_range'], axis=1)

# Data Preparation
# Create DataFrames from the given data. Note that `user_preferences` and
# `user_liked_houses` are rebound from plain Python objects to DataFrames here,
# so this cell must run exactly once after the cell that defines them.
house_data = pd.DataFrame.from_dict(data_needed).T
# house_print = house_data
user_preferences = pd.DataFrame.from_dict([user_preferences])
user_liked_houses = pd.DataFrame({'property_id': user_liked_houses})
# Rows with any missing field are discarded; copy so later column assignments
# act on an independent frame.
house_data = house_data.dropna().copy()

# Feature Engineering
user_preferences = process_ranges(user_preferences)

# Normalize numerical features.
# The scaler is fitted on the houses and then applied to the user's bounds so
# both sides of the later range comparisons live on the same scale (comparing
# scaled house prices against raw dollar amounts can never match).
# Plain arrays are passed because the user's size column is named `sqft`
# while the houses' is `sqft_max`.
house_numeric_cols = ['list_price_min', 'list_price_max', 'sqft_max']
user_numeric_cols = ['list_price_min', 'list_price_max', 'sqft']
scaler = MinMaxScaler()
house_data[house_numeric_cols] = scaler.fit_transform(
    house_data[house_numeric_cols].to_numpy(dtype=float)
)
user_preferences[user_numeric_cols] = scaler.transform(
    user_preferences[user_numeric_cols].to_numpy(dtype=float)
)

# Encode categorical features using label encoding.
# The encoder is fitted on the house data only and the resulting mapping is
# reused for the user's preferences. Fitting it separately on the one-row
# preferences frame would encode every preference as 0, which bears no relation
# to the house codes.
categorical_features = ['state', 'baths', 'beds', 'cats', 'dogs', 'type', 'city', 'postal_code']
encoder = LabelEncoder()
for col in categorical_features:
    house_data[col] = encoder.fit_transform(house_data[col])
    code_for_value = {value: code for code, value in enumerate(encoder.classes_)}
    # A preference value that no house has gets -1, which matches no house code
    user_preferences[col] = [code_for_value.get(value, -1) for value in user_preferences[col]]

# Fill null values in the data with zero
house_data = house_data.fillna(0)

In [6]:
# NOTE: A MORE THOROUGH FILTERING WILL BE USED FOR THIS PART LATER ON. BUT HERE IS A BASIC IMPLEMENTATION THAT WORKS
# Build two subsets of house_data: houses that satisfy every user preference
# and houses that satisfy at least one, then stack them (full matches first).

# First-row value of every preference column, looked up once
preferred = {column: user_preferences[column].values[0] for column in user_preferences.columns}

# One boolean mask per attribute that has to equal the user's choice
exact_match_columns = ['state', 'baths', 'beds', 'cats', 'dogs', 'type', 'city', 'postal_code']
exact_masks = [house_data[column] == preferred[column] for column in exact_match_columns]

# Price window and minimum size form a single combined criterion; in the
# "some" filter this whole conjunction counts as one alternative.
within_price_and_size = (
    (house_data['list_price_min'] >= preferred['list_price_min'])
    & (house_data['list_price_max'] <= preferred['list_price_max'])
    & (house_data['sqft_max'] >= preferred['sqft'])
)

matches_all = within_price_and_size
matches_any = within_price_and_size
for mask in exact_masks:
    matches_all = matches_all & mask
    matches_any = matches_any | mask

# Matches all preferences
all_filtered_houses = house_data[matches_all]
# Matches some preferences
some_filtered_houses = house_data[matches_any]

filtered_houses = pd.concat([all_filtered_houses, some_filtered_houses], axis=0).drop_duplicates()

In [7]:
# Content based recommendation
def content_based(houses=None, liked=None):
    """Rank candidate houses by cosine similarity to the houses the user liked.

    Parameters
    ----------
    houses : pandas.DataFrame, optional
        Candidate houses with numeric feature columns and a ``property_id``
        column. Defaults to the notebook-level ``filtered_houses``.
    liked : pandas.DataFrame, optional
        Frame with a ``property_id`` column listing the liked houses.
        Defaults to the notebook-level ``user_liked_houses``.

    Returns
    -------
    list of tuple
        ``(row_label, score)`` pairs sorted by descending score, where
        ``row_label`` is the candidate's index label in ``houses`` and
        ``score`` is its highest cosine similarity to any liked house.
        Liked houses themselves are excluded. The list is empty when there
        are no candidates or none of the liked houses is among ``houses``.
    """
    if houses is None:
        houses = filtered_houses
    if liked is None:
        liked = user_liked_houses

    # Each feature is listed once; repeating a column would silently give it
    # extra weight in the cosine similarity.
    features = ['state', 'city', 'postal_code', 'baths', 'cats', 'dogs', 'beds',
                'list_price_min', 'list_price_max', 'sqft_max', 'type']

    if houses.empty:
        return []

    # Pairwise similarity between all candidate houses (rows are positional)
    feature_matrix = houses[features].to_numpy(dtype=float)
    similarity_scores = cosine_similarity(feature_matrix)

    # Work with row *positions*: the frame's index labels are strings that no
    # longer line up with row order once houses have been dropped or stacked,
    # so they cannot be used to address rows of the similarity matrix.
    is_liked = houses['property_id'].isin(liked['property_id']).to_numpy()
    liked_positions = np.flatnonzero(is_liked)
    candidate_positions = np.flatnonzero(~is_liked)  # exclude already liked houses
    if liked_positions.size == 0 or candidate_positions.size == 0:
        return []

    # Score every candidate by its best similarity to any of the liked houses
    scores = similarity_scores[np.ix_(liked_positions, candidate_positions)].max(axis=0)

    # Rank the candidates, highest score first (stable for ties)
    order = np.argsort(-scores, kind='stable')
    ranked_houses = [
        (houses.index[candidate_positions[k]], float(scores[k]))
        for k in order
    ]
    return ranked_houses


# Each entry is (row label in filtered_houses, similarity score); the label can
# be used with `.loc` on the house DataFrame to look up the house details.
ranked_recommendations = content_based()
# 7,12,85,53,18,71,52,8,65,17,12,43
# TODO: IMPLEMENT TEST CASES AND EVALUATE AND CALCULATE THE ACCURACY OF THE MODEL