### Preprocess Data

In [107]:
import requests
import pandas as pd
import numpy as np 
# Fetch the property record from the local FastAPI service.
# The timeout keeps this cell from hanging forever when the service is down.
response = requests.get("http://localhost:8000/get_data/", timeout=10)

# Stop here on failure: merely printing a message would leave `dataframe`
# undefined (or stale from an earlier run) for every cell that follows.
if response.status_code != 200:
    raise RuntimeError(f"Failed to retrieve data (HTTP {response.status_code})")

data = response.json()
dataframe = pd.DataFrame(data)
display(dataframe)

Unnamed: 0,Number of Buildings,Bathrooms,Square Footage,Number of Units,Property Type,Median Income,Housing Cost (%),Building Age,Improvement Value
0,3,2,447,2,Single Family Residential,324,40,71,432


In [108]:
# Show the frame built from the API response again (same object the fetch cell displayed).
dataframe

Unnamed: 0,Number of Buildings,Bathrooms,Square Footage,Number of Units,Property Type,Median Income,Housing Cost (%),Building Age,Improvement Value
0,3,2,447,2,Single Family Residential,324,40,71,432


In [109]:
def map_property_type(df, column_name="Property Type"):
    """Encode the property-type label and add log-scaled value/area features.

    The work is done on a copy, so the frame passed in is never modified;
    callers must use the returned frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``column_name``, "Improvement Value" and "Square Footage".
    column_name : str, default "Property Type"
        Column holding the property-type labels to encode.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which "Property Type" holds the integer code of the
        label (NaN for labels that are not in the mapping), plus the new
        columns "Log_Improvement_Value" and "Log_Square_Footage", each the
        ``log1p`` of its source column.
    """
    mapping = {
        "Single Family Residential": 3,
        "Low Density Residential": 2,
        "Condominium": 1
    }

    # Copy first: mutating the caller's frame would silently change data that
    # an earlier cell has already displayed.
    encoded = df.copy()

    # The codes are always written to "Property Type", whichever column the
    # labels were read from.
    encoded["Property Type"] = encoded[column_name].map(mapping)
    encoded['Log_Improvement_Value'] = np.log1p(encoded['Improvement Value'])
    encoded['Log_Square_Footage'] = np.log1p(encoded['Square Footage'])

    return encoded

In [None]:
# Encode the property type of the fetched record and attach its log features.
dataframe = map_property_type(dataframe)

# Load the LA prices / census CSV from the local data folder.
test_data = pd.read_csv("data/LA Prices 2019-2023 and Census.csv")

In [115]:
# Keep only the two columns needed to build one row per zip code.
sample = test_data[["Zip Code", "Roll Year"]]

# Mean roll year per zip code, with "Zip Code" restored as an ordinary column.
# NOTE(review): the mean is fractional (e.g. 2021.011143 in the output below);
# confirm that the model expects a non-integer "Roll Year".
sample_df = (
    sample
    .groupby("Zip Code")
    .mean()
    .reset_index()
)
sample_df

Unnamed: 0,Zip Code,Roll Year
0,90001,2021.011143
1,90002,2021.008335
2,90003,2021.008422
3,90004,2021.002186
4,90005,2021.037239
...,...,...
276,93551,2021.006554
277,93552,2021.008169
278,93553,2021.024806
279,93563,2020.520833


In [116]:
# Pair the single fetched property record with every zip code.
# The two frames are glued side by side by position, so `dataframe` must hold
# exactly one row. With any other length (for example when this cell is re-run
# after a later cell has rebound `dataframe` to the per-zip frame) the tiled
# block no longer lines up with `sample_df` and rows are silently padded with NaN.
if len(dataframe) != 1:
    raise ValueError(
        f"Expected exactly one property record in `dataframe`, got {len(dataframe)} rows; "
        "re-run the fetch and preprocessing cells before this one."
    )

# Tile the record once per zip code, renumbering rows 0..n-1 to match `sample_df`.
repeated_df = pd.concat([dataframe] * len(sample_df), ignore_index=True)

# Concatenate along columns
combined = pd.concat([sample_df, repeated_df], axis=1)
combined

Unnamed: 0,Zip Code,Roll Year,Number of Buildings,Bathrooms,Square Footage,Number of Units,Property Type,Median Income,Housing Cost (%),Building Age,Improvement Value,Log_Improvement_Value,Log_Square_Footage
0,90001,2021.011143,3,2,447,2,3,324,40,71,432,6.070738,6.104793
1,90002,2021.008335,3,2,447,2,3,324,40,71,432,6.070738,6.104793
2,90003,2021.008422,3,2,447,2,3,324,40,71,432,6.070738,6.104793
3,90004,2021.002186,3,2,447,2,3,324,40,71,432,6.070738,6.104793
4,90005,2021.037239,3,2,447,2,3,324,40,71,432,6.070738,6.104793
...,...,...,...,...,...,...,...,...,...,...,...,...,...
276,93551,2021.006554,3,2,447,2,3,324,40,71,432,6.070738,6.104793
277,93552,2021.008169,3,2,447,2,3,324,40,71,432,6.070738,6.104793
278,93553,2021.024806,3,2,447,2,3,324,40,71,432,6.070738,6.104793
279,93563,2020.520833,3,2,447,2,3,324,40,71,432,6.070738,6.104793


In [117]:
# Switch to the column names that `feature_columns` uses for the model inputs.
# Note that this rebinds `dataframe` from the single fetched record to the
# one-row-per-zip-code frame.
model_column_names = {
    "Housing Cost (%)": "Housing Cost % of Income",
    "Property Type": "Property Use Type Encoded",
}
dataframe = combined.rename(columns=model_column_names)

In [118]:
# Inspect the renamed per-zip-code frame before it is passed to the model.
dataframe

Unnamed: 0,Zip Code,Roll Year,Number of Buildings,Bathrooms,Square Footage,Number of Units,Property Use Type Encoded,Median Income,Housing Cost % of Income,Building Age,Improvement Value,Log_Improvement_Value,Log_Square_Footage
0,90001,2021.011143,3,2,447,2,3,324,40,71,432,6.070738,6.104793
1,90002,2021.008335,3,2,447,2,3,324,40,71,432,6.070738,6.104793
2,90003,2021.008422,3,2,447,2,3,324,40,71,432,6.070738,6.104793
3,90004,2021.002186,3,2,447,2,3,324,40,71,432,6.070738,6.104793
4,90005,2021.037239,3,2,447,2,3,324,40,71,432,6.070738,6.104793
...,...,...,...,...,...,...,...,...,...,...,...,...,...
276,93551,2021.006554,3,2,447,2,3,324,40,71,432,6.070738,6.104793
277,93552,2021.008169,3,2,447,2,3,324,40,71,432,6.070738,6.104793
278,93553,2021.024806,3,2,447,2,3,324,40,71,432,6.070738,6.104793
279,93563,2020.520833,3,2,447,2,3,324,40,71,432,6.070738,6.104793


In [119]:

# Columns selected from `dataframe`, in this order, when calling model.predict.
# Raw "Improvement Value" and "Square Footage" are left out; only their
# log1p-transformed versions are passed on.
feature_columns = [
    "Zip Code", "Roll Year",
    "Number of Buildings", "Bathrooms", "Number of Units",
    "Property Use Type Encoded",
    "Median Income", "Housing Cost % of Income",
    "Building Age",
    "Log_Improvement_Value", "Log_Square_Footage",
]

In [120]:
import joblib

# Load the serialized model from the working directory.
# joblib files are pickle-based, so only load model files from a trusted source.
model = joblib.load("lgbm_model_full.joblib")

# Cast the identifier-like columns to pandas categoricals before predicting.
# presumably the model was trained with these two columns as categoricals; TODO confirm
categorical_cols = ["Zip Code", "Property Use Type Encoded"]
for categorical_col in categorical_cols:
    dataframe[categorical_col] = dataframe[categorical_col].astype("category")

# Score one row per zip code, using only the model's input columns.
model_inputs = dataframe[feature_columns]
predictions = model.predict(model_inputs)

# Display results
# NOTE(review): the values shown below are around 11-12, which looks like a
# log-scale target; confirm whether they need to be transformed back before
# being presented as prices.
dataframe["Predicted Price"] = predictions
dataframe

Unnamed: 0,Zip Code,Roll Year,Number of Buildings,Bathrooms,Square Footage,Number of Units,Property Use Type Encoded,Median Income,Housing Cost % of Income,Building Age,Improvement Value,Log_Improvement_Value,Log_Square_Footage,Predicted Price
0,90001,2021.011143,3,2,447,2,3,324,40,71,432,6.070738,6.104793,11.262228
1,90002,2021.008335,3,2,447,2,3,324,40,71,432,6.070738,6.104793,11.601475
2,90003,2021.008422,3,2,447,2,3,324,40,71,432,6.070738,6.104793,11.364162
3,90004,2021.002186,3,2,447,2,3,324,40,71,432,6.070738,6.104793,11.971738
4,90005,2021.037239,3,2,447,2,3,324,40,71,432,6.070738,6.104793,12.360485
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
276,93551,2021.006554,3,2,447,2,3,324,40,71,432,6.070738,6.104793,11.414935
277,93552,2021.008169,3,2,447,2,3,324,40,71,432,6.070738,6.104793,10.843199
278,93553,2021.024806,3,2,447,2,3,324,40,71,432,6.070738,6.104793,11.448421
279,93563,2020.520833,3,2,447,2,3,324,40,71,432,6.070738,6.104793,11.448421
