# DATA CLEANING AND PREPARATION: 
1. DROP COLUMNS WITH UNUSED DATA
2. DROP ROWS WHERE LISTING PROPERTY IS NOT SINGLE FAMILY
3. DROP ROWS WHERE REQUIRED DATA IS MISSING
4. REMOVE REPEAT PROPERTY LISTINGS
5. ESTIMATE 'NAN' VALUES
6. CALCULATE DAYS ON MARKET
7. CALCULATE AMENITIES TO PRICE RATIO
8. CALCULATE PRICE PER SQUARE FOOT
9. CALCULATE PRICE PER LOT SQUARE FOOT
10. CALCULATE TOTAL BATHS
11. CALCULATE PRICE PER SQUARE FOOT RELATIVE TO NEIGHBORHOOD
12. CALCULATE SOLD PRICE AND LIST PRICE RATIO (<1 MEANS HOUSE SOLD FOR LESS THAN LISTED)
13. CLASSIFY DAYS ON MARKET AS QUARTILES
14. CLASSIFY LIST PRICE AS QUARTILES
15. CLASSIFY YES/NO HOA FEE (1=YES, 0=NO)
16. CLASSIFY SEASON LISTING WAS POSTED
17. CLASSIFY SOLD/LIST RATIO AS POSITIVE (2), EQUAL (1), OR NEGATIVE (0)
18. ENCODE CATEGORICAL VARIABLES: SEASON LISTED AND NEIGHBOURHOOD

In [1]:
import pandas as pd
import os
import sys
import statistics
import numpy as np
import geopandas as gpd
from datetime import datetime
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from shapely import wkt


  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (

import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd


In [2]:
# Show up to 500 columns when a DataFrame is displayed.
pd.options.display.max_columns = 500

In [3]:
# Read the source CSV into the working DataFrame `df`.
df = pd.read_csv(filepath_or_buffer="../data/Los_Angeles_Sold_spatial_join.csv")

In [4]:
# 1. DROP COLUMNS WITH UNUSED DATA
# Broker/agent contact fields and the county column are removed up front.
df = df.drop(
    labels=[
        'broker_website',
        'broker_phone',
        'broker',
        'agent_phones',
        'agent_email',
        'agent',
        'county',
    ],
    axis='columns',
)


In [5]:
# 2. DROP ROWS WHERE LISTING PROPERTY IS NOT SINGLE FAMILY
# Keep only rows whose `style` is exactly 'SINGLE_FAMILY'.
df = df.loc[df['style'].eq('SINGLE_FAMILY')]


In [6]:
# 3. DROP ROWS WHERE REQUIRED DATA IS MISSING
# First inspect the percentage of missing values per column (2 decimals).
(df.isnull().sum() / df.shape[0] * 100).round(2)

property_url         0.00
mls                  0.00
mls_id               0.00
status               0.00
text                 1.06
style                0.00
full_street_line     0.05
street               0.03
unit                99.81
city                 0.00
state                0.00
zip_code             0.00
beds                 0.00
full_baths           0.90
half_baths          77.43
sqft                 0.67
year_built           0.34
days_on_mls          9.70
list_price           0.00
list_date            0.00
sold_price           0.00
last_sold_date       0.00
assessed_value       1.25
estimated_value      1.68
lot_sqft             0.13
price_per_sqft       0.67
latitude             0.00
longitude            0.00
neighborhoods        0.08
fips_code            0.00
stories              0.02
hoa_fee              0.13
parking_garage      24.06
nearby_schools       1.52
primary_photo        2.63
alt_photos           2.63
geometry             0.00
neighbourhood        0.00
dtype: float

In [7]:
# Discard every row that is missing at least one of these fields.
df = df.dropna(
    axis='index',
    how='any',
    subset=['latitude', 'longitude', 'sqft', 'year_built', 'full_baths', 'lot_sqft'],
)


In [8]:
# 4. REMOVE REPEAT PROPERTY LISTINGS
# Rows sharing the same street line and unit count as repeats; only the first
# occurrence of each is kept.
df = df[~df.duplicated(subset=['full_street_line', 'unit'], keep='first')]


In [9]:
# 5. ESTIMATE 'NAN' VALUES

# A missing value in these columns is assumed to mean:
#   stories        -> 1
#   hoa_fee        -> 0
#   parking_garage -> 0
#   half_baths     -> 0
df = df.fillna(value={'stories': 1, 'hoa_fee': 0, 'parking_garage': 0, 'half_baths': 0})


In [10]:
#6. CALCULATE DAYS ON MARKET

def days_between_dates(date1, date2):
    """Return the absolute number of whole days between two date strings.

    Args:
        date1: Date string in ``YYYY-MM-DD`` format.
        date2: Date string in ``YYYY-MM-DD`` format.

    Returns:
        Non-negative int; the order of the two dates does not matter.

    Raises:
        ValueError: If either string does not match ``%Y-%m-%d``.
    """
    # Parse date1 first, then date2, so a malformed date1 is reported first.
    start, end = (datetime.strptime(text, "%Y-%m-%d") for text in (date1, date2))
    return abs((end - start).days)

In [11]:
# One value per row: absolute day count between list_date and last_sold_date.
df['days_on_market'] = [
    days_between_dates(listed, sold)
    for listed, sold in zip(df['list_date'], df['last_sold_date'])
]

In [12]:
# 7. CALCULATE AMENITIES TO PRICE RATIO
# Only amenities-per-dollar is computed; the inverse (price per amenity) is
# skipped because an amenity count of zero would mean dividing by zero.
df["bedcostratio"] = df["beds"].div(df["sold_price"])
df["bathcostratio"] = (df["full_baths"] + df["half_baths"]).div(df["sold_price"])
df["bathbedcostratio"] = (
    df["full_baths"] + df["half_baths"] + df["beds"]
).div(df["sold_price"])
df["amenitiescostratio"] = (
    df["full_baths"] + df["half_baths"] + df["beds"] + df["parking_garage"]
).div(df["sold_price"])

In [13]:
# 8. CALCULATE PRICE PER SQUARE FOOT
# Sold price divided by interior square footage.
df["pricepersqft"] = df["sold_price"].div(df["sqft"])

In [14]:
# 9. CALCULATE PRICE PER LOT SQUARE FOOT
# Sold price divided by lot square footage.
df["priceperlotsqft"] = df["sold_price"].div(df["lot_sqft"])

In [15]:
# 10. CALCULATE TOTAL BATHS
# Full and half baths are summed as plain counts (a half bath adds 1).
df["total_baths"] = df["full_baths"].add(df["half_baths"])

In [16]:
# 11. CALCULATE PRICE PER SQUARE FOOT RELATIVE TO NEIGHBORHOOD
# Mean of the source `price_per_sqft` column within each row's neighbourhood,
# returned row-aligned with df so it can be used directly as a divisor.
neighborhood_avg_price = df['price_per_sqft'].groupby(df['neighbourhood']).transform('mean')
df['relative_price_per_sqft'] = df['price_per_sqft'].div(neighborhood_avg_price)


In [17]:
# 12. CALCULATE SOLD PRICE AND LIST PRICE RATIO
# Values below 1 mean the sold price was lower than the list price.
df["sold_list_ratio"] = df["sold_price"].div(df["list_price"])

In [18]:
#13. CLASSIFY DAYS ON MARKET AS QUARTILES
def classify_quartile(value, q1=None, q2=None, q3=None):
    """Label a value 'Q1'..'Q4' according to three ascending cut points.

    Args:
        value: Number to classify.
        q1: Upper bound (inclusive) of 'Q1'. Defaults to the notebook-level
            ``Q1`` when omitted.
        q2: Upper bound (inclusive) of 'Q2'. Defaults to the notebook-level
            ``Q2`` when omitted.
        q3: Upper bound (inclusive) of 'Q3'. Defaults to the notebook-level
            ``Q3`` when omitted.

    Returns:
        'Q1', 'Q2', 'Q3' or 'Q4'. A value that compares false against every
        cut point (e.g. NaN) ends up as 'Q4'.
    """
    # The notebook rebinds Q1/Q2/Q3 for each column it classifies, so passing
    # the cut points explicitly avoids picking up another column's thresholds.
    # The globals remain the fallback so `.apply(classify_quartile)` still works.
    q1 = Q1 if q1 is None else q1
    q2 = Q2 if q2 is None else q2
    q3 = Q3 if q3 is None else q3

    if value <= q1:
        return 'Q1'
    if value <= q2:
        return 'Q2'
    if value <= q3:
        return 'Q3'
    return 'Q4'
    
# Cut points at the 25th/50th/75th percentiles of days_on_market.
# classify_quartile reads them from the notebook-level names Q1, Q2 and Q3.
Q1, Q2, Q3 = (df['days_on_market'].quantile(q) for q in (0.25, 0.50, 0.75))

df['days_on_market_quartile'] = df['days_on_market'].map(classify_quartile)

In [19]:
# 14. CLASSIFY LIST PRICE AS QUARTILES
# Rebind Q1/Q2/Q3 to the list_price percentiles before classifying, because
# classify_quartile reads its cut points from those notebook-level names.
Q1, Q2, Q3 = (df['list_price'].quantile(q) for q in (0.25, 0.50, 0.75))
df['list_price_quartile'] = df['list_price'].map(classify_quartile)


In [20]:
# 15. CLASSIFY YES/NO HOA FEE (1=YES, 0=NO)
# A strictly positive fee marks the listing as having an HOA (1); anything
# else (zero, negative, or missing) is 0. The previous `else x` branch passed
# the raw fee through for non-positive values, so the flag was not guaranteed
# to be 0/1. The boolean comparison is cast to int so the column holds only
# the integers 0 and 1.
df["hashoa"] = (df["hoa_fee"] > 0).astype(int)


In [21]:
# 16. CLASSIFY SEASON LISTING WAS POSTED
# Parse list_date into pandas datetimes; unparseable values raise.
df['datetime'] = pd.to_datetime(df['list_date'], errors='raise')
def get_season(date):
    """Return the season name for a date.

    Seasons use fixed calendar boundaries, inclusive on both ends:

        Spring: Mar 20 - Jun 20
        Summer: Jun 21 - Sep 21
        Fall:   Sep 22 - Dec 20
        Winter: Dec 21 - Mar 19 (wraps over the new year)

    Only the month and day are compared. A timestamp that carries a
    time-of-day on the last day of a season therefore stays in that season
    instead of falling through to 'Winter'.

    Args:
        date: Object exposing ``.month`` and ``.day`` (e.g. a
            ``pandas.Timestamp``).

    Returns:
        One of 'Winter', 'Spring', 'Summer', 'Fall'.

    Raises:
        ValueError: If ``date`` is missing (None/NaN/NaT).
    """
    # A missing date has no month/day; fail loudly rather than let it fall
    # through the comparisons below and be labelled 'Winter'.
    if pd.isna(date):
        raise ValueError("cannot determine season of a missing date")

    month_day = (date.month, date.day)
    if (3, 20) <= month_day <= (6, 20):
        return 'Spring'
    if (6, 21) <= month_day <= (9, 21):
        return 'Summer'
    if (9, 22) <= month_day <= (12, 20):
        return 'Fall'
    # Everything else is Dec 21 - Dec 31 or Jan 1 - Mar 19.
    return 'Winter'

# Label every parsed listing date with its season name.
df['season_listed'] = df['datetime'].map(get_season)

In [22]:
# 17. CLASSIFY SOLD/LIST RATIO AS POSITIVE (2), EQUAL (1), OR NEGATIVE (0)
def classify_sold_list_ratio(ratio):
    """Return 2 if ratio > 1, 1 if ratio == 1, otherwise 0."""
    if ratio > 1:
        return 2
    if ratio == 1:
        return 1
    return 0


df["sold_list_ratio_classified"] = df["sold_list_ratio"].apply(classify_sold_list_ratio)


In [23]:
# 18. ENCODE CATEGORICAL VARIABLES: SEASON LISTED AND NEIGHBOURHOOD
# Each column gets its own encoder. Refitting a single LabelEncoder on the
# second column would overwrite the fitted classes of the first, making it
# impossible to map season codes back to season names afterwards.
season_encoder = LabelEncoder()
df['season_listed_encoded'] = season_encoder.fit_transform(df['season_listed'])

neighborhood_encoder = LabelEncoder()
df['neighborhood_encoded'] = neighborhood_encoder.fit_transform(df['neighbourhood'])

# Backward compatibility: `label_encoder` used to end this cell fitted on
# 'neighbourhood', so keep that name bound to the neighbourhood encoder.
label_encoder = neighborhood_encoder


In [24]:
# CHECK DATA
# Preview the first five rows of the prepared table.
df.head(n=5)

Unnamed: 0,property_url,mls,mls_id,status,text,style,full_street_line,street,unit,city,state,zip_code,beds,full_baths,half_baths,sqft,year_built,days_on_mls,list_price,list_date,sold_price,last_sold_date,assessed_value,estimated_value,lot_sqft,price_per_sqft,latitude,longitude,neighborhoods,fips_code,stories,hoa_fee,parking_garage,nearby_schools,primary_photo,alt_photos,geometry,neighbourhood,days_on_market,bedcostratio,bathcostratio,bathbedcostratio,amenitiescostratio,pricepersqft,priceperlotsqft,total_baths,relative_price_per_sqft,sold_list_ratio,days_on_market_quartile,list_price_quartile,hashoa,datetime,season_listed,sold_list_ratio_classified,season_listed_encoded,neighborhood_encoded
1,https://www.realtor.com/realestateandhomes-det...,WECA,24-396131,SOLD,"Welcome to 2445 Nalin Drive, a serene retreat ...",SINGLE_FAMILY,2445 Nalin Dr,2445 Nalin Dr,,Los Angeles,CA,90077,3.0,1.0,1.0,2307.0,1965.0,93.0,1899000,2024-05-28,2306600,2024-08-29,490972.0,1967285.0,15511.0,1000.0,34.120675,-118.466681,"Bel Air, Westside LA",6037.0,2.0,0.0,2.0,Los Angeles Unified School District,http://ap.rdcpix.com/fc5c6498fbb46988960bc4e5f...,http://ap.rdcpix.com/fc5c6498fbb46988960bc4e5f...,POINT (-118.466681 34.120675),Bel-Air,93,1.300616e-06,8.670771e-07,2e-06,3e-06,999.826615,148.707369,2.0,0.86567,1.214639,Q4,Q4,0.0,2024-05-28,Spring,2,1,6
2,https://www.realtor.com/realestateandhomes-det...,MRCA,BB24116300,SOLD,Your new home is here! This fully renovated Me...,SINGLE_FAMILY,20368 Via Medici,20368 Via Medici,,Porter Ranch,CA,91326,5.0,4.0,1.0,4116.0,2001.0,69.0,2399000,2024-06-21,2350000,2024-08-29,1051086.0,2370000.0,9890.0,571.0,34.294593,-118.578361,"Porter Ranch, North Valley",6037.0,2.0,340.0,3.0,Los Angeles Unified School District,,,POINT (-118.578361 34.294593),Chatsworth,69,2.12766e-06,2.12766e-06,4e-06,6e-06,570.942663,237.613751,5.0,1.023715,0.979575,Q3,Q4,1.0,2024-06-21,Summer,0,2,17
3,https://www.realtor.com/realestateandhomes-det...,MRCA,SR24103064,SOLD,Accepting Back Up offers- Charming Del Rey 2 b...,SINGLE_FAMILY,4320 Kenyon Ave,4320 Kenyon Ave,,Los Angeles,CA,90066,3.0,3.0,0.0,1813.0,1948.0,97.0,1800000,2024-05-24,1800000,2024-08-29,625810.0,1796577.0,6650.0,993.0,33.99186,-118.428001,"Silicon Beach, Marina del Rey",6037.0,1.0,0.0,2.0,Los Angeles Unified School District,http://ap.rdcpix.com/ca2b87ead2758f0c0114db943...,http://ap.rdcpix.com/ca2b87ead2758f0c0114db943...,POINT (-118.428001 33.99186),Del Rey,97,1.666667e-06,1.666667e-06,3e-06,4e-06,992.829564,270.676692,3.0,0.967012,1.0,Q4,Q3,0.0,2024-05-24,Spring,1,1,24
5,https://www.realtor.com/realestateandhomes-det...,MRCA,GD24148443,SOLD,This property is a fixer. The house consists o...,SINGLE_FAMILY,19350 Lanark St,19350 Lanark St,,Reseda,CA,91335,3.0,2.0,0.0,1296.0,1954.0,37.0,699000,2024-07-23,750000,2024-08-29,179184.0,735542.0,9771.0,579.0,34.217147,-118.555485,"South Valley, Reseda",6037.0,1.0,0.0,2.0,Valley International Preparatory High District...,http://ap.rdcpix.com/8a45673b3f9102336a3d9a067...,http://ap.rdcpix.com/8a45673b3f9102336a3d9a067...,POINT (-118.555485 34.217147),Reseda,37,4e-06,2.666667e-06,7e-06,9e-06,578.703704,76.757753,2.0,0.903609,1.072961,Q1,Q1,0.0,2024-07-23,Summer,2,2,82
6,https://www.realtor.com/realestateandhomes-det...,WECA,24-433325,SOLD,Sold off market. Property is posted for Compar...,SINGLE_FAMILY,S Saltair Ave,S Saltair Ave,,Los Angeles,CA,90049,3.0,4.0,0.0,2833.0,1928.0,0.0,4800000,2024-08-29,4800000,2024-08-29,328766.0,4591000.0,20067.0,1694.0,34.054641,-118.473162,"Brentwood, Westside LA",6037.0,1.0,0.0,0.0,"New West Charter District, Los Angeles Unified...",,,POINT (-118.473162 34.054641),Brentwood,0,6.25e-07,8.333333e-07,1e-06,1e-06,1694.316978,239.198684,4.0,1.251978,1.0,Q1,Q4,0.0,2024-08-29,Summer,1,2,11


In [25]:
# EXPORT TO CSV
# Write the prepared table to disk without the DataFrame index.
df.to_csv(path_or_buf='../data/Los_Angeles_Sold_CLEANED.csv', index=False)


In [26]:
# EXPORT TO GEOJSON

# Parse each WKT string in `geometry` into a geometry object, replacing the
# column on df itself.
df['geometry'] = df['geometry'].map(wkt.loads)

# Wrap the table as a GeoDataFrame with that column as the active geometry.
gdf = gpd.GeoDataFrame(data=df, geometry='geometry')

gdf.to_file(filename="../data/Los_Angeles_Sold.geojson", driver="GeoJSON")

In [28]:
# Print the listing URL column.
print(df['property_url'])

1       https://www.realtor.com/realestateandhomes-det...
2       https://www.realtor.com/realestateandhomes-det...
3       https://www.realtor.com/realestateandhomes-det...
5       https://www.realtor.com/realestateandhomes-det...
6       https://www.realtor.com/realestateandhomes-det...
                              ...                        
9869    https://www.realtor.com/realestateandhomes-det...
9870    https://www.realtor.com/realestateandhomes-det...
9872    https://www.realtor.com/realestateandhomes-det...
9875    https://www.realtor.com/realestateandhomes-det...
9880    https://www.realtor.com/realestateandhomes-det...
Name: property_url, Length: 6070, dtype: object
