In [3]:
import json
import re

import pandas as pd
import numpy as np

**Reading Data Files**

In [4]:
building_meta_df =  pd.read_csv("buildings.csv")
building_info_df = pd.read_csv("building_info.csv")
building_price_history_df = pd.read_csv("building_pricing_history.csv")
building_unit_df = pd.read_csv("building_unit_data.csv")

**Listing Column Names with Null Values Count**

In [None]:
building_meta_df.isnull().sum()

In [None]:
building_info_df.isna().sum()

In [None]:
building_price_history_df.isna().sum()

In [None]:
building_unit_df.isna().sum()

**Listing DataFrame Shapes**

In [None]:
building_meta_df.shape

In [None]:
building_info_df.shape

In [None]:
building_price_history_df.shape

In [None]:
building_unit_df.shape

**Merging building_meta_df, building_info_df and building_unit_df**

In [5]:
merged_df = pd.merge(building_meta_df, building_info_df, left_on='id', right_on='id', how='left')
merged_df.rename(columns={'id': 'building_id'}, inplace=True)
merged_df = pd.merge(merged_df, building_unit_df, left_on='building_id', right_on='building_id', how='inner')

In [None]:
merged_df.shape

**Calculating Average Price neighborhood_name**

In [6]:
merged_df['average_price_by_neighborhood'] = merged_df.groupby(['neighborhood_name', 'beds'])['price'].transform('mean')

**Parse nearest_public_transportation into public_transportation and public_transportation_time**

In [7]:
def parse_public_transportation(row):
    try:
        transportation = json.loads(row['nearest_public_transportation'])[0]
        transportation = re.sub(r'\(.*\)|\d+[\s-]+(?:min).*$', '', transportation)
        return re.sub(r'[\s-]+$', '', transportation)
    except:
        return

def parse_public_transportation_time(row):
    try:
        transportation = json.loads(row['nearest_public_transportation'])[0]
        time = re.findall(r'(\d+)[\s-]+min', transportation)
        return time and int(time[0]) or None
    except:
        return

In [8]:
merged_df['public_transportation'] = merged_df.apply(parse_public_transportation, axis=1)
merged_df['public_transportation_time'] = merged_df.apply(parse_public_transportation_time, axis=1)

**Parse nearest_convenience_store into convenience_store and convenience_store_time**

In [9]:
def parse_convenience_store_time(row):
    times = []
    try:
        for _store in json.loads(row['nearest_convenience_store']):
            time = re.findall(r'(\d+)[\s-]+min', _store)
            
            if not time:
                continue
            
            times.append(int(time[0]))

        return min(times)
    except:
        return
    
def parse_convenience_store(row):
    min_time = parse_convenience_store_time(row)
    try:
        for _store in json.loads(row['nearest_convenience_store']):
            time = re.findall(r'(\d+)[\s-]+min', _store)
            time = time and int(time[0]) or None
            
            if time != min_time:
                continue

            return re.sub(r'[\s-]+$', '', re.sub(r'\(.*\)|\d+[\s-]+(?:min).*$', '', _store))
        return json.loads(row['nearest_convenience_store'])[0]
    except:
        return 

In [10]:
merged_df['convenience_store'] = merged_df.apply(parse_convenience_store, axis=1)
merged_df['convenience_store_time'] = merged_df.apply(parse_convenience_store_time, axis=1)

**Parse nearest_grocery_store into grocery_store and grocery_store_time**

In [11]:
def parse_grocery_store_time(row):
    times = []
    try:
        for _store in json.loads(row['nearest_grocery_store']):
            time = re.findall(r'(\d+)[\s-]+min', _store)
            
            if not time:
                continue
            
            times.append(int(time[0]))

        return min(times)
    except:
        return
    
def parse_grocery_store(row):
    min_time = parse_grocery_store_time(row)
    try:
        for _store in json.loads(row['nearest_grocery_store']):
            time = re.findall(r'(\d+)[\s-]+min', _store)
            time = time and int(time[0]) or None
            
            if time != min_time:
                continue

            return re.sub(r'[\s-]+$', '', re.sub(r'\(.*\)|\d+[\s-]+(?:min).*$', '', _store))
        return json.loads(row['nearest_grocery_store'])[0]
    except:
        return 

In [12]:
merged_df['grocery_store'] = merged_df.apply(parse_grocery_store, axis=1)
merged_df['grocery_store_time'] = merged_df.apply(parse_grocery_store_time, axis=1)

**Map unit_0_balcony, unit_1_balcony, unit_2_balcony, unit_2_balcony to unit_balcony**

In [13]:
def map_balcony(row):
    
    if row['unit_0_balcony'] == 'Yes' and row['beds'] == 0:
        return 1
    
    if row['unit_1_balcony'] == 'Yes' and row['beds'] == 1:
        return 1
    
    if row['unit_2_balcony'] == 'Yes' and row['beds'] == 2:
        return 1
    
    if row['unit_3_balcony'] == 'Yes' and row['beds'] == 3:
        return 1
      
    if row['beds'] > 3:
        
        if 'Yes' in [row['unit_0_balcony'],
                     row['unit_1_balcony'],
                     row['unit_2_balcony'],
                     row['unit_3_balcony']]:
            return 1
        return 0
    
    return 0

In [15]:
merged_df['unit_balcony'] = merged_df.apply(map_balcony, axis=1)

**Map concession_0_bed, concession_1_bed, concession_2_bed, concession_3_bed to concession_beds**

In [16]:
def map_concession(row):
    
    if row['beds'] == 0 and  np.isnan(row['concession_0_bed']) == False:
        return row['concession_0_bed']
    
    if  row['beds'] == 1 and  np.isnan(row['concession_1_bed']) == False:
        return row['concession_1_bed']
    
    if  row['beds'] == 2 and  np.isnan(row['concession_2_bed']) == False:
        return row['concession_2_bed']
    
    if  row['beds'] == 3 and  np.isnan(row['concession_3_bed']) == False:
        return row['concession_3_bed']
    
    if row['beds'] > 3:
        min_concession = min([row['concession_0_bed'],
                              row['concession_1_bed'],
                              row['concession_2_bed'],
                              row['concession_3_bed']])
        return min_concession
    
    return None

In [17]:
merged_df['concession_unit'] = merged_df.apply(map_concession, axis=1)
merged_df['concession_unit'].unique()


array([ 0. ,  1. ,  nan,  2. ,  1.5,  3. ,  2.5, 20. ,  0.5])

In [18]:
merged_df['concession_unit'].fillna(merged_df['concession_unit'].mean(), inplace=True)

**Handling Missing Value**

In [19]:
def map_washer_dryer(row):
    if not row['washer_dryer'] in [None, 'No']:
        return 0

    if not row['washer_dryer'] in ['Yes', 'Select Units']:
        return 1
    
    return 0

In [20]:
merged_df['washer_dryer'] = merged_df.apply(map_washer_dryer, axis=1)

In [21]:
def map_pool(row):
    if not row['pool'] in [None, 'No']:
        return 0

    if not row['pool'] == 'Yes':
        return 1
    
    return 0

In [22]:
merged_df['pool'] = merged_df.apply(map_pool, axis=1)

In [23]:
def map_parking(row):
    parking_terms = [
        'Yes', 'Garage Parking', 'Attached Parking', 
        'Offsite Parking', 'Select Units', 'Assigned Parking'
    ]
    
    if not row['parking'] in [None, 'No']:
        return 0

    if not row['parking'] in parking_terms:
        return 1
    
    return 0

In [24]:
merged_df['parking'] = merged_df.apply(map_parking, axis=1)

In [25]:
merged_df.tour_sunday.unique()

array([False, nan, True], dtype=object)

In [26]:
def map_tour_sunday(row):
    if not row['tour_sunday'] in [None, False]:
        return 0

    if not row['tour_sunday'] == True:
        return 1
    
    return 0
    

In [27]:
merged_df['tour_sunday'] = merged_df.apply(map_parking, axis=1)

In [28]:
merged_df.short_term_lease.unique()

array([nan, 'No', 'Sometimes', 'Yes', 'During Peak Season'], dtype=object)

In [29]:
def map_short_term_lease(row):
    if not row['short_term_lease'] in [None, 'No']:
        return 0

    if not row['short_term_lease'] == ['Sometimes', 'Yes', 'During Peak Season']:
        return 1
    
    return 0
    

In [30]:
merged_df['short_term_lease'] = merged_df.apply(map_parking, axis=1)

In [31]:
merged_df['cooperate'].unique()

array([ True, False])

In [32]:
merged_df['cooperate'] = merged_df['cooperate'].map(lambda x: 1 if x == True else 0)

In [33]:
merged_df['grocery_store'].replace([None], 'Unknown', inplace=True)
merged_df['grocery_store_time'].fillna(merged_df['grocery_store_time'].mean(), inplace=True)

In [34]:
merged_df['convenience_store'].replace([None], 'Unknown', inplace=True)
merged_df['convenience_store_time'].fillna(merged_df['convenience_store_time'].mean(), inplace=True)

In [35]:
merged_df['public_transportation'].replace([None], 'Unknown', inplace=True)
merged_df['public_transportation_time'].fillna(merged_df['public_transportation_time'].mean(), inplace=True)

**Label Encoding**

In [36]:
grocery_store_df = pd.get_dummies(merged_df[['grocery_store']], columns=["grocery_store"])
merged_df = merged_df.join(grocery_store_df)

In [37]:
convenience_store_df = pd.get_dummies(merged_df[['convenience_store']], columns=["convenience_store"])
merged_df = merged_df.join(convenience_store_df)

In [38]:
public_transportation_df = pd.get_dummies(merged_df[['public_transportation']], columns=["public_transportation"])
merged_df = merged_df.join(public_transportation_df)

In [41]:
merged_df.type.isna().sum()

414197

**Drop extra columns**

In [42]:
to_drop = ['scraped_on', 
           'scrape_id', 
           'hotspot_description',
           'website_data_in_images',
           'website_needs_ui_navigation', 
           'website_reachable',
           'scraper_complete',
           'data_source',
           #'leasing_type',
           #'architect',
           #'aptamigo',
           #'hotspot_features',
           'hotspot_neighbourhood',
           'hotspot',
           'balcony',
           'unit_0_balcony',
           'unit_convert_balcony',
           'unit_1_balcony',
           'unit_2_balcony',
           'unit_3_balcony',
           'concession_title',
           'concession_0_bed',
           'concession_1_bed',
           'concession_2_bed',
           'concession_3_bed',
           'concession_source',
           'nearest_public_transportation',
           'nearest_convenience_store',
           'nearest_grocery_store',
           #'unit_number',
           'type',
           'email',
           #'updated_on',
           #'created_on',
           #'created_at',
           'summary',
           'description',
           'address',
           'name',
           'website',
           'phone',
           'email',
           'zip',
           #'amenities',
           #'no_of_units',
          ]

In [43]:
merged_df.drop(columns=to_drop, inplace=True)
merged_df.shape

(421620, 143)

In [45]:
merged_df.to_csv('cleaned_output.csv', index=False)

In [46]:
merged_df.head()

Unnamed: 0,building_id,city,state,lat,lng,created_at,cooperate,company_id,company,pet_policy,...,public_transportation_Red Line - Lasalle/Division,public_transportation_Red Line - Loyola (,public_transportation_Red Line - Roosevelt,public_transportation_Red Line- Chicago,public_transportation_Red Line- Grand,public_transportation_Red Line- Roosevelt,public_transportation_Red/Brown/Purple Line - Belmont,public_transportation_Red/Brown/Purple Line - Fullerton,public_transportation_Red/Green/Pink Line - Roosevelt,public_transportation_Unknown
0,216,Chicago,IL,41.889181,-87.618089,2019-08-17 02:47:14,1,333.0,Lincoln Property Company,Allowed with fee and Restrictions,...,0,0,0,0,0,0,0,0,0,1
1,216,Chicago,IL,41.889181,-87.618089,2019-08-17 02:47:14,1,333.0,Lincoln Property Company,Allowed with fee and Restrictions,...,0,0,0,0,0,0,0,0,0,1
2,216,Chicago,IL,41.889181,-87.618089,2019-08-17 02:47:14,1,333.0,Lincoln Property Company,Allowed with fee and Restrictions,...,0,0,0,0,0,0,0,0,0,1
3,216,Chicago,IL,41.889181,-87.618089,2019-08-17 02:47:14,1,333.0,Lincoln Property Company,Allowed with fee and Restrictions,...,0,0,0,0,0,0,0,0,0,1
4,216,Chicago,IL,41.889181,-87.618089,2019-08-17 02:47:14,1,333.0,Lincoln Property Company,Allowed with fee and Restrictions,...,0,0,0,0,0,0,0,0,0,1
