In [1]:
import pandas as pd
import numpy as np
import scipy as sc
import datetime
import os
import re

# DECLARING GLOBAL VARIABLES

constants = set([ 'COMM', 'DUPLEX', 'GARDEN', 'PH', 'RES', 'TH', 'UNIT'])
remove = set(['FLR', 'FL', 'UNIT', 'APT'])
mod_listings = [] # np.zeros((len(listings), len(listings.iloc[0])+1))
mod_closings = [] # np.zeros((len(closings), len(closings.iloc[0])+1))

In [2]:
# READING CSV INTO DATAFRAME

def start():
    listings = pd.read_csv('listings.csv', header='infer', sep=',',index_col=None)
    closings = pd.read_csv('closings.csv', header='infer', sep=',',index_col=None)
    listings['list_date'] = pd.to_datetime(listings['list_date'], format="%Y-%m-%d").dt.floor('d')
    closings['close_date'] = pd.to_datetime(closings['close_date'], format="%Y-%m-%d")
    closings['close_price'] = closings['close_price'].astype('int64')
    listings = listings.dropna()
    closings = closings.dropna()

#     print(listings.dtypes)
#     print()
#     print(closings.dtypes)
    print(listings.head())
    print(closings.head())
    return listings, closings

In [3]:
# VALIDATION AND PARSING FUNCTIONS

def date_validation(listing_date, closing_date):
    return (listing_date < closing_date)

def price_validation(list_price, close_price):
    if close_price < (list_price * 1.4):
        return True
    return False

def parse_digit(apt, unit_nums, apt_nums):
    
    # 105, 1106
    if len(apt) > 2:
        unit_nums.append(str(int(apt[:-2]))) # remaining digits
        apt_nums.append(str(int(apt[-2:]))) # last 2 digits
    
    # 5, 11
    else:
        unit_nums.append("N/A")
        apt_nums.append(apt)
    
    return unit_nums, apt_nums

def parse_alphanumeric(apt, unit_nums, apt_nums):
    
    # APT 10, APT10, 3FLR, 
    for string in remove:
        if string in apt:
            apt = apt.replace(string, "")
            return parse_apartment_number(apt, unit_nums, apt_nums)

    # PH03, GARDENA
    for string in constants:
        if string in apt:
            unit_nums.append(string)
            apt = apt.replace(string, "")
            apt_nums.append(apt)
            return unit_nums, apt_nums
    
    if len(apt) > 0:
        # N6, 31C regex extract char, append resp
        r1 = re.compile("([0-9]+)([a-zA-Z]+)")
        r2 = re.compile("([a-zA-Z]+)([0-9]+)")
        m1 = r1.match(apt)
        m2 = r2.match(apt)
#         l = 0

        if m1:
            unit_nums.append(m1.group(1))
            apt_nums.append(m1.group(2))
            l = len(m1.group(1)) + len(m1.group(2))
#             if(l > 0):
#                 unit_nums, apt_nums = parse_alphanumeric(m1[-l:], unit_nums, apt_nums)
        elif m2:
            unit_nums.append(m2.group(2))
            apt_nums.append(m2.group(1))
            l = len(m2.group(1)) + len(m2.group(2))
#             if(l > 0):
#                 unit_nums, apt_nums = parse_alphanumeric(m2[-l:], unit_nums, apt_nums)
    
    
    # Additive 2EF, 11D12E, 1213A
    
    return unit_nums, apt_nums

def parse_spec_chars(apt, unit_nums, apt_nums):
    
    if('-' in apt):
        apt = apt.replace('-','')
        return parse_apartment_number(apt, unit_nums, apt_nums) 
    
    elif('/' in apt):
        split_apt = apt.split("/")
        
        # 1101/02/03 
        if len(split_apt[0]) > len(split_apt[1]):
            result = [split_apt[0]]
            for element in split_apt[1:]:
                result.append(split_apt[0][:-len(element)] + element)
            for item in result:
                apt_nums.append (item[len(split_apt[1]):])
                unit_nums.append(item[0:len(split_apt[1])])
        
        # 22/23A 2/3CD
        elif len(split_apt[1]) > len(split_apt[0]):
            result = []
            for element in split_apt[0:1]:
                result.append(element + split_apt[1][len(element):] )
                result.append(split_apt[1])
                for item in result:
                    apt_nums.append (item[len(split_apt[0]):])
                    unit_nums.append(item[0:len(split_apt[0])])
        
        # 2/3 22/C
        elif len(split_apt[1]) == len(split_apt[0]):
            unit_nums0, apt_nums0 = parse_apartment_number(split_apt[0], unit_nums, apt_nums)
            unit_nums1, apt_nums1 = parse_apartment_number(split_apt[1], unit_nums, apt_nums)
            unit_nums = unit_nums0 + unit_nums1
            apt_nums = apt_nums0 + apt_nums1
    
    return unit_nums, apt_nums

def parse_apartment_number(apt, unit_nums, apt_nums):
    
    if ('/' in apt or '-' in apt or ',' in apt):
        unit_nums, apt_nums = parse_spec_chars(apt, unit_nums, apt_nums)
    
    elif (apt.isdigit()):
        unit_nums, apt_nums = parse_digit(apt, unit_nums, apt_nums)

    elif (apt.isalnum()):
        apt = apt.upper()
        unit_nums, apt_nums = parse_alphanumeric(apt, unit_nums, apt_nums)

    if len(unit_nums) == len(apt_nums):
        return unit_nums, apt_nums
    
    return None, None

In [4]:
# MODIFYING DATAFRAMES - DENORMALIZATION

def generate_data_point(row_id, bldg_id, unit_nums, apt_nums, price, row_date, table_id):
    
    # Table_id : 1 for Listings
    # Table_id : 2 for Closings
    for i in range(len(apt_nums)):
        building_id = bldg_id
        unit = unit_nums[i]
        apt = apt_nums[i]
        price = price/len(apt_nums) # ASSUMPTION
        date = row_date

        if(table_id==1):
            # append to mod_listings
            mod_listings.append(np.array([price, date, building_id, unit, apt]))
            
        else:
            # append to mod_closings
            mod_closings.append(np.array([price, date, building_id, unit, apt]))

def modify_listings(listings):
    for i in range(len(listings)):
        try:
            unit_nums, apt_nums = parse_apartment_number(listings.iloc[i]['apartment_number'], [], [])
            generate_data_point(listings.iloc[i]['listing_id'], 
                                listings.iloc[i]['building_id'], 
                                unit_nums,
                                apt_nums,
                                listings.iloc[i]['list_price'], 
                                listings.iloc[i]['list_date'], 1)
        except:
            print(listings.iloc[i]['listing_id'], listings.iloc[i]['apartment_number'])
    return mod_listings
    
def modify_closings(closings):
    for i in range(len(closings)):
        try:
            unit_nums, apt_nums = parse_apartment_number(closings.iloc[i]['apartment_number'], [], [])
            generate_date_point(closings.iloc[i]['closing_id'], 
                                closings.iloc[i]['building_id'], 
                                unit_nums,
                                apt_nums,
                                closings.iloc[i]['close_price'], 
                                closings.iloc[i]['close_date'], 2)
        except:
            print(closings.iloc[i]['closing_id'], closings.iloc[i]['apartment_number'])
    return mod_closings

In [5]:
# FUNCTION MAIN

listings, closings = start()
# modify_listings(listings)

list_cols = ["list_price", "list_date", "building_id", "unit", "apartment"]
close_cols = ["close_price", "close_date", "building_id", "unit", "apartment"]


# new_listings = pd.DataFrame(mod_listings)
# new_listings.columns = list_cols
# print(new_listings.head())

   listing_id  building_id apartment_number  list_price  list_date
0           1            7               PH     2300000 2018-08-02
1           2            7               14     2295000 2018-03-26
2           3           14             PH-4      685000 2017-11-21
3           4           14             1206      560000 2015-04-11
4           5           14             1407      585000 2015-04-14
   closing_id  building_id apartment_number  close_price close_date
0           1            6             COMM     19000000 2016-04-15
1           2            6         COMM,RES     30150000 2014-10-27
2           3            6              RES     26750000 2017-07-20
3           4            6              RES     26750000 2017-07-20
4           5            6              RES     26750000 2017-07-20


In [7]:
mod_closings = modify_closings(closings[:20])
new_closings = pd.DataFrame(mod_closings)
print(new_closings.head())

1 COMM
2 COMM,RES
3  RES
4 RES
5 RES
6 FL2
7 5
8 503
9 1208
10 804
11 701
12 1405
13 1305
14 805
15 201
16 1002
17 301
18 807
19 507
20 506
Empty DataFrame
Columns: []
Index: []


In [None]:
print(len(new_listings))
print(new_closings)

In [None]:
unit_nums = []
apt_nums = []
parse_apartment_number("COMM", unit_nums, apt_nums)
parse_apartment_number("503", unit_nums, apt_nums)
parse_apartment_number("31C", unit_nums, apt_nums)
print(unit_nums)
print(apt_nums)

True