# Data cleaning
---

This script takes a .csv file as input and cleans the data frame by
- Removing unwanted features
- Converting long strings into multiple dummy variables
- Adding new features such as longitude and latitude from addresses

## Import libraries and data

In [1]:
import pandas as pd
import numpy as np
import googlemaps
import geocoder
import math, time

In [2]:
# load the raw rental listings; the first CSV column is used as the index
raw_csv_path = "data/ny_rental_data.r4.csv"
df = pd.read_csv(raw_csv_path, index_col=0)

## Drop unwanted columns

In [3]:
# columns that are not used as features
# "school" is dropped for now: the school information could be redundant,
# as location should already take it into account
unwanted_columns = ["deposit", "name", "unit", "school"]
df.drop(columns=unwanted_columns, inplace=True)

## Set constants

In [4]:
# set limits of sqft
# sqft values below this are treated as too small and blanked out
sqft_min = 100
# threshold for sqft / ((bedrooms + 1) * rent); rows above it are treated as
# having a too-large sqft and blanked out
sqft_factor_min = 0.5

## Borough

In [5]:
# clean up and capitalize: cut the last three characters, turn hyphens into
# spaces and title-case the result
df["borough"] = df["borough"].map(
    lambda raw: raw[:-3].replace('-', ' ').title()
)

## Bedroom and bathroom

In [6]:
# select digits and return
def digit_filter(string):
    """Concatenate every digit character of ``str(string)`` into one int.

    All non-digit characters (letters, punctuation, whitespace, decimal
    points) are discarded, so ``"1,200"`` becomes ``1200``.  Returns ``0``
    when the text contains no digit at all.
    """
    digits = [char for char in str(string) if char.isdigit()]
    if not digits:
        return 0
    return int(''.join(digits))

In [7]:
# apply digit_filter to remove any strings
# NOTE(review): digit_filter joins all digits it finds, so a raw value such as
# "1.5" would become 15 - confirm the raw columns never hold fractions
for room_column in ("bathrooms", "bedrooms"):
    df[room_column] = df[room_column].apply(digit_filter)

## Lease length

In [8]:
# find max integer
def find_max(string):
    """Return the largest whitespace-separated integer token in *string*.

    The input is converted with ``str()`` and split on whitespace; only
    tokens made up entirely of digits are considered.  Returns ``None`` when
    there is no such token.

    The tokens are compared as integers.  Comparing the digit strings
    directly would be lexicographic and would rank ``"6"`` above ``"12"``.
    """
    numbers = [int(token) for token in str(string).split() if token.isdigit()]

    if numbers:
        return max(numbers)
    return None

In [9]:
# replace leaseLength with maximum digit
df["leaseLength"] = df["leaseLength"].apply(find_max)

# fill Null entries with the average
# (the result is assigned back: fillna(inplace=True) on a column selection is
# chained assignment, which pandas deprecates and which leaves df unchanged
# under Copy-on-Write)
df["leaseLength"] = df["leaseLength"].fillna(df["leaseLength"].mean())

# convert float to int
df["leaseLength"] = df["leaseLength"].astype(int)

## Pet policy

In [10]:
# function to apply to pet_policy column
def std_pet_policy(pet):
    """Translate a raw pet-policy string into standard pet labels.

    ``pet`` must be a string; square brackets and single quotes are removed
    and the remainder is split on commas (presumably the text form of a
    Python list - confirm against the raw data).  Every key of the
    module-level ``dic_pet`` is then looked for as a substring of the
    lower-cased entries.

    Returns the ``dic_pet`` values of all keys that matched, in the
    iteration order of ``dic_pet``.
    """
    # strip the bracket and quote characters in a single pass
    cleaned = pet.translate(str.maketrans('', '', "[]'"))
    entries = [entry.lower() for entry in cleaned.split(",")]

    return [
        dic_pet[keyword]
        for keyword in dic_pet
        if any(keyword in entry for entry in entries)
    ]

In [11]:
# define list of pets to check
# NOTE(review): eval() executes whatever expression the file contains, so
# data/pet must be a trusted file
with open('data/pet', 'r') as pet_file:
    dic_pet = eval(pet_file.read())

# standardise every row with "std_pet_policy", then turn the resulting lists
# into prefixed dummy columns
df["pet_policy"] = df["pet_policy"].map(std_pet_policy)
df_dummy = (
    df["pet_policy"]
    .str.join('*')
    .str.get_dummies('*')
    .add_prefix("pet_")
)

# attach the dummies and remove the pet column they were built from
df = pd.concat([df, df_dummy], axis=1).drop(columns="pet_policy")

## Extract built year and number of units

In [12]:
def built_year(string):
    """Pull the construction year out of a multi-line property description.

    ``str(string)`` is split on newlines; each line containing "built in"
    (case-insensitive) is reduced to a number with ``digit_filter``.
    Returns the number from the first such line, or ``None`` when no line
    matches.
    """
    matching_lines = [
        line for line in str(string).split('\n') if "built in" in line.lower()
    ]
    years = list(map(digit_filter, matching_lines))

    return years[0] if years else None

In [13]:
def property_size(string):
    """Pull the number of units out of a property description.

    Slashes are treated like newlines, the text is split on newlines, and
    each segment containing "units" (case-insensitive) is reduced to a number
    with ``digit_filter``.  Returns the number from the first such segment,
    or ``None`` when no segment matches.
    """
    segments = str(string).replace('/', '\n').split('\n')
    unit_counts = [
        digit_filter(segment) for segment in segments if "units" in segment.lower()
    ]

    if not unit_counts:
        return None
    return unit_counts[0]

In [14]:
# create new column containing built year
df["built_year"] = df["property_info"].apply(built_year)

# fill null values with the median built year
# (the result is assigned back: fillna(inplace=True) on a column selection is
# chained assignment, which pandas deprecates and which leaves df unchanged
# under Copy-on-Write)
df["built_year"] = df["built_year"].fillna(df["built_year"].median())

# convert float to int
df["built_year"] = df["built_year"].astype(int)

# create new column containing property size
df["property_size"] = df["property_info"].apply(property_size)

# fill null values with the median property_size (assigned back, as above)
df["property_size"] = df["property_size"].fillna(df["property_size"].median())

# convert float to int
df["property_size"] = df["property_size"].astype(int)

# remove property_info after extracting desired info
df.drop("property_info", axis=1, inplace=True)

## Property rating

In [15]:
# extract rating number
df["rating"] = df["rating"].apply(digit_filter)

# replace no rating (digit_filter returned 0) with NaN
df["rating"] = df["rating"].apply(lambda x: None if x == 0 else x)

# replace NaN with the average rating
# (the result is assigned back: fillna(inplace=True) on a column selection is
# chained assignment, which pandas deprecates and which leaves df unchanged
# under Copy-on-Write)
df["rating"] = df["rating"].fillna(df["rating"].mean())

# convert to int
df["rating"] = df["rating"].astype(int)

## Rent

In [16]:
# find the mean rent of a property
def find_mean(string):
    """Return the mean of the numbers found in *string*, or ``None``.

    ``str(string)`` is split on whitespace and each token is reduced to an
    integer with ``digit_filter`` (which yields 0 for a token without
    digits).  A mean is only reported when the first token carries a number;
    zero-valued tokens are left out of the average.

    Returns ``None`` when the text is empty or whitespace-only, or when its
    first token contains no digits.
    """
    values = [digit_filter(token) for token in str(string).split()]

    # empty or whitespace-only text has no first token: report "no value"
    # instead of raising IndexError on values[0]
    if not values or values[0] == 0:
        return None

    # tokens without digits are not part of the average
    numbers = [value for value in values if value != 0]
    return sum(numbers) / len(numbers)

In [17]:
# reduce every rent entry to a single number with find_mean
df["rent"] = df["rent"].map(find_mean)

# rows for which no rent could be extracted are removed
df.dropna(axis=0, subset=["rent"], inplace=True)

## Extract address and title from rental_title

In [18]:
def extract_city(string):
    """Return the second-to-last comma-separated field, stripped and lower-cased.

    Raises ``IndexError`` when *string* contains no comma.
    """
    fields = string.split(",")
    return fields[-2].strip().lower()

In [19]:
def extract_state(string):
    """Return the text after the last comma, stripped of surrounding whitespace.

    When *string* has no comma the whole (stripped) string is returned.
    """
    _, _, tail = string.rpartition(",")
    return tail.strip()

In [20]:
def extract_street(string):
    """Return the text before the first comma, stripped of surrounding whitespace.

    When *string* has no comma the whole (stripped) string is returned.
    """
    return string.partition(",")[0].strip()

In [21]:
def extract_title(string):
    """Return the first comma-separated field of *string*, stripped.

    When *string* has no comma the whole (stripped) string is returned.
    """
    head, _, _ = string.partition(",")
    return head.strip()

In [22]:
# change city to lower case; anything that is not a string becomes missing
df["city"] = df["city"].map(
    lambda value: value.lower() if isinstance(value, str) else None
)

# extract city, state and street from rental_title and use them to fill the
# gaps in the existing "city", "state" and "street_address" columns
title_extractors = (
    ("city", extract_city),
    ("state", extract_state),
    ("street_address", extract_street),
)
for target_column, extractor in title_extractors:
    parsed = df["rental_title"].apply(extractor)
    df[target_column] = df[target_column].combine_first(parsed)

# finally replace rental_title itself with the title part only
df["rental_title"] = df["rental_title"].apply(extract_title)

## Sqft

In [23]:
# reduce every sqft entry to a single number with "find_mean"
df["sqft"] = df["sqft"].map(find_mean)

# replace un-realistic sqft with None
sqft_per_rent = df["sqft"] / (df["rent"] * (df["bedrooms"] + 1))
df.loc[sqft_per_rent > sqft_factor_min, "sqft"] = None  # sqft too large
df.loc[df["sqft"] < sqft_min, "sqft"] = None  # sqft too small

# fill null with average sqft of apartments with the same number of bedrooms
bedroom_mean_sqft = df.groupby("bedrooms")["sqft"].transform("mean")
df["sqft"] = df["sqft"].fillna(bedroom_mean_sqft)

## Amenities

In [24]:
# function to apply to amenity column
def std_amenity(amenity):
    """Translate a raw amenity string into standard amenity labels.

    ``amenity`` must be a string; square brackets and single quotes are
    removed and the remainder is split on commas (presumably the text form of
    a Python list - confirm against the raw data).  Every key of the
    module-level ``dic_amenity`` is then looked for as a substring of the
    lower-cased entries.

    Returns the ``dic_amenity`` values of all keys that matched, in the
    iteration order of ``dic_amenity``.
    """
    # strip the bracket and quote characters in a single pass
    cleaned = amenity.translate(str.maketrans('', '', "[]'"))
    entries = [entry.lower() for entry in cleaned.split(",")]

    return [
        dic_amenity[keyword]
        for keyword in dic_amenity
        if any(keyword in entry for entry in entries)
    ]

In [25]:
# define list of amenities to check
# NOTE(review): eval() executes whatever expression the file contains, so
# data/amenities must be a trusted file
with open('data/amenities', 'r') as amenity_file:
    dic_amenity = eval(amenity_file.read())

# standardise every row with "std_amenity", then turn the resulting lists
# into prefixed dummy columns
df["amenity"] = df["amenity"].map(std_amenity)
df_dummy = (
    df["amenity"]
    .str.join('*')
    .str.get_dummies('*')
    .add_prefix("amenity_")
)

# attach the dummy columns to the main DataFrame
df = pd.concat([df, df_dummy], axis=1)

In [26]:
# the raw amenity column is no longer needed: it has been converted to
# dummies, and anything not converted is not meant to be counted
df = df.drop(columns=["amenity"])

## Full address

In [27]:
# Create full address column: "street, city, state" for every row
address_parts = df[["street_address", "city", "state"]]
df["full_address"] = address_parts.apply(", ".join, axis=1)

## Remove duplicates

In [28]:
# rows sharing address, bedroom count and bathroom count are duplicates;
# only the first occurrence of each combination is kept
duplicate_key = ["full_address", "bedrooms", "bathrooms"]
df = df.drop_duplicates(subset=duplicate_key, keep="first")

## Longitude and latitude

In [29]:
def geocoder_request(address):
    """Geocode *address* with the Google provider of the ``geocoder`` package.

    The provider result is returned untouched.  To try a different provider,
    swap the call for e.g. ``geocoder.mapquest(address)``.
    """
    response = geocoder.google(address)
    return response

In [None]:
import os

# setup google maps API
# The key is read from the GOOGLE_MAPS_API_KEY environment variable instead of
# being written into the notebook, so it is not published with the source.
# NOTE(review): the key that used to be hard-coded here should be treated as
# leaked and revoked.
gmaps = googlemaps.Client(key=os.environ["GOOGLE_MAPS_API_KEY"])

In [None]:
longitude = []
latitude = []

# seconds to wait after an OVER_QUERY_LIMIT response before asking again
# (180 s = 3 minutes)
over_limit_pause = 180

# coordinates already resolved, keyed by address, so that an address shared by
# several rows costs only one API request
resolved = {}

# loop over all addresses, get longitude, latitude
for address in df["full_address"]:

    # reuse an earlier successful answer for the same address
    if address in resolved:
        known_lng, known_lat = resolved[address]
        longitude.append(known_lng)
        latitude.append(known_lat)
        continue

    # request geocode
    geocode = geocoder_request(address)

    # if we are over the API limit, pause and then ask once more; without the
    # second request this address would always end up in the failure branch
    if geocode.status == "OVER_QUERY_LIMIT":
        time.sleep(over_limit_pause)
        geocode = geocoder_request(address)

    # append good longitude and latitude
    if geocode.status == "OK":
        resolved[address] = (geocode.lng, geocode.lat)
        longitude.append(geocode.lng)
        latitude.append(geocode.lat)

    # if status code is not okay, print address
    else:
        print("Unknown status from %s" % address)

        # place holder
        longitude.append("")
        latitude.append("")


df["longitude"] = longitude
df["latitude"] = latitude

Unknown status from 555 10th Ave, new york, NY
Unknown status from 555 10th Ave, new york, NY
Unknown status from 555 10th Ave, new york, NY
Unknown status from 555 10th Ave, new york, NY
Unknown status from 100 W 31st St, new york, NY
Unknown status from 100 W 31st St, new york, NY
Unknown status from 625 W 57th St, new york, NY
Unknown status from 625 W 57th St, new york, NY
Unknown status from 625 W 57th St, new york, NY
Unknown status from 435 W 31st St, new york, NY
Unknown status from 435 W 31st St, new york, NY
Unknown status from 435 W 31st St, new york, NY
Unknown status from 435 W 31st St, new york, NY
Unknown status from 155 W 68th St, new york, NY
Unknown status from 155 W 68th St, new york, NY
Unknown status from 155 W 68th St, new york, NY
Unknown status from 225 E 95th St, new york, NY
Unknown status from 225 E 95th St, new york, NY
Unknown status from 225 E 95th St, new york, NY
Unknown status from 225 E 95th St, new york, NY
Unknown status from 225 E 95th St, new york,

## Save DataFrame as csv

In [None]:
# show the first five rows as a last check before writing the csv
print(df.head(n=5))

In [None]:
# write the cleaned DataFrame (index included) to csv
cleaned_csv_path = "data/ny_rental_data_cleaned.csv"
df.to_csv(cleaned_csv_path)