# Processing and deduplicating Features

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from collections import defaultdict, Counter

In [57]:
def _normalize_features(raw_features):
    """Lowercase each feature, strip surrounding whitespace and join its
    space-separated words with underscores."""
    return ["_".join(name.strip().lower().split(" ")) for name in raw_features]


train_df = pd.read_json("../input/train.json")
test_df = pd.read_json("../input/test.json")
train_df["features"] = train_df["features"].apply(_normalize_features)
test_df["features"] = test_df["features"].apply(_normalize_features)
# `axis` must be passed by keyword: the positional form was deprecated in
# pandas 1.3 and raises a TypeError from pandas 2.0 onwards.
train_test = pd.concat([train_df, test_df], axis=0)

In [58]:
# Preview the first two rows of the combined train + test frame.
train_test.head(2)

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,interest_level,latitude,listing_id,longitude,manager_id,photos,price,street_address
10,1.5,3,53a5b119ba8f7b61d4e010512e0dfc85,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,[],medium,40.7145,7211212,-73.9425,5ba989232d0489da1b5f2c45f6688adc,[https://photos.renthop.com/2/7211212_1ed4542e...,3000,792 Metropolitan Avenue
10000,1.0,2,c5c8a357cba207596b04d1afd1e4f130,2016-06-12 12:19:27,,Columbus Avenue,"[doorman, elevator, fitness_center, cats_allow...",low,40.7947,7150865,-73.9667,7533621a882f71e25173b27e3139d83d,[https://photos.renthop.com/2/7150865_be3306c5...,5465,808 Columbus Avenue


I'm going to assume you've taken a look at the data and noticed that some features are duplicates and that some are also rare (they occur very few times).

That being said, let's try to drop rare features and also deduplicate similar features, using the first k characters of each cleaned feature as a hash key.

First we read in the data, lowercase every feature, strip surrounding whitespace, and replace the spaces inside each feature with underscores.

In [25]:
# features = train_test[["features"]].apply(
#     lambda _: [list(map(str.strip, map(str.lower, x))) for x in _])

In [59]:
# features = train_test[["features"]].apply(
#     lambda x: ["_".join(str(i).strip().lower().split(" ")) for i in x])

In [61]:
# Keep only the `features` column, as a one-column DataFrame.
features = train_test.loc[:, ["features"]]
features.head(5)

Unnamed: 0,features
10,[]
10000,"[doorman, elevator, fitness_center, cats_allow..."
100004,"[laundry_in_building, dishwasher, hardwood_flo..."
100007,"[hardwood_floors, no_fee]"
100013,[pre-war]


In [72]:
# Count how often each feature occurs and keep only the features that
# occur more than n times.

n = 5

feature_counts = Counter()
for listing_features in features["features"]:
    feature_counts.update(listing_features)

# Build a real sorted list rather than a lazy `map` object: it is sliced
# below, measured with len() and iterated several times further down.
# The entries stay `str` (no .encode) so that str.replace works on them.
feature = sorted(name for name, count in feature_counts.items() if count > n)
feature[:10]

['$250_security_deposit',
 '1_month_free',
 '24-hour_doorman',
 '24/7_concierge',
 '24/7_doorman',
 '24/7_doorman_concierge',
 '24_hour_concierge/doorman',
 '24_hour_doorman',
 '24hr_doorman',
 '2_full_baths']


Notice the several variations of `24-hour`; we will see later that these are not the only duplicated features.

## Hashing cleaned up data using the first 4 characters

We will first do some manual work to simplify the strings and use the first four characters as a key for each feature. Then we will use that key to dedupe the data.

In [73]:
# Ordered (pattern, replacement) rewrite rules applied by clean().
# Order matters:
#   * "-" and " " are stripped first, so every later pattern is written
#     without those separators ("twentyfourhour", "ssappliances");
#   * "decorative" must run before its prefix "deco", otherwise it would be
#     turned into "dcrative" and never match.
_CLEAN_RULES = (
    ("-", ""),
    (" ", ""),
    ("twentyfourhour", "24"),
    ("24/7", "24"),
    ("24hr", "24"),
    ("24hour", "24"),
    ("common", "cm"),
    ("concierge", "doorman"),
    ("bicycle", "bike"),
    ("private", "pv"),
    ("decorative", "dc"),
    ("deco", "dc"),
    ("onsite", "os"),
    ("outdoor", "od"),
    ("ssappliances", "stainless"),
)


def clean(x):
    """Simplify a feature string so that spelling variants share a prefix.

    Hyphens and spaces are removed, the various spellings of "24 hour" are
    collapsed to "24", and a few common words are replaced by a short
    canonical form (e.g. "concierge" -> "doorman", "bicycle" -> "bike").

    Parameters
    ----------
    x : str
        The feature text to simplify.

    Returns
    -------
    str
        The simplified feature text.

    NOTE: underscores are left untouched, so underscore-separated spellings
    such as "24_hour" are not rewritten by these rules.
    """
    for pattern, replacement in _CLEAN_RULES:
        x = x.replace(pattern, replacement)
    return x

def feature_hash(x, k=4):
    """Return the deduplication key for a feature string.

    The key is the first ``k`` characters of ``clean(x)`` with surrounding
    whitespace stripped.

    Parameters
    ----------
    x : str
        The feature text to hash.
    k : int, optional
        Number of leading characters of the cleaned text to keep
        (default 4).

    Returns
    -------
    str
        The key shared by features considered duplicates of each other.
    """
    cleaned = clean(x)
    return cleaned[:k].strip()

In [74]:
# Number of leading characters of the cleaned feature used as the key.
k = 4

# Group every original feature under the key derived from its cleaned text.
key2original = defaultdict(list)
for original in feature:
    prefix = clean(original)[:k].strip()
    key2original[prefix].append(original)

Let's take a look at the dedups! Don't worry about the key itself; just look at which values end up under the same key.

In [75]:
# Show the full mapping from each key to the original features grouped under it.
key2original

defaultdict(list,
            {'$250': ['$250_security_deposit'],
             '1_mo': ['1_month_free'],
             '24_d': ['24-hour_doorman',
              '24/7_concierge',
              '24/7_doorman',
              '24/7_doorman_concierge',
              '24hr_doorman'],
             '24_h': ['24_hour_concierge/doorman', '24_hour_doorman'],
             '2_fu': ['2_full_baths'],
             'a_fu': ['a_full_service_luxury_highrise'],
             'actu': ['actual_apt._photos', 'actual_photos!'],
             'air_': ['air_conditioning'],
             'all_': ['all_pets_ok', 'all_utilities_included'],
             'assi': ['assigned-parking-space'],
             'atte': ['attended_lobby'],
             'back': ['backyard'],
             'balc': ['balcony'],
             'base': ['basement_storage'],
             'bask': ['basketball_court'],
             'bike': ['bicycle_room', 'bike_room', 'bike_storage'],
             'bill': ['billiards',
              'billiards_room',
    

In [36]:
# Report how many groups remain after deduplication versus the original count.
for label, collection in (
    ("number of deduped features:", key2original),
    ("number of old features:", feature),
):
    print(label, len(collection))

To make this easier to use, I'll output this as a CSV mapping each original feature to its deduplicated string.

In [45]:
def to_tuples():
    """Yield ``(original, representative)`` pairs for every kept feature.

    The representative is the first feature stored in ``key2original`` under
    the key computed from the original feature.
    """
    for original in feature:
        group = key2original[clean(original)[:k].strip()]
        yield original, group[0]


deduped = [pair for pair in to_tuples()]
df = pd.DataFrame(data=deduped, columns=["original_feature", "unique_feature"])

In [46]:
# Preview the original -> representative feature mapping.
df.head()

In [47]:
# Write the mapping to a CSV file, omitting the DataFrame index column.
df.to_csv("feature_deduplication.csv", index=False)