In [1]:
import cPickle as pickle
import pandas as pd

In [2]:
# Load the pre-processed listings DataFrame from its pickle file.
# NOTE: unpickling can execute arbitrary code -- only load pickles you created yourself.
with open('df_amenities_neighbourhood_all_processed.pkl', 'rb') as pkl_file:
    df = pickle.load(pkl_file)

<code>
To run the Apriori algorithm in Orange, we need to reorganize the data
into 'market basket' format: one transaction per line, with its items separated by commas. Example:
Bread, Milk
Bread, Diapers, Beer, Eggs
Milk, Diapers, Beer, Cola
Bread, Milk, Diapers, Beer
Bread, Milk, Diapers, Cola

Ref: http://docs.orange.biolab.si/2/reference/rst/Orange.associate.html
</code>

In [3]:
# Only the in-house amenity columns (binary 0/1 flags) are needed for basket analysis.
# Subset the data to include only these columns.
cols = [u'Elevator in building', u'Internet', u'Family/kid friendly', u'Wireless Internet',
        u'Buzzer/wireless intercom', u'Kitchen', u'Doorman', u'Wheelchair accessible',
        u'Cable TV', u'Hot tub', u'Gym', u'Pool', u'TV', u'Dryer', u'Washer', u'Essentials', u'Shampoo',
        u'Heating', u'Air conditioning', u'Pets allowed', u'Suitable for events',
        u'Smoking allowed', u'Indoor fireplace', u'Breakfast', u'Laptop friendly workspace', u'Iron', u'Hangers',
        u'Hair dryer', u'Private living room', u'Private entrance']
# `.ix` is deprecated (and removed in pandas 1.0); `.loc` is the explicit
# label-based equivalent for selecting columns by name.
df = df.loc[:, cols]

In [4]:
# Take the row labelled 0 as a sample record and display it.
record = df.loc[0, :]
record

Elevator in building         0
Internet                     1
Family/kid friendly          1
Wireless Internet            1
Buzzer/wireless intercom     0
Kitchen                      1
Doorman                      0
Wheelchair accessible        0
Cable TV                     0
Hot tub                      0
Gym                          0
Pool                         0
TV                           0
Dryer                        1
Washer                       1
Essentials                   1
Shampoo                      1
Heating                      1
Air conditioning             1
Pets allowed                 0
Suitable for events          0
Smoking allowed              0
Indoor fireplace             0
Breakfast                    0
Laptop friendly workspace    1
Iron                         1
Hangers                      1
Hair dryer                   1
Private living room          0
Private entrance             0
Name: 0, dtype: int64

In [5]:
# Keep only the amenities that are present (flag == 1) in the sample record
# and show their names as one comma-separated string.
present = record[record == 1]
', '.join(present.index)

u'Internet, Family/kid friendly, Wireless Internet, Kitchen, Dryer, Washer, Essentials, Shampoo, Heating, Air conditioning, Laptop friendly workspace, Iron, Hangers, Hair dryer'

In [6]:
# Build the basket text: one line per record, listing the names of the
# amenities that are present (value == 1), comma-separated.
basket_lines = []
# Iterate rows positionally so this works for any index, not just labels 0..n-1
# (the index may be non-contiguous after filtering or re-ordering).
for _, record in df.iterrows():
    basket_lines.append(', '.join(record[record == 1].index))
# Join once at the end instead of repeatedly appending to a growing string.
amenities_string = ''.join(line + '\n' for line in basket_lines)

In [7]:
# Preview the first 1000 characters of the basket text.
print(amenities_string[:1000])

Internet, Family/kid friendly, Wireless Internet, Kitchen, Dryer, Washer, Essentials, Shampoo, Heating, Air conditioning, Laptop friendly workspace, Iron, Hangers, Hair dryer
Family/kid friendly, Wireless Internet, Kitchen, Gym, Pool, TV, Dryer, Washer, Essentials, Shampoo, Heating, Air conditioning, Indoor fireplace, Laptop friendly workspace, Hangers
Elevator in building, Internet, Family/kid friendly, Wireless Internet, Kitchen, Wheelchair accessible, Dryer, Washer, Essentials, Shampoo, Heating, Laptop friendly workspace, Iron, Hangers
Internet, Family/kid friendly, Wireless Internet, Kitchen, Cable TV, Dryer, Washer, Essentials, Shampoo, Heating, Air conditioning, Laptop friendly workspace, Iron, Hangers, Hair dryer
Internet, Wireless Internet, Kitchen, Cable TV, Gym, TV, Dryer, Washer, Essentials, Shampoo, Heating, Air conditioning, Indoor fireplace, Laptop friendly workspace, Iron, Hangers, Hair dryer
Wireless Internet, Kitchen, TV, Dryer, Washer, Essentials, Shampoo, Heating, Ai

In [8]:
# Save the basket text as a .basket file (the format Orange reads).
with open('association-mining/airbnb-amenities.basket', 'w') as basket_file:
    basket_file.write(amenities_string)

### Association Rule Mining in Orange

In [1]:
import Orange

In [3]:
# Load the basket file and mine the frequent itemsets.
data = Orange.data.Table("association-mining/airbnb-amenities.basket")

# Minimum support an itemset must reach to be reported.
s_threshold = 0.5
ind = Orange.associate.AssociationRulesSparseInducer(
    support=s_threshold,
    storeExamples=True,
)
itemsets = ind.get_itemsets(data)

print("Number of Frequent Itemsets that satisfy the support threshold of (%4.2f) is %d" % (s_threshold, len(itemsets)))

Number of Frequent Itemsets that satisfy the support threshold of (0.50) is 578


In [11]:
# Show the first 15 itemsets with their support
# (size of the itemset's tids list divided by the number of records).
n_records = float(len(data))
for itemset, tids in itemsets[:15]:
    names = ", ".join(data.domain[item].name for item in itemset)
    print("(%4.2f) %s" % (len(tids) / n_records, names))

(0.69) TV
(0.60) TV, Air conditioning
(0.59) TV, Air conditioning, Heating
(0.54) TV, Air conditioning, Heating, Essentials
(0.53) TV, Air conditioning, Heating, Essentials, Kitchen
(0.52) TV, Air conditioning, Heating, Essentials, Kitchen, Wireless Internet
(0.53) TV, Air conditioning, Heating, Essentials, Wireless Internet
(0.51) TV, Air conditioning, Heating, Washer
(0.51) TV, Air conditioning, Heating, Washer, Dryer
(0.50) TV, Air conditioning, Heating, Washer, Dryer, Kitchen
(0.50) TV, Air conditioning, Heating, Washer, Dryer, Wireless Internet
(0.51) TV, Air conditioning, Heating, Washer, Kitchen
(0.50) TV, Air conditioning, Heating, Washer, Kitchen, Wireless Internet
(0.51) TV, Air conditioning, Heating, Washer, Wireless Internet
(0.51) TV, Air conditioning, Heating, Dryer


In [12]:
# Induce association rules that meet both the support and confidence thresholds.
conf_threshold = 0.9
rules = Orange.associate.AssociationRulesSparseInducer(
    data,
    support=s_threshold,
    confidence=conf_threshold,
)

print("Number of Rules that satisfy the support threshold of (%4.2f) and confidence threshold of (%4.2f) is %d" % (s_threshold, conf_threshold, len(rules)))

# Tabulate the first 15 rules.
print("%4s %4s  %s" % ("Supp", "Conf", "Rule"))
for rule in rules[:15]:
    print("%4.1f %4.1f  %s" % (rule.support, rule.confidence, rule))

Number of Rules that satisfy the support threshold of (0.50) and confidence threshold of (0.90) is 386
Supp Conf  Rule
 0.7  1.0  TV -> Heating
 0.7  1.0  TV -> Heating Kitchen
 0.7  1.0  TV Heating -> Kitchen
 0.7  1.0  TV Kitchen -> Heating
 0.7  0.9  TV -> Heating Kitchen Wireless Internet
 0.7  1.0  TV Heating -> Kitchen Wireless Internet
 0.7  1.0  TV Heating Kitchen -> Wireless Internet
 0.7  1.0  TV Heating Wireless Internet -> Kitchen
 0.7  1.0  TV Kitchen -> Heating Wireless Internet
 0.7  1.0  TV Kitchen Wireless Internet -> Heating
 0.7  1.0  TV Wireless Internet -> Heating Kitchen
 0.7  1.0  TV -> Heating Wireless Internet
 0.7  1.0  TV Heating -> Wireless Internet
 0.7  1.0  TV Wireless Internet -> Heating
 0.6  0.9  TV -> Essentials


In [13]:
# Serialise every frequent itemset and every rule to text and save each as a txt file.

# Frequent itemsets: a header line, then one "(support) <tab> items" line per itemset.
itemset_lines = []
for itemset, tids in itemsets:
    support = len(tids) / float(len(data))
    names = ", ".join(data.domain[item].name for item in itemset)
    itemset_lines.append("(%4.2f) \t %s" % (support, names) + '\n')
itemsets_str = "Support \t Itemset \n" + ''.join(itemset_lines)

with open('association-mining/frequent_itemsets_airbnb_amenities.txt', 'w') as itemsets_file:
    itemsets_file.write(itemsets_str)

# Rules: a header line, then one "support confidence  rule" line per rule.
# NOTE(review): the header is comma-separated while the rows are space-separated.
rule_lines = ["%4.1f %4.1f  %s" % (rule.support, rule.confidence, rule) + '\n'
              for rule in rules]
rules_str = "Supp, Conf, Rule \n" + ''.join(rule_lines)

with open('association-mining/rules_airbnb_amenities.txt', 'w') as rules_file:
    rules_file.write(rules_str)
