In [29]:
import numpy as np, scipy as sp,pandas as pd, matplotlib.pyplot as plt
import matplotlib, sklearn
import os,sys,csv
import util

In [1]:
# Column names in their original order (presumably the header of the raw
# training CSV -- confirm against the data file).
origin_headers = [
    'date_time', 'site_name', 'posa_continent',
    'user_location_country', 'user_location_region', 'user_location_city',
    'orig_destination_distance', 'user_id',
    'is_mobile', 'is_package', 'channel',
    'srch_ci', 'srch_co',
    'srch_adults_cnt', 'srch_children_cnt', 'srch_rm_cnt',
    'srch_destination_id', 'srch_destination_type_id',
    'hotel_continent', 'hotel_country', 'hotel_market',
    'is_booking', 'cnt', 'hotel_cluster',
]

# Reordered column list: compared with origin_headers it drops
# 'srch_ci'/'srch_co', adds 'in_date'/'in_days', and moves the hotel_* columns
# and 'srch_destination_id' ahead of 'orig_destination_distance'.
updated_headers = [
    'date_time', 'in_date', 'in_days',
    'site_name', 'posa_continent',
    'user_location_country', 'user_location_region', 'user_location_city',
    'hotel_continent', 'hotel_country', 'hotel_market',
    'srch_destination_id', 'orig_destination_distance', 'user_id',
    'is_mobile', 'is_package', 'channel',
    'srch_adults_cnt', 'srch_children_cnt', 'srch_rm_cnt',
    'srch_destination_type_id',
    'is_booking', 'cnt', 'hotel_cluster',
]

In [3]:
def uniqify(seq, idfun=None):
    """Return the items of ``seq`` without duplicates, preserving order.

    Two items count as duplicates when ``idfun`` maps them to the same marker;
    only the first item for each marker is kept. When ``idfun`` is None the
    item itself is the marker. Markers must be hashable.
    """
    marker_of = idfun if idfun is not None else (lambda x: x)
    emitted = set()
    kept = []
    for element in seq:
        tag = marker_of(element)
        if tag not in emitted:
            emitted.add(tag)
            kept.append(element)
    return kept

## Geospatial Analysis

In [4]:
import collections
import cPickle as pk

def _tally_cluster(table, key, cluster):
    """Record one occurrence of ``cluster`` under ``key`` in ``table``."""
    per_key = table.setdefault(key, {})
    entry = per_key.get(cluster)
    if entry is None:
        # [occurrences, first-seen rank]; the rank is used to break ties.
        per_key[cluster] = [1, len(per_key)]
    else:
        entry[0] += 1


def _rank_clusters(per_key):
    """Return clusters by descending count, ties in order of first appearance."""
    return sorted(per_key, key=lambda c: (-per_key[c][0], per_key[c][1]))


def geo_stat(fpath, isBooking = True):
    """Rank hotel clusters by frequency per destination and per hotel country.

    Reads the CSV at ``fpath`` (it must have a header row containing
    'is_booking', 'srch_destination_id', 'hotel_country' and 'hotel_cluster')
    and counts how often each hotel cluster occurs for every destination id
    and every hotel country id.

    Parameters
    ----------
    fpath : str
        Path of the CSV file to scan.
    isBooking : bool
        When true, only rows whose 'is_booking' field is the string '1' are
        counted; otherwise every row is counted.

    Returns
    -------
    (dest_map, country_map) : tuple of dict
        Each maps an int id to the list of distinct int hotel clusters seen
        for that id, most frequent first. Clusters with equal counts keep the
        order in which they first appeared in the file.

    Raises
    ------
    StopIteration if the file has no header row; ValueError if a required
    column is missing from the header or an id field is not an integer.
    """
    dest_counts = {}
    country_counts = {}

    with open(fpath, "r") as csvfile:
        reader = csv.reader(csvfile)
        header = next(reader)
        book_item = header.index('is_booking')
        destination_item = header.index('srch_destination_id')
        country_item = header.index('hotel_country')
        hotel_item = header.index('hotel_cluster')

        for count, line in enumerate(reader, 1):
            # Progress is reported on rows read, including filtered-out rows.
            if count % 1000000 == 0:
                print("Processed: " + str(count))

            if isBooking and line[book_item] != '1':
                continue

            destination_id = int(line[destination_item])
            country_id = int(line[country_item])
            hotel_cluster = int(line[hotel_item])

            # Keep running counts instead of one list entry per row, so
            # memory grows with the number of distinct (id, cluster) pairs
            # rather than with the number of rows in the file.
            _tally_cluster(dest_counts, destination_id, hotel_cluster)
            _tally_cluster(country_counts, country_id, hotel_cluster)

    print('IO Done. Now Processing...')

    dest_map = dict((k, _rank_clusters(v)) for k, v in dest_counts.items())
    country_map = dict((k, _rank_clusters(v)) for k, v in country_counts.items())

    print('Done')
    return dest_map, country_map


def save_geo_stat(dstpath, dst_map, cntry_map):
    """Pickle ``dst_map`` and then ``cntry_map`` into the file at ``dstpath``.

    The two objects are written as consecutive pickles in one binary file,
    destination map first, so they must be read back in that order.
    """
    with open(dstpath, 'wb') as sink:
        for mapping in (dst_map, cntry_map):
            pk.dump(mapping, sink)
        
def load_geo_stat(srcpath):
    """Read back the two consecutive pickles stored in the file at ``srcpath``.

    Returns a ``(dst_map, cntry_map)`` tuple: the first and second pickled
    objects in the file, in that order.
    """
    # NOTE(review): unpickling can execute arbitrary code embedded in the
    # file -- only load files this project wrote itself.
    with open(srcpath, 'rb') as source:
        return pk.load(source), pk.load(source)
    

In [None]:
# Rank hotel clusters by frequency for every destination id and every hotel
# country over the full training file. Passing False for isBooking counts all
# rows, not only bookings. Both maps are then saved to a single pickle file.
full_trainpath = '../data/train.csv'
d_map, c_map = geo_stat(full_trainpath, False)
save_geo_stat('../data/geo_stat_all.p', d_map, c_map)

In [6]:

def csv_map_by_key(srcpath, dstdir, key='hotel_country'):
    """Split the CSV at ``srcpath`` into one CSV per distinct value of ``key``.

    Each group of rows is appended to ``<dstdir>/<value>.csv`` (``dstdir`` is
    created when missing). The header row is written only when the target
    file does not exist yet, so repeated calls keep extending the same files.
    The DataFrame index is written as the first, unnamed column.
    """
    if not os.path.exists(dstdir):
        os.makedirs(dstdir)

    frame = pd.read_csv(srcpath)
    for value, rows in frame.groupby(key):
        out_name = str(value) + '.csv'
        target = os.path.join(dstdir, out_name)
        # Decide before opening: opening in append mode creates the file.
        write_header = not os.path.exists(target)
        with open(target, "a") as out:
            rows.to_csv(out, mode='a', header=write_header)
            
def loop_map_by_key(dstdir, n_parts=8, src_pattern='../data/train_%d.csv'):
    """Run ``csv_map_by_key`` over a numbered series of CSV chunks.

    Parameters
    ----------
    dstdir : str
        Directory that receives the per-key CSV files.
    n_parts : int
        Number of chunks to process; chunk indices run from 0 to n_parts - 1.
    src_pattern : str
        %-style template that is formatted with the chunk index to build each
        source path. The defaults reproduce the original hard-coded behaviour
        of reading '../data/train_0.csv' through '../data/train_7.csv'.

    NOTE: csv_map_by_key appends to existing files, so running this again on
    a populated ``dstdir`` (for example after an interrupted run) duplicates
    rows. Clear ``dstdir`` first in that case.
    """
    for i in range(n_parts):
        # Progress indicator: index of the chunk about to be processed.
        print(i)
        csv_map_by_key(src_pattern % i, dstdir)
        

In [7]:
# Split the training chunks into one CSV file per hotel country (the default
# grouping key of csv_map_by_key), written under ../data/byCountry.
loop_map_by_key('../data/byCountry')

0


KeyboardInterrupt: 

## Modeling & Ranking

In [16]:
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from sklearn.cross_validation import cross_val_score

### 1. RandomForest

In [39]:
def cross_validation(df, feaTitles, split = 5, n_est = 10):
    """Cross-validate an imputer + random-forest pipeline on ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns named in ``feaTitles`` and a
        ``hotel_cluster`` column, which is used as the target.
    feaTitles : list of str
        Names of the feature columns.
    split : int
        Passed to ``cross_val_score`` as ``cv``.
    n_est : int or None
        Number of trees in the forest. 0 or None means one tree per feature
        column (``len(feaTitles)``).

    Returns
    -------
    (float, int)
        Mean of the cross-validation scores and the number of rows in ``df``.
    """
    # Imported up front so a missing scorer fails before any work is done.
    from util import map_5_scorer

    # `.values` is available on every pandas version; `as_matrix()` was
    # deprecated and then removed in pandas 1.0.
    X = df[feaTitles].values
    y = df.hotel_cluster.values

    if n_est is None or n_est == 0:
        n_est = len(feaTitles)

    # NOTE(review): missing_values=0 makes the imputer replace every literal 0
    # with the column median, which also affects genuine zeros (e.g. 0/1 flag
    # columns) -- confirm this is intended rather than imputing NaN.
    estimator = Pipeline([("imputer", Imputer(missing_values=0,
                                          strategy="median",
                                          axis=0)),
                      ("forest", RandomForestClassifier(n_estimators=n_est,
                                                       n_jobs=3))])
    score = cross_val_score(estimator, X, y, cv=split, scoring=map_5_scorer)
    return score.mean(), len(df)

In [40]:
def test_model(datadir, feaTitles, isBooking = False, split = 5):
    """Score the model per destination across all per-country CSV files.

    For every ``<datadir>/<i>.csv`` with i in 0..212 that exists, the file is
    loaded, optionally restricted to booking rows, run through the ``util``
    feature-processing helpers, and then cross-validated separately for each
    ``srch_destination_id`` group.

    Parameters
    ----------
    datadir : str
        Directory holding the per-country CSV files.
    feaTitles : list of str
        Feature column names handed to ``cross_validation``.
    isBooking : bool
        When true, only rows with ``is_booking == 1`` are used.
    split : int
        Number of cross-validation folds handed to ``cross_validation``.

    Returns
    -------
    float
        Average of the per-destination scores, weighted by the number of rows
        in each destination group, over ALL country files.

    Raises
    ------
    ZeroDivisionError (from ``np.average``) when no group was scored.
    """
    scores = []
    freqs = []
    # NOTE(review): 213 is presumably the number of hotel_country ids --
    # confirm against the data.
    for i in range(213):
        print(i)
        datapath = os.path.join(datadir, str(i) + '.csv')
        if not os.path.exists(datapath):
            continue

        df = pd.read_csv(datapath)

        if isBooking:
            # Copy the filtered rows: the util helpers below appear to modify
            # the frame in place (their return values are unused), which
            # should not happen on a slice of the original frame.
            df = df[df.is_booking == 1].copy()

        util.time_feature_processing(df, True)
        util.nan_feature_processing(df, 'orig_destination_distance')

        for dst, group in df.groupby('srch_destination_id'):
            score, count = cross_validation(group, feaTitles, split)
            scores.append(score)
            freqs.append(count)

    # Fix: this return used to sit inside the country loop, so the function
    # stopped after the first country file that existed.
    return np.average(scores, weights=freqs)
            

In [17]:
# Load ../data/booking_train.csv into `sample` for the exploration cells below
# (presumably the booking-only rows of the training set -- confirm).
sample = pd.read_csv('../data/booking_train.csv')

In [41]:
# Feature columns handed to the model: updated_headers without the hotel_*
# columns, 'srch_destination_id', 'is_booking', 'cnt' and the 'hotel_cluster'
# target.
fea_header = ['date_time',
           'in_date',
           'in_days',
           'site_name',
           'posa_continent',
           'user_location_country',
           'user_location_region',
           'user_location_city',
           'orig_destination_distance',
           'user_id',
           'is_mobile',
           'is_package',
           'channel',
           'srch_adults_cnt',
           'srch_children_cnt',
           'srch_rm_cnt',
           'srch_destination_type_id']
# Evaluate on booking rows only. The dangling `util.` that followed this call
# was an incomplete expression (a syntax error) and has been removed.
test_model('../data/byCountry', feaTitles=fea_header,isBooking=True)

0


ImportError: cannot import name map_5_scorer

In [19]:
# Number of distinct srch_destination_id values in `sample`.
len(sample.srch_destination_id.unique())

36933

In [21]:
# Number of rows in `sample`.
len(sample)

3000693

In [25]:
# Show the rows of `sample` whose hotel_cluster equals 5.
sample[sample['hotel_cluster']== 5]

Unnamed: 0,date_time,site_name,posa_continent,user_location_country,user_location_region,user_location_city,orig_destination_distance,user_id,is_mobile,is_package,...,srch_children_cnt,srch_rm_cnt,srch_destination_id,srch_destination_type_id,is_booking,cnt,hotel_continent,hotel_country,hotel_market,hotel_cluster
199,2013-09-14 00:00:59,24,2,3,50,5703,,14117,1,0,...,0,1,23894,6,1,1,6,77,1882,5
301,2014-02-07 02:40:04,24,2,231,68,42296,,22730,0,0,...,0,1,19940,6,1,1,3,99,88,5
330,2014-10-23 20:21:41,11,3,205,385,46709,,26246,1,0,...,0,1,20925,6,1,1,6,204,1774,5
389,2014-06-13 17:59:33,2,3,77,824,25620,,30850,0,0,...,0,1,23237,1,1,1,3,106,756,5
490,2014-12-29 15:08:05,25,2,23,48,4924,,35370,0,0,...,1,1,23237,1,1,1,3,106,756,5
497,2013-10-11 09:28:34,2,3,66,442,35390,9282.0341,36305,0,0,...,0,1,23386,6,1,1,3,162,1490,5
629,2014-08-12 12:22:43,2,3,66,174,11816,,47886,0,0,...,0,1,14041,6,1,1,6,105,35,5
799,2013-05-31 14:55:23,34,3,205,354,25315,5797.7663,55953,0,0,...,0,1,8247,1,1,1,3,0,1500,5
800,2013-05-31 15:01:25,34,3,205,354,25315,5797.7663,55953,0,0,...,0,1,8247,1,1,1,3,0,1500,5
812,2014-06-18 17:46:28,2,3,66,258,35380,44.2142,56336,1,0,...,0,1,14964,1,1,1,2,50,441,5
