In [1]:
import numpy as np, scipy as sp,pandas as pd, matplotlib.pyplot as plt
import matplotlib, sklearn
import os,sys,csv
import util

In [2]:
# Column names in their original order.
# NOTE(review): presumably the header row of the raw training CSV — confirm.
# This list is not referenced by any other cell visible in this notebook.
origin_headers = ['date_time', 
           'site_name',
           'posa_continent',
           'user_location_country',
           'user_location_region',
           'user_location_city',
           'orig_destination_distance',
           'user_id',
           'is_mobile',
           'is_package',
           'channel',
           'srch_ci',
           'srch_co',
           'srch_adults_cnt',
           'srch_children_cnt',
           'srch_rm_cnt',
           'srch_destination_id',
           'srch_destination_type_id',
           'hotel_continent',
           'hotel_country',
           'hotel_market',
           'is_booking',
           'cnt',
           'hotel_cluster']

# Reordered column list: compared with origin_headers it adds 'in_date' and
# 'in_days', drops 'srch_ci' / 'srch_co', and groups the location columns.
# NOTE(review): presumably the layout after the time-feature preprocessing in
# util — confirm. Not referenced by any other cell visible in this notebook.
updated_headers = ['date_time',
           'in_date',
           'in_days',
           'site_name',
           'posa_continent',
           'user_location_country',
           'user_location_region',
           'user_location_city',
           'hotel_continent',
           'hotel_country',
           'hotel_market',
           'srch_destination_id',
           'orig_destination_distance',
           'user_id',
           'is_mobile',
           'is_package',
           'channel',
           'srch_adults_cnt',
           'srch_children_cnt',
           'srch_rm_cnt',
           'srch_destination_type_id',
           'is_booking',
           'cnt',
           'hotel_cluster']

In [3]:
def uniqify(seq, idfun=None):
    """Return the items of ``seq`` without duplicates, in first-seen order.

    Two items count as duplicates when ``idfun(item)`` yields the same
    (hashable) marker; with ``idfun=None`` the item itself is the marker.
    The first item carrying a given marker is the one that is kept.
    """
    marker_of = idfun if idfun is not None else (lambda item: item)
    known = set()
    kept = []
    for element in seq:
        tag = marker_of(element)
        if tag not in known:
            known.add(tag)
            kept.append(element)
    return kept

## Geospatial Analysis

In [4]:
import collections
import cPickle as pk

def _rank_by_frequency(values):
    """Return the distinct items of ``values``, most frequent first.

    Items with equal counts are ordered by their first appearance in
    ``values``. Only the distinct items are sorted, not the whole list.
    """
    counts = collections.Counter(values)
    first_pos = {}
    for pos, value in enumerate(values):
        first_pos.setdefault(value, pos)
    return sorted(counts, key=lambda value: (-counts[value], first_pos[value]))


def geo_stat(fpath, isBooking = True):
    """Rank hotel clusters by frequency per destination and per hotel country.

    Streams the CSV at ``fpath`` row by row. The first row must be a header
    containing the columns 'is_booking', 'srch_destination_id',
    'hotel_country' and 'hotel_cluster'.

    Args:
        fpath: path of the CSV file to read.
        isBooking: when true, only rows whose 'is_booking' field is the
            string '1' are counted; otherwise every row is counted.

    Returns:
        ``(dest_map, country_map)``: two dicts keyed by the integer
        'srch_destination_id' and 'hotel_country' respectively. Each value is
        the list of distinct integer 'hotel_cluster' ids seen for that key,
        most frequent first, ties ordered by first appearance in the file.

    Raises:
        ValueError: if the header lacks one of the required columns, or a
            counted row holds a non-integer id.
    """
    dest_map = {}
    country_map = {}

    with open(fpath, "r") as csvfile:
        reader = csv.reader(csvfile)
        header = next(reader)
        book_item = header.index('is_booking')
        destination_item = header.index('srch_destination_id')
        country_item = header.index('hotel_country')
        hotel_item = header.index('hotel_cluster')

        count = 0
        for line in reader:
            count += 1
            if count % 1000000 == 0:
                print("Processed: " + str(count))

            # csv.reader yields [] for a blank line; there is nothing to count
            if not line:
                continue

            if isBooking and line[book_item] != '1':
                continue

            hotel_cluster = int(line[hotel_item])
            dest_map.setdefault(int(line[destination_item]), []).append(hotel_cluster)
            country_map.setdefault(int(line[country_item]), []).append(hotel_cluster)

    print('IO Done. Now Processing...')

    # Replace each raw occurrence list by its frequency ranking. Iterating
    # over a snapshot of the keys keeps the reassignment safe.
    for key in list(dest_map):
        dest_map[key] = _rank_by_frequency(dest_map[key])

    for key in list(country_map):
        country_map[key] = _rank_by_frequency(country_map[key])

    print('Done')
    return dest_map, country_map


def save_geo_stat(dstpath, dst_map, cntry_map):
    """Pickle ``dst_map`` and then ``cntry_map`` back-to-back into ``dstpath``.

    The file is opened in binary write mode, so an existing file at
    ``dstpath`` is replaced. The two objects are stored as two consecutive
    pickles, in that order.
    """
    with open(dstpath, 'wb') as sink:
        for payload in (dst_map, cntry_map):
            pk.dump(payload, sink)
        
def load_geo_stat(srcpath):
    """Read two consecutive pickles from ``srcpath`` and return them as a pair.

    Returns:
        ``(dst_map, cntry_map)``: the first and the second object stored in
        the file, in that order.
    """
    # Unpickling can execute arbitrary code: only load files you produced yourself.
    with open(srcpath, 'rb') as source:
        first = pk.load(source)
        second = pk.load(source)
    return first, second
    

In [None]:
# Rank the hotel clusters by frequency for every destination and every hotel
# country over the full training file (isBooking=False counts all rows, not
# only bookings), then cache both maps on disk.
# NOTE(review): test_freqRanking loads '../data/geo_stat_booking.p', but this
# cell writes '../data/geo_stat_all.p' and no visible cell writes the booking
# variant.
full_trainpath = '../data/train.csv'
d_map, c_map = geo_stat(full_trainpath, False)
save_geo_stat('../data/geo_stat_all.p', d_map, c_map)

In [5]:

def csv_map_by_key(srcpath, dstdir, key='hotel_country'):
    """Append the rows of ``srcpath`` to one CSV per distinct ``key`` value.

    The source CSV is read in full and grouped by the ``key`` column. Each
    group is appended to ``<dstdir>/<key value>.csv``; ``dstdir`` is created
    when missing. A header row is written only when the target file does not
    exist yet, so repeated calls keep extending the same files. The frame
    index is written as the first column (``to_csv`` default).
    """
    if not os.path.exists(dstdir):
        os.makedirs(dstdir)

    frame = pd.read_csv(srcpath)
    for key_value, rows in frame.groupby(key):
        target = os.path.join(dstdir, str(key_value) + '.csv')
        # Decide before opening: opening in append mode creates the file
        write_header = not os.path.exists(target)
        with open(target, "a") as sink:
            rows.to_csv(sink, mode='a', header=write_header)
            
def loop_map_by_key(dstdir):
    """Run csv_map_by_key on '../data/train_0.csv' through '../data/train_7.csv'.

    Each chunk index is printed before its file is processed; every chunk is
    split into ``dstdir`` using csv_map_by_key's default key.
    """
    for part in range(8):
        print(part)
        chunk_path = '../data/train_' + str(part) + '.csv'
        csv_map_by_key(chunk_path, dstdir)
        

In [None]:
# Split the train_<i>.csv chunks into one CSV per hotel_country under
# ../data/byCountry.
# NOTE: csv_map_by_key appends to existing files, so re-running this cell
# without clearing the directory first duplicates the rows.
loop_map_by_key('../data/byCountry')

## Modeling & Ranking

In [6]:
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from sklearn.cross_validation import cross_val_score

In [7]:
import ml_metrics as mtr
def map_5_scorer(estimator, X, y):
    """Scorer callable for cross_val_score: MAP@5 of ``estimator`` on ``(X, y)``.

    Args:
        estimator: fitted classifier exposing ``predict_proba`` and ``classes_``.
        X: 2-D feature array (only ``X.shape[0]`` and ``predict_proba(X)`` are used).
        y: iterable with the true label of every row of ``X``.

    Returns:
        1 when ``X`` has no rows; otherwise ``mtr.mapk`` at k=5 between the
        true labels and, per row, the five classes with the highest
        predicted probability.
    """
    if X.shape[0] == 0:
        return 1
    prob = estimator.predict_proba(X)
    labels = np.array(estimator.classes_)

    # Stable sort of the negated probabilities: most probable class first,
    # equal probabilities keep their class order. Only the first five
    # columns are kept because mapk at k=5 looks no further.
    ranked = np.argsort(-prob, axis=1, kind='mergesort')[:, :5]
    y_pred = labels[ranked]

    # mapk expects a list of relevant labels per row
    y_true = [[label] for label in y]
    return mtr.mapk(y_true, y_pred, 5)

### 0. Recommend Most Popular

In [8]:
def test_freqRanking(datadir, isBooking = True):
    """Score the "recommend the most popular clusters" baseline with MAP@5.

    Loads the per-destination ranking from '../data/geo_stat_booking.p' and,
    for each existing ``<datadir>/train_<i>.csv`` (i = 0..7), recommends the
    first five clusters of that ranking to every row of the destination.

    Args:
        datadir: directory holding the ``train_<i>.csv`` chunks.
        isBooking: when true, only rows with ``is_booking == 1`` are scored.

    Returns:
        The average of the per-destination MAP@5 scores, weighted by the
        number of rows of each destination group.
    """
    dst_map, _ = load_geo_stat('../data/geo_stat_booking.p')

    scores = []
    freqs = []
    for i in range(8):
        datapath = os.path.join(datadir, "train_" + str(i) + '.csv')
        print("%s %s" % (i, datapath))
        if not os.path.exists(datapath):
            continue

        df = pd.read_csv(datapath)
        if isBooking:
            df = df[df.is_booking == 1]

        for dst, group in df.groupby('srch_destination_id'):
            y = [[cluster] for cluster in group['hotel_cluster'].values.tolist()]

            # Every row of the group receives the same recommendation. A
            # destination absent from the ranking gets an empty list, which
            # scores 0 instead of raising KeyError.
            top5 = dst_map.get(dst, [])[:5]
            y_pred = [top5] * len(y)

            scores.append(mtr.mapk(y, y_pred, 5))
            freqs.append(len(group))

            print("%s %s %s %s" % (i, dst, scores[-1], freqs[-1]))

    return np.average(scores, weights=freqs)
        

In [None]:
# Weighted MAP@5 of the most-popular baseline, scored on booking rows only.
print test_freqRanking('../data/', isBooking =True)


### 1. RandomForest

In [9]:
def cross_validation(df, feaTitles, split = 5, n_est = 10):
    """Cross-validate a median-imputer + random-forest pipeline on ``df``.

    Args:
        df: DataFrame holding the ``feaTitles`` columns and a
            'hotel_cluster' label column.
        feaTitles: list of feature column names.
        split: number of cross-validation folds.
        n_est: number of trees; ``0`` or ``None`` means one tree per feature.

    Returns:
        ``(score, count)``: the mean MAP@5 over the folds and ``len(df)``.
        Groups with at most ``split * split`` rows are not evaluated and
        yield ``(1, 0)``; callers that weight by the count thereby ignore
        them.
    """
    # .values exists on every pandas version; as_matrix() was removed in pandas 1.0
    X = df[feaTitles].values
    y = df.hotel_cluster.values

    if X.shape[0] <= split*split:
        return 1, 0

    if n_est is None or n_est == 0:
        n_est = len(feaTitles)

    # Zeros are treated as missing and replaced by the column median before
    # the forest is fitted.
    estimator = Pipeline([("imputer", Imputer(missing_values=0,
                                              strategy="median",
                                              axis=0)),
                          ("forest", RandomForestClassifier(n_estimators=n_est,
                                                            n_jobs=3))])
    score = cross_val_score(estimator, X, y, cv=split, scoring=map_5_scorer)
    return score.mean(), len(df)

In [10]:
def test_model(datadir, feaTitles, isBooking = False, split = 3, dstfile = None):
    """Cross-validate the random-forest model per destination over country files.

    For every existing ``<datadir>/<i>.csv`` (i = 0..212) the rows are
    preprocessed with util.time_feature_processing and
    util.nan_feature_processing, grouped by 'srch_destination_id', and each
    group is scored with cross_validation.

    Args:
        datadir: directory holding the per-country CSV files.
        feaTitles: feature column names passed on to cross_validation.
        isBooking: when true, only rows with ``is_booking == 1`` are used.
        split: number of cross-validation folds.
        dstfile: optional CSV path; one row ``[i, destination, score, count]``
            is appended per destination group.

    Returns:
        The average of the per-group scores, weighted by the group counts
        reported by cross_validation.
    """
    scores = []
    freqs = []

    # Binary append mode is what the Python 2 csv module expects.
    log_fp = open(dstfile, 'ab') if dstfile is not None else None
    writer = csv.writer(log_fp) if log_fp is not None else None

    try:
        for i in range(213):
            print(i)
            datapath = os.path.join(datadir, str(i) + '.csv')
            if not os.path.exists(datapath):
                continue

            df = pd.read_csv(datapath)

            if isBooking:
                df = df[df.is_booking == 1]

            # NOTE(review): both helpers live in util and are not visible
            # here; they are presumed to modify df in place — confirm.
            util.time_feature_processing(df, True)
            util.nan_feature_processing(df, 'orig_destination_distance')

            for dst, group in df.groupby('srch_destination_id'):
                score, count = cross_validation(group, feaTitles, split)
                scores.append(score)
                freqs.append(count)

                print("%s %s %s" % (dst, score, count))
                if writer is not None:
                    writer.writerow([str(i), str(dst), str(score), str(count)])
    finally:
        # Close the log even when the run is interrupted or a group fails
        if log_fp is not None:
            log_fp.close()

    return np.average(scores, weights=freqs)
            

In [None]:
#sample = pd.read_csv('../data/booking_train.csv')

In [11]:
# Feature columns handed to the random forest. The label 'hotel_cluster' and
# the grouping key 'srch_destination_id' are not part of this list.
# NOTE(review): 'in_date' / 'in_days' are not columns of origin_headers;
# presumably util.time_feature_processing creates them — confirm.
fea_header = ['date_time',
           'in_date',
           'in_days',
           'site_name',
           'posa_continent',
           'user_location_country',
           'user_location_region',
           'user_location_city',
           'orig_destination_distance',
           'user_id',
           'is_mobile',
           'is_package',
           'channel',
           'srch_adults_cnt',
           'srch_children_cnt',
           'srch_rm_cnt',
           'srch_destination_type_id']
# Per-destination cross-validation over the per-country files, bookings only.
test_model('../data/byCountry', feaTitles=fea_header,isBooking=True)

0


KeyboardInterrupt: 

In [None]:
# Scratch check: load one per-country file, run the NaN handling on the
# distance column and print the frame.
# NOTE(review): nan_feature_processing is defined in util and not visible
# here; it is presumed to modify df in place — confirm.
df = pd.read_csv('../data/byCountry/113.csv')
util.nan_feature_processing(df, 'orig_destination_distance')
print df

In [None]:
# NOTE(review): `sample` is only defined by a commented-out read_csv in an
# earlier cell, so this cell fails on a fresh kernel.
len(sample)

In [None]:
# Scratch: show the rows of `sample` for destination 31682.
# NOTE(review): `sample` is only defined by a commented-out read_csv in an
# earlier cell, so this cell fails on a fresh kernel.
g = sample.groupby('srch_destination_id')
g.get_group(31682)

In [None]:
# Scratch: the even numbers 0, 2, ..., 8 as a plain list.
x = list(range(0,10,2))

In [None]:
# Scratch: turn the list from the previous cell into an ndarray and display
# it. Rebinding `x` means the cell depends on that earlier cell having run.
x = np.array(x)
x 

In [12]:
def rand_sample(df, ratio=0.3, seed=None):
    """Randomly split ``df`` into a sampled part and the remainder.

    Each row is assigned to the sample independently with probability
    ``ratio``, so the sample size is only approximately
    ``ratio * len(df)``.

    Args:
        df: DataFrame to split.
        ratio: probability of a row landing in the first returned frame.
        seed: optional seed for a private random generator, giving a
            reproducible split. With ``None`` the global ``np.random``
            state is used.

    Returns:
        ``(sampled, rest)``: the selected rows and all remaining rows.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)
    msk = rng.rand(len(df)) < ratio
    return df[msk], df[~msk]

In [13]:
def split_data(datadir, lst, traindir, testfile, sr = 0.2):
    """Hold out a random share of the bookings of each per-country file.

    For every ``i`` in ``lst`` whose ``<datadir>/<i>.csv`` exists, each
    booking row (``is_booking == 1``) is held out with probability ``sr``.
    The held-out rows are appended to ``testfile``; the remaining bookings
    plus all non-booking rows are written to ``<traindir>/<i>.csv``,
    replacing any previous file.

    Args:
        datadir: directory holding the per-country CSV files.
        lst: iterable of file stems to process.
        traindir: output directory for the training files; created when
            missing.
        testfile: CSV that collects the held-out rows of all processed
            files. Its header is written only when the file does not exist
            yet.
        sr: probability of a booking row being held out.

    Prints ``i`` and then one line ``<rows read> <held out> <rows kept>``
    for every processed file.
    """
    if traindir and not os.path.exists(traindir):
        os.makedirs(traindir)

    for i in lst:
        print(i)
        datapath = os.path.join(datadir, str(i) + '.csv')
        if not os.path.exists(datapath):
            continue

        df = pd.read_csv(datapath)
        n_total = len(df)

        is_booked = df.is_booking == 1
        test, booking = rand_sample(df[is_booked], ratio=sr)
        df = pd.concat([booking, df[~is_booked]])
        print("%d %d %d" % (n_total, len(test), len(df)))

        # Checked before opening: append mode would create the file
        skipHeader = os.path.exists(testfile)
        with open(testfile, "a") as csvfile:
            test.to_csv(csvfile, header=(not skipHeader))

        trainpath = os.path.join(traindir, str(i) + '.csv')
        with open(trainpath, "w") as csvfile:
            df.to_csv(csvfile)

In [14]:
# Hold out bookings of country file 50 (each with the default probability
# sr=0.2) into test.csv and write the remaining rows to byCountry/train/50.csv.
# NOTE(review): the saved output below lists files 2-49, so it was produced
# with a different `lst` than the [50] shown here.
split_data('../data/byCountry', [50], '../data/byCountry/train', '../data/byCountry/test.csv')

2
4839 103 4736
3
7817 141 7676
4
11610 267 11343
5
325064 2748 322316
6
497 19 478
7
90774 1610 89164
8
1914010 16123 1897887
9
7687 123 7564
10
1812 12 1800
11
38289 1005 37284
12
47530 766 46764
13
72083 1237 70846
14
1539 15 1524
15
65654 1376 64278
16
206 2 204
17
73605 1304 72301
18
922 25 897
19
2275 63 2212
20
5502 119 5383
21
81445 1729 79716
22
273081 3291 269790
23
1767 32 1735
24
2235 59 2176
25
19478 459 19019
26
13877 63 13814
27
2938 66 2872
28
1852 33 1819
29
1089 16 1073
30
524 11 513
31
135237 2533 132704
32
58406 388 58018
33
265 9 256
34
138754 2869 135885
35
16338 100 16238
36
18076 190 17886
37
263 7 256
38
2939 88 2851
39
588 17 571
40
50 0 50
41
27 0 27
42
4719 51 4668
43
1383 40 1343
44
2751 34 2717
45
25413 536 24877
46
144343 3071 141272
47
210009 1198 208811
48
374351 4537 369814
49
100 1 99


In [None]:
def built_model(datadir, feaTitles, isBooking = False, sr = 0.2, dstfile = None, split = 3):
    """Hold out a share of the bookings per file, then cross-validate on the rest.

    NOTE(review): the hold-out step follows split_data and the scoring loop
    follows test_model; confirm that this is the intended behaviour of this
    function.

    For every existing ``<datadir>/<i>.csv`` (i = 0..212):
      1. when ``sr > 0``, each booking row is held out with probability
         ``sr`` and, if ``dstfile`` is given, the held-out rows are appended
         to it (header only when the file does not exist yet);
      2. the remaining rows are preprocessed with util and cross-validated
         per 'srch_destination_id'.

    Args:
        datadir: directory holding the per-country CSV files.
        feaTitles: feature column names passed on to cross_validation.
        isBooking: when true, only the remaining booking rows are used for
            cross-validation; otherwise non-booking rows are included too.
        sr: probability of a booking row being held out; ``<= 0`` disables
            the hold-out.
        dstfile: optional CSV path collecting the held-out rows.
        split: number of cross-validation folds.

    Returns:
        The average of the per-group scores, weighted by the group counts
        reported by cross_validation.
    """
    scores = []
    freqs = []

    for i in range(213):
        print(i)
        datapath = os.path.join(datadir, str(i) + '.csv')
        if not os.path.exists(datapath):
            continue

        df = pd.read_csv(datapath)
        is_booked = df.is_booking == 1
        booking = df[is_booked]

        if sr > 0:
            test, booking = rand_sample(booking, ratio=sr)
            if dstfile is not None:
                # Checked before opening: append mode would create the file
                skipHeader = os.path.exists(dstfile)
                with open(dstfile, "a") as csvfile:
                    test.to_csv(csvfile, header=(not skipHeader))

        # Held-out rows must not take part in the cross-validation
        if isBooking:
            df = booking
        else:
            df = pd.concat([booking, df[~is_booked]])

        # NOTE(review): both helpers live in util and are not visible here;
        # they are presumed to modify df in place — confirm.
        util.time_feature_processing(df, True)
        util.nan_feature_processing(df, 'orig_destination_distance')

        for dst, group in df.groupby('srch_destination_id'):
            score, count = cross_validation(group, feaTitles, split)
            scores.append(score)
            freqs.append(count)

            print("%s %s %s" % (dst, score, count))

    return np.average(scores, weights=freqs)
 