In [1]:
import numpy as np, scipy as sp,pandas as pd, matplotlib.pyplot as plt
import matplotlib, sklearn
import os,sys,csv
import util

In [2]:
origin_headers = ['date_time', 
           'site_name',
           'posa_continent',
           'user_location_country',
           'user_location_region',
           'user_location_city',
           'orig_destination_distance',
           'user_id',
           'is_mobile',
           'is_package',
           'channel',
           'srch_ci',
           'srch_co',
           'srch_adults_cnt',
           'srch_children_cnt',
           'srch_rm_cnt',
           'srch_destination_id',
           'srch_destination_type_id',
           'hotel_continent',
           'hotel_country',
           'hotel_market',
           'is_booking',
           'cnt',
           'hotel_cluster']

updated_headers = ['date_time',
           'in_date',
           'in_days',
           'site_name',
           'posa_continent',
           'user_location_country',
           'user_location_region',
           'user_location_city',
           'hotel_continent',
           'hotel_country',
           'hotel_market',
           'srch_destination_id',
           'orig_destination_distance',
           'user_id',
           'is_mobile',
           'is_package',
           'channel',
           'srch_adults_cnt',
           'srch_children_cnt',
           'srch_rm_cnt',
           'srch_destination_type_id',
           'is_booking',
           'cnt',
           'hotel_cluster']

In [3]:
def uniqify(seq, idfun=None): 
    # order preserving
    if idfun is None:
        def idfun(x): return x
    seen = {}
    result = []
    for item in seq:
        marker = idfun(item)
        if marker in seen: 
            continue
        seen[marker] = 1
        result.append(item)
    return result

## Geospatial Analysis

In [4]:
import collections
import cPickle as pk

def geo_stat(fpath, isBooking = True):
    fname, ext = os.path.splitext(fpath)
    with open(fpath, "r") as csvfile:
        reader = csv.reader(csvfile)
        header = reader.next()
        book_item = header.index('is_booking')
        destination_item = header.index('srch_destination_id')
        country_item = header.index('hotel_country')
        hotel_item = header.index('hotel_cluster')
        
        dest_map = {}
        country_map = {}
        
        count = 0
        for line in reader:
            count += 1
            if count%1000000 == 0:
                print "Processed:", str(count)
            
            if isBooking and line[book_item] != '1':
                continue
                
            destination_id = int(line[destination_item])
            country_id = int(line[country_item])
            hotel_cluster = int(line[hotel_item])
            
            value1 = dest_map.setdefault(destination_id, [])
            value1.append(hotel_cluster)
            
            value2 = country_map.setdefault(country_id, [])
            value2.append(hotel_cluster)          
            
    print 'IO Done. Now Processing...'
    
    for k, v in dest_map.iteritems():
        v.sort(key=collections.Counter(v).get, reverse=True)
        dest_map[k] = uniqify(v)
    
    for k, v in country_map.iteritems():
        v.sort(key=collections.Counter(v).get, reverse=True)
        country_map[k] = uniqify(v)
        
    print 'Done'
    return dest_map, country_map


def save_geo_stat(dstpath, dst_map, cntry_map):
    with open(dstpath,'wb') as fp:
        pk.dump(dst_map ,fp)
        pk.dump(cntry_map,fp)
        
def load_geo_stat(srcpath):
    with open(srcpath,'rb') as fp:
        dst_map = pk.load(fp)
        cntry_map = pk.load(fp)
        return dst_map, cntry_map
    

In [None]:
# Get the list of hotel sorted by frequency in different destination region
full_trainpath = '../data/train.csv'
d_map, c_map = geo_stat(full_trainpath, False)
save_geo_stat('../data/geo_stat_all.p', d_map, c_map)

In [5]:

def csv_map_by_key(srcpath, dstdir, key='hotel_country'):
    if not os.path.exists(dstdir):
        os.makedirs(dstdir)
        
    data = pd.read_csv(srcpath)
    for name, group in data.groupby(key):
        filepath = os.path.join(dstdir, str(name) + '.csv')
        skipHeader = os.path.exists(filepath) 
        with open(filepath, "a") as csvfile:
            group.to_csv(csvfile, mode='a', header=(not skipHeader))
            
def loop_map_by_key(dstdir):
    #fname, ext = os.path.splitext(fpath)
    for i in range(8):
        print i
        datapath = '../data/train_' + str(i) + '.csv'
        csv_map_by_key(datapath, dstdir)
        

In [None]:
# Map and save data according to country
loop_map_by_key('../data/byCountry')

## Modeling & Ranking

In [6]:
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from sklearn.cross_validation import cross_val_score

In [7]:
import ml_metrics as mtr
def map_5_scorer(estimator, X, y):
    if X.shape[0] == 0:
        return 1
    prob = estimator.predict_proba(X)
    labels = np.array(estimator.classes_)
    
    def top5(prob):
        indice = sorted(range(len(prob)), key=lambda k: prob[k], reverse=True)
        return labels[indice].tolist()
    
    y = map(lambda x:[x], y)
    y_pred = np.apply_along_axis(top5, axis=1, arr=prob)
    return mtr.mapk(y, y_pred, 5) 

### 0. Recommend Most Popular

In [8]:
def test_freqRanking(datadir, isBooking = True):
    dst_map, cntry_map = load_geo_stat('../data/geo_stat_booking.p')
    
    scores = []
    freqs = []
    for i in range(8):
        
        datapath = os.path.join( datadir, "train_" + str(i) + '.csv')
        print i, datapath
        if not os.path.exists(datapath):
            continue
        if not os.path.exists(datapath):
            continue
        
        df = pd.read_csv(datapath)
        
        if isBooking:
            df = df[df.is_booking == 1]
        
        for  dst, group in df.groupby('srch_destination_id'):
            
            y = group['hotel_cluster'].values.tolist()
            y = map(lambda x:[x], y)
            y_pred = [dst_map[dst][:5] for k in range(len(group))]
            
            assert(len(y) == len(y_pred))
            
            scores.append(mtr.mapk(y, y_pred, 5) )
            freqs.append(len(group))
            
            print i, dst, scores[-1], freqs[-1]
        
    return np.average(scores, weights=freqs)
        

In [None]:
print test_freqRanking('../data/', isBooking =True)


### 1. RandomForest

In [9]:
def cross_validation(df, feaTitles, split = 5, n_est = 10):
    
    X = df[feaTitles].as_matrix()
    y = df.hotel_cluster.as_matrix()
    
    if X.shape[0] <= split*split:
        return 1, 0
        
    if n_est == 0 or n_est == None:
        n_est = len(feaTitles)
    estimator = Pipeline([("imputer", Imputer(missing_values=0,
                                          strategy="median",
                                          axis=0)),
                      ("forest", RandomForestClassifier(n_estimators=n_est,
                                                       n_jobs=3))])
    #from util import map_5_scorer
    score = cross_val_score(estimator, X, y, cv=split, scoring=map_5_scorer)
    return score.mean(), len(df)

In [10]:
def test_model(datadir, feaTitles, isBooking = False, split = 3, dstfile = None):
    scores = []
    freqs = []
    
    if dstfile!=None:
        writer = csv.writer(open(dstfile, 'ab'))
    else:
        writer= None
        
    for i in range(213):
        print i
        datapath = os.path.join( datadir,str(i) + '.csv')
        if not os.path.exists(datapath):
            continue
        
        df = pd.read_csv(datapath)
        
        if isBooking:
            df = df[df.is_booking == 1]        
        
        util.time_feature_processing(df, True)
        util.nan_feature_processing(df,'orig_destination_distance')
        
        for  dst, group in df.groupby('srch_destination_id'):
                
                score, count = cross_validation(group, feaTitles, split)
                scores.append(score)
                freqs.append(count)
                
                print dst, score, count
                if writer!= None:
                    writer.writerow([str(i), str(dst), str(score), str(count)])
                    
    return np.average(scores, weights=freqs)
            

In [None]:
#sample = pd.read_csv('../data/booking_train.csv')
fea_header = ['date_time',
           'in_date',
           'in_days',
           'site_name',
           'posa_continent',
           'user_location_country',
           'user_location_region',
           'user_location_city',
           'orig_destination_distance',
           'user_id',
           'is_mobile',
           'is_package',
           'channel',
           'srch_adults_cnt',
           'srch_children_cnt',
           'srch_rm_cnt',
           'srch_destination_type_id']
#test_model('../data/byCountry', feaTitles=fea_header,isBooking=True)

### Cross Validation Data Sampling

In [12]:
def rand_sample(df, ratio=0.3):
    msk = np.random.rand(len(df)) < ratio
    return df[msk], df[~msk]

In [13]:
def split_data(datadir, lst, traindir, testfile, sr = 0.2):
    
    for i in lst:
        print i
        datapath = os.path.join( datadir,str(i) + '.csv')
        if not os.path.exists(datapath):
            continue
        
        
        df = pd.read_csv(datapath)
        print len(df),
        booking = df[df.is_booking == 1] 
        test, booking = rand_sample(booking, ratio=sr)
        df = pd.concat([booking, df[df.is_booking!=1]])
        print len(test), len(df)
        
        skipHeader = os.path.exists(testfile) 
        with open(testfile, "a") as csvfile:
            test.to_csv(csvfile, mode='a', header=(not skipHeader))
            
        trainpath = os.path.join(traindir,str(i) + '.csv')
        with open(trainpath, "w") as csvfile:
            df.to_csv(csvfile)

In [14]:
def sample_big_file(datafile, trainfile, testfile, sr = 0.2):

    with open(datafile, "r") as csvfile:
        reader = csv.reader(csvfile)
        header = reader.next()
        idx = header.index('is_booking')
        
        writer1 = csv.writer(open(testfile, 'ab'))
        writer2 = csv.writer(open(trainfile, 'wb'))
        writer2.writerow(header)
        count = 0
        for line in reader:
            if line[idx]=='1' and np.random.rand(1) <= sr:
                writer1.writerow([''] + line)
            else:
                writer2.writerow(line)
            count += 1
            if count%1000000 == 0:
                print "Progress:", str(count)

In [15]:
#split_data('../data/byCountry', range(51,213), '../data/byCountry/train', '../data/byCountry/test.csv')
#sample_big_file('../data/byCountry/50.csv', '../data/byCountry/train/50.csv', '../data/byCountry/test.csv')

In [16]:
from sklearn.externals import joblib
def train_model(datadir, feaTitles, lst, modeldir, isBooking = False, startGroup=None):
    if not os.path.exists(modeldir):
        os.makedirs(modeldir)
    
    for i in lst:
        print i
        datapath = os.path.join( datadir,str(i) + '.csv')
        if not os.path.exists(datapath):
            continue
        
        
        df = pd.read_csv(datapath)
        
        util.time_feature_processing(df, True)
        util.nan_feature_processing(df,'orig_destination_distance')
        
        
        skip = True
        for  dst, group in df.groupby('srch_destination_id'):
            print i, dst,
            
            if dst == startGroup:
                    skip = False
            if startGroup!=None and skip:
                    continue       
            
            X = group[feaTitles].as_matrix()
            y = group.hotel_cluster.as_matrix()
            
            weight = np.array(group['is_booking'].tolist())
            if not isBooking:
                weight = 3 * weight + 1
                
            clf = RandomForestClassifier(n_estimators=10, n_jobs=3)
            clf.fit(X, y, weight)
            modelfile = os.path.join(modeldir, str(dst)+'.pkl')
            print modelfile,
            joblib.dump(clf, modelfile)
            
            print "Done"
            

### 3. Hybrid Method

In [18]:
def predict_rank(testfile, modeldir1, modeldir2, feaTitles, logfile=None, startGroup=None):
    df = pd.read_csv(testfile)
    dst_map, cntry_map = load_geo_stat('../data/geo_stat_booking.p')
    #dst_map2, cntry_map2 = load_geo_stat('../data/geo_stat_booking.p')
    print "Loading Done"

    #util.time_feature_processing(df, True)
    #df.to_csv('processed_test.csv')
    #util.nan_feature_processing(df,'orig_destination_distance')
    #df.to_csv('processed_test.csv')
    #print "Preprocessing Finished"
    
    
    #truth = np.empty((0,1))
    #predict = np.empty((0,5))
    truth = []
    predict = []
    
    if logfile!= None:
        writer = csv.writer(open(logfile, 'ab'))
    else:
        writer= None
    
    skip = True
    for  dst, group in df.groupby('srch_destination_id'):
        print dst,
        
        if dst == startGroup:
            skip = False
        if startGroup!=None and skip:
            continue   
        
        X = group[feaTitles].as_matrix()
        y = group.hotel_cluster.as_matrix()
        y = map(lambda k:[k], y)

        
        modelfile1 = os.path.join(modeldir1, str(dst)+'.pkl')
        modelfile2 = os.path.join(modeldir2, str(dst)+'.pkl')
        if os.path.exists(modelfile1):
            clf = joblib.load(modelfile1)
        elif os.path.exists(modelfile2):
            clf = joblib.load(modelfile2)
        else:
            clf = None
        
        if clf !=None:
            prob = clf.predict_proba(X)
            labels = np.array(clf.classes_)
            
            def top5(prob):
                indice = sorted(range(len(prob)), key=lambda k: prob[k], reverse=True)
                return labels[indice].tolist()
            
            y_pred = np.apply_along_axis(top5, axis=1, arr=prob)
        else:
            top = dst_map[dst]
            y_pred = [top[:5] for k in range(len(group))]
    
        #truth = np.vstack((truth, y))
        #print predict.shape, np.array(y_pred).shape
        #predict = np.vstack((predict, y_pred))
        truth = truth + y
        predict = predict + list(y_pred)
        count = len(group)
        
        score = mtr.mapk(y, y_pred, 5)
        if writer!=None:
            writer.writerow([str(dst), str(score), str(count)])
        print  score, count
    
    return truth, predict

In [22]:
feaH = ['date_time',
           'in_date',
           'in_days',
           'site_name',
           'posa_continent',
           'user_location_city',
           'orig_destination_distance',
           'user_id',
           'is_mobile',
           'is_package',
           'channel',
           'srch_children_cnt',
           'srch_rm_cnt']
t, pred = predict_rank('processed_test.csv', '../models/all_data',  '../models/all_data', feaH, logfile='../eval/hybrid_eval.csv',startGroup=23429)

Loading Done
1 4 7 8 9 10 11 14 16 19 21 24 25 26 27 30 33 37 39 40 43 46 51 53 54 57 59 62 63 66 68 70 72 73 76 81 84 87 88 89 95 96 99 100 101 102 103 104 105 106 108 109 110 111 112 114 116 117 118 119 120 121 123 124 127 128 129 131 134 135 137 138 140 141 142 145 147 148 152 154 159 161 164 165 170 172 177 181 182 186 188 191 194 196 208 209 210 211 212 218 221 223 226 227 228 235 236 238 241 243 244 245 246 248 250 251 255 256 257 259 260 262 263 268 269 281 284 285 286 287 289 290 293 295 296 298 300 302 303 304 305 307 309 310 315 316 317 318 319 321 323 325 327 328 329 330 331 335 337 339 343 347 348 354 355 356 357 358 360 363 365 366 367 368 369 371 375 379 380 381 382 383 385 386 389 390 392 393 399 402 405 406 407 408 409 410 411 412 415 416 420 421 422 424 425 426 427 429 431 432 434 435 436 438 439 440 445 446 448 449 453 457 460 466 467 468 470 474 475 477 480 482 484 485 486 488 489 493 496 498 500 501 503 505 506 508 511 512 513 516 520 521 526 528 529 530 531 532 533

In [23]:
evalue = pd.read_csv('../eval/hybrid_eval.csv')

In [24]:
score = evalue.ix[:, 1]
weight1 = evalue.ix[:, 2]
weight2 = np.where(weight1 == 0, 1., weight1)

In [25]:
print np.average(score, weights=weight1)
print np.average(score, weights=weight2)

0.467149645368
0.467149645368
