In [None]:
import numpy as np, scipy as sp,pandas as pd, matplotlib.pyplot as plt
import matplotlib, sklearn
import os,sys,csv

## Data IO

In [None]:
# Locations of the raw competition CSV files, relative to the notebook.
# NOTE(review): the train file lives under ../data/ while the test path points
# one directory higher (../test.csv) -- confirm this is intentional.
trainPath= '../data/train.csv'
testPath= '../test.csv'

In [None]:
def csv_nrows(fpath):
    """Count the data records of a CSV file, not counting the header.

    The file is streamed through ``csv.reader``, so it is never held in
    memory as a whole and quoted fields that span several physical lines
    count as a single record.

    Parameters
    ----------
    fpath : str
        Path of the CSV file to inspect.

    Returns
    -------
    int
        Number of records after the first (header) record; 0 for a file
        that is empty or contains only a header.
    """
    with open(fpath, "r") as csvfile:
        reader = csv.reader(csvfile)
        # The builtin next() works on Python 2 and 3 alike; the default value
        # keeps a completely empty file from raising StopIteration.
        next(reader, None)
        return sum(1 for _ in reader)

In [None]:
def csv_split(fpath, nrows=5000000):
    """Split a CSV file into numbered chunk files of at most ``nrows`` records.

    Chunk ``i`` is written next to the input as ``<name>_<i><ext>`` (for
    example ``train.csv`` -> ``train_0.csv``, ``train_1.csv``, ...) and every
    chunk starts with a copy of the input's header record.  Progress, the
    total number of records copied and a final ``Done`` are printed.

    Parameters
    ----------
    fpath : str
        Path of the CSV file to split.
    nrows : int, optional
        Maximum number of data records per chunk (header not counted).

    Raises
    ------
    ValueError
        If ``nrows`` is not positive.
    """
    if nrows <= 0:
        raise ValueError("nrows must be a positive number of rows per chunk")

    def _open_csv(path, mode):
        # The csv module expects binary files on Python 2 and text files
        # opened with newline='' on Python 3.
        if sys.version_info[0] < 3:
            return open(path, mode + 'b')
        return open(path, mode, newline='')

    fname, ext = os.path.splitext(fpath)
    count = 0
    chunk_file = None
    writer = None
    try:
        with _open_csv(fpath, 'r') as csvfile:
            reader = csv.reader(csvfile)
            # None for an empty input: the loop below then never runs.
            header = next(reader, None)

            for line in reader:
                if count % nrows == 0:
                    # Finish the previous chunk before starting a new one so
                    # that at most one output handle is open at any time.
                    if chunk_file is not None:
                        chunk_file.close()
                    chunk_id = count // nrows
                    print("New Patch %d" % int(chunk_id))
                    chunk_file = _open_csv(fname + '_' + str(chunk_id) + ext, 'w')
                    writer = csv.writer(chunk_file)
                    writer.writerow(header)
                writer.writerow(line)
                count += 1
    finally:
        # Flush and release the last chunk even if reading or writing failed.
        if chunk_file is not None:
            chunk_file.close()

    print(count)
    print('Done')

In [None]:
# Write the training CSV out as chunk files (<name>_0<ext>, <name>_1<ext>, ...)
# next to the original, using csv_split's default chunk size.
csv_split(trainPath)

In [None]:
# Number of data records (header excluded) in the full training file.
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(csv_nrows(trainPath))

In [None]:

# Load the complete train and test CSV files into DataFrames.
# NOTE(review): neither train_data nor test_data is referenced again in the
# visible cells (later work reads the train_0.csv chunk instead) -- confirm
# this full load is still needed.
train_data = pd.read_csv(trainPath)
test_data = pd.read_csv(testPath)

## Feature Processing

In [None]:
# Column names of the raw training data, in file order.
# (presumably mirrors the CSV header -- verify against the file)
headers = ['date_time', 
           'site_name',
           'posa_continent',
           'user_location_country',
           'user_location_region',
           'user_location_city',
           'orig_destination_distance',
           'user_id',
           'is_mobile',
           'is_package',
           'channel',
           'srch_ci',
           'srch_co',
           'srch_adults_cnt',
           'srch_children_cnt',
           'srch_rm_cnt',
           'srch_destination_id',
           'srch_destination_type_id',
           'hotel_continent',
           'hotel_country',
           'hotel_market',
           'is_booking',
           'cnt',
           'hotel_cluster']

# Columns that time_feature_processing adds to a frame.
new_fields = ['in_days', 'in_date']
# Columns that time_feature_processing overwrites with a derived value.
modified = ['date_time']

# Column layout after feature processing: 'srch_ci' / 'srch_co' are gone and
# the derived 'in_date' / 'in_days' are present.  Order matters: the last
# entry ('hotel_cluster') is the prediction target and is sliced off with
# updated[:-1] when the feature matrix is built.
updated = ['date_time',
           'in_date',
           'in_days',
           'site_name',
           'posa_continent',
           'user_location_country',
           'user_location_region',
           'user_location_city',
           'hotel_continent',
           'hotel_country',
           'hotel_market',
           'srch_destination_id',
           'orig_destination_distance',
           'user_id',
           'is_mobile',
           'is_package',
           'channel',
           'srch_adults_cnt',
           'srch_children_cnt',
           'srch_rm_cnt',
           'srch_destination_type_id',
           'is_booking',
           'cnt',
           'hotel_cluster']

In [None]:
# Load the first chunk of the training data; this path matches the name
# csv_split gives chunk 0 of '../data/train.csv', so csv_split must have run.
sample = pd.read_csv('../data/train_0.csv')   

In [None]:
def time_feature_processing(df):
    """Replace the raw date columns of ``df`` with numeric features, in place.

    Columns written:

    * ``in_days``   -- length of the stay in days (``srch_co - srch_ci``);
      1 when either date is missing.
    * ``in_date``   -- zero-based day of the year of the stay's midpoint,
      i.e. check-in plus half the stay rounded down to whole days;
      0 when either date is missing.
    * ``date_time`` -- overwritten with the zero-based day of the year of the
      original timestamp plus the hour as a fraction of a day;
      0 when the timestamp is missing.

    ``srch_ci`` and ``srch_co`` are dropped afterwards.  Values that cannot be
    parsed as dates are treated like missing values.

    The computation is vectorized and uses explicit floor division, so it does
    not depend on Python 2 integer division or on what type pandas returns
    when two ``Period`` objects are subtracted.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``date_time``, ``srch_ci`` and ``srch_co`` columns.  It is
        modified in place; nothing is returned.
    """
    check_in = pd.to_datetime(df['srch_ci'], errors='coerce')
    check_out = pd.to_datetime(df['srch_co'], errors='coerce')
    searched_at = pd.to_datetime(df['date_time'], errors='coerce')

    # NaN wherever check-in or check-out is missing or unparseable.
    stay_days = (check_out - check_in).dt.days
    has_both_dates = stay_days.notnull()

    # Floor division reproduces the whole-day midpoint of the original code.
    half_stay = pd.to_timedelta(stay_days.fillna(0) // 2, unit='D')
    midpoint = check_in + half_stay

    df['in_days'] = stay_days.fillna(1).astype('int64')
    df['in_date'] = (midpoint.dt.dayofyear - 1).where(has_both_dates, 0).astype('int64')
    df['date_time'] = (searched_at.dt.dayofyear - 1
                       + searched_at.dt.hour / 24.).fillna(0)
    df.drop(['srch_ci', 'srch_co'], axis=1, inplace=True)

In [None]:
def nan_feature_processing(df, colname):
    """Fill the missing entries of ``df[colname]`` with that column's median.

    The column is reassigned on ``df`` itself; nothing is returned.
    """
    column = df[colname]
    df[colname] = column.fillna(column.median())

In [None]:
# Work on an explicit copy of the first 50000 rows.  Both helpers below modify
# their argument in place; copying keeps `sample` untouched, avoids pandas'
# SettingWithCopyWarning and lets this cell be re-run safely.
part_df = sample[:50000].copy()
time_feature_processing(part_df)
nan_feature_processing(part_df, 'orig_destination_distance')

## Feature Analysis

In [None]:
# Pairwise correlation between all columns listed in `updated`, drawn as a
# colour-coded matrix with the column names on both axes.
corr_mat = part_df[updated].corr()
tick_positions = np.arange(len(updated))

plt.matshow(corr_mat)
plt.colorbar(shrink=0.5)
plt.xticks(tick_positions, updated, rotation='vertical')
plt.yticks(tick_positions, updated)
plt.show()

## Model

### Random Forest

In [None]:
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from sklearn.cross_validation import cross_val_score

import ml_metrics as mtr

In [None]:
# Feature matrix: every column of `updated` except the last one, which is the
# 'hotel_cluster' target.  `.values` replaces the deprecated (and later
# removed) DataFrame.as_matrix() and returns the same NumPy array.
# NOTE(review): 'is_booking' and 'cnt' are part of the features -- confirm
# they are available at prediction time.
X = part_df[updated[:-1]].values
y = part_df.hotel_cluster.values

In [None]:
def map_5_scorer(estimator, X, y, k=5):
    """Score a fitted classifier with mean average precision at ``k``.

    Usable as a scikit-learn ``scoring`` callable, which is invoked as
    ``scorer(estimator, X, y)``.

    Parameters
    ----------
    estimator : object
        Fitted classifier exposing ``predict_proba`` and, normally, a
        ``classes_`` attribute.
    X : array-like
        Samples to predict.
    y : iterable
        True label of each sample.
    k : int, optional
        Number of ranked predictions evaluated per sample (default 5).

    Returns
    -------
    float
        The value of ``mtr.mapk`` for the top-``k`` predicted labels.
    """
    prob = np.asarray(estimator.predict_proba(X))

    # Stable sort of the negated scores: columns by descending probability,
    # ties keeping their original column order.
    top_columns = np.argsort(-prob, axis=1, kind='mergesort')[:, :k]

    # predict_proba columns follow estimator.classes_, so column positions
    # must be translated to labels; they only coincide when the labels seen
    # during fitting are exactly 0..n-1.
    classes = getattr(estimator, 'classes_', None)
    if classes is not None:
        top_labels = np.asarray(classes)[top_columns]
    else:
        top_labels = top_columns

    # mapk expects a list of relevant labels per sample.
    y_true = [[label] for label in y]
    return mtr.mapk(y_true, top_labels.tolist(), k)

In [None]:
# Median imputation followed by a 100-tree random forest, scored with MAP@5
# over 5 cross-validation folds.
# NOTE(review): missing_values=0 tells the imputer to treat every literal 0
# as missing; that looks like it would also replace genuine zeros in columns
# such as 'is_mobile' or 'is_package' -- confirm this is intended.
estimator = Pipeline([
    ("imputer", Imputer(missing_values=0, strategy="median", axis=0)),
    # A fixed random_state makes the reported scores reproducible across runs.
    ("forest", RandomForestClassifier(n_estimators=100,
                                      n_jobs=3,
                                      random_state=0)),
])
score = cross_val_score(estimator, X, y, cv=5, scoring=map_5_scorer)
# Per-fold scores followed by their mean; formatted so the output is the
# same on Python 2 and Python 3.
print("%s %s" % (score, score.mean()))