In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
import seaborn as sns
from matplotlib import rcParams

from sklearn.ensemble import RandomForestClassifier

import datetime
from operator import itemgetter
import csv

pd.set_option('display.max_columns',75)
pd.set_option('display.max_rows',75)

%matplotlib inline

## Sites to look at

https://www.dataquest.io/blog/kaggle-tutorial/

Need one-hot encoding for our categorical data if using scikit randomforest

https://roamanalytics.com/2016/10/28/are-categorical-variables-getting-lost-in-your-random-forests/

## K-fold cross-validation of Random Forest Classifier with added features

##### Load dataset

In [2]:
df = pd.read_csv('../../training_set_VU_DM_2014.csv')

##### Convert time-fields to usable features

In [3]:
#convert date_time to datetime
df["date_time"] = pd.to_datetime(df["date_time"])

#add column for the starting date of the booking
df["book_start"] = df["date_time"] + pd.to_timedelta(df['srch_booking_window'], unit='D')
df["book_end"] = df["book_start"] + pd.to_timedelta(df['srch_length_of_stay'], unit='D')

#extract usable features
df["srch_weekday"] = df["date_time"].dt.weekday
df["srch_month"] = df["date_time"].dt.month
df["srch_quarter"] = df["date_time"].dt.quarter
df["srch_year"] = df["date_time"].dt.year
df["book_start_weekday"] = df["book_start"].dt.weekday
df["book_start_month"] = df["book_start"].dt.month
df["book_start_quarter"] = df["book_start"].dt.quarter
df["book_start_year"] = df["book_start"].dt.year
df["book_end_weekday"] = df["book_end"].dt.weekday
df["book_end_month"] = df["book_end"].dt.month
df["book_end_quarter"] = df["book_end"].dt.quarter
df["book_end_year"] = df["book_end"].dt.year


##### Some feature engineering with competitor columns

In [4]:
#Take minimum of comp#_rate columns. If Expedia is cheaper than all competitors (all are 1), this will be 1.
#If one competitor is cheaper (-1), this column will equal -1 (and have less chance of being booked at Expedia!)
df["comp"] = df[["comp%d_rate"%i for i in range(1,9)]].min(axis=1)

In [5]:
#multiply comp_rate and comp_rate_percent_diff and drop the old columns
for i in range(1,9):
    df["comp%d"%i] = df["comp%d_rate"%i] * df["comp%d_rate_percent_diff"%i]
    df.drop(["comp%d_rate"%i, "comp%d_rate_percent_diff"%i],axis=1,inplace=True)

In [6]:
#https://stackoverflow.com/a/30949063

df['avg_prop_starrating'] = df.groupby('srch_id')['prop_starrating'].transform('mean')
df['avg_prop_location_score1'] = df.groupby('srch_id')['prop_location_score1'].transform('mean')
df['avg_prop_location_score2'] = df.groupby('srch_id')['prop_location_score2'].transform('mean')
df['avg_price_usd'] = df.groupby('srch_id')['price_usd'].transform('mean')


In [7]:
#create difference columns, comparing the average with the value of each row
df['avg_prop_starrating_diff'] = df['prop_starrating'] - df['avg_prop_starrating']
df['avg_prop_location_score1_diff'] = df['prop_location_score1'] - df['avg_prop_location_score1']
df['avg_prop_location_score2_diff'] = df['prop_location_score2'] - df['avg_prop_location_score2']
df['avg_price_usd_diff'] = df['avg_price_usd'] - df['price_usd'] #cheaper is better!

##### Split training set into training and test set for evaluating performance

We don't have book_bool in the test set provided, so we can do this to estimate performance?

In [8]:
print("TOTAL SEARCHES IN DATASET: %d"%len(df["srch_id"].unique()))

TOTAL SEARCHES IN DATASET: 199795


In [9]:
#perform 10-fold cross-validation over a subset of the data
splits = 10
ids = df["srch_id"].unique()
np.random.shuffle(ids)
ids = ids[0:int(len(ids)/splits)*splits] #drop some srch_ids to keep test set sizes equal
split_ids = np.split(ids,splits)

result = []

for n,i in enumerate(split_ids):
    
    test_set = df.loc[df["srch_id"].isin(i)]
    training_set = df.loc[~(df["srch_id"].isin(i)) & (df["srch_id"].isin(ids))]
#     print(len(test_set["srch_id"].unique()))
#     print(len(training_set["srch_id"].unique()))
    
    #do something with NaN? For now, just set to zero
    
    #this code is based on the benchmark code given at
    #https://github.com/benhamner/ExpediaPersonalizedSortCompetition/
    
    test_set.fillna(0,inplace=True)
    training_set.fillna(0,inplace=True)
    
    #train model
    feature_names = list(training_set.columns)
    feature_names.remove("date_time")
    feature_names.remove("book_start")
    feature_names.remove("book_end")
    feature_names.remove("position")
    feature_names.remove("click_bool")
    feature_names.remove("gross_bookings_usd")
    feature_names.remove("random_bool")
    feature_names.remove("booking_bool")
    features = training_set[feature_names].values
    
    target = training_set["booking_bool"].values
    classifier = RandomForestClassifier(n_estimators=50, 
                                        verbose=1,
                                        n_jobs=4,
                                        min_samples_split=10,
                                        random_state=1)
    classifier.fit(features, target)
    
    #test model
    feature_names = list(test_set.columns)
    feature_names.remove("date_time")
    feature_names.remove("book_start")
    feature_names.remove("book_end")
    feature_names.remove("position")
    feature_names.remove("click_bool")
    feature_names.remove("gross_bookings_usd")
    feature_names.remove("random_bool")
    feature_names.remove("booking_bool")
    features = test_set[feature_names].values
    
    predictions = classifier.predict_proba(features)[:,1]
    predictions = list(-1.0*predictions)
    recommendations = zip(test_set["srch_id"], test_set["prop_id"], test_set["booking_bool"], test_set["click_bool"], predictions)
    
    rows = [(srch_id, prop_id,booking_bool,click_bool)
        for srch_id, prop_id,booking_bool,click_bool, rank_float
        in sorted(recommendations, key=itemgetter(0,4))]
    
    with open("predict%d.csv"%n, "w") as outfile:
        writer = csv.writer(outfile, lineterminator="\n")
        writer.writerow(("SearchId", "PropertyId", "BookingBool","ClickBool"))
        writer.writerows(rows)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  5.4min
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:  6.3min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    2.3s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    2.7s finished
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  5.4min
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:  6.3min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    2.3s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    2.8s finished
A value is trying to be set on a cop

In [12]:
def relevance(row):
    if row["BookingBool"]:
        return 5
    elif row["ClickBool"]:
        return 1
    else:
        return 0

#https://www.kaggle.com/wendykan/ndcg-example
""" Reference from https://gist.github.com/bwhite/3726239
"""
def dcg_at_k(r, k):
    """Score is discounted cumulative gain (dcg)
    Relevance is positive real values.  Can use binary
    as the previous methods.
    Example from
    http://www.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf
    Args:
        r: Relevance scores (list or numpy) in rank order
            (first element is the first item)
        k: Number of results to consider
        weights are [1.0, 0.6309, 0.5, 0.4307, ...]
    Returns:
        Discounted cumulative gain
    """
    r = np.asfarray(r)[:k]
    if r.size:
        return np.sum((2 ** r -1)/ np.log2(np.arange(2, r.size + 2)))
    return 0.


def ndcg_at_k(r, k):
    """Score is normalized discounted cumulative gain (ndcg)
    Relevance is positive real values.  Can use binary
    as the previous methods.
    Example from
    http://www.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf
    Args:
        r: Relevance scores (list or numpy) in rank order
            (first element is the first item)
        k: Number of results to consider
        weights are [1.0, 0.6309, 0.5, 0.4307, ...]
    Returns:
        Normalized discounted cumulative gain
    """
    dcg_max = dcg_at_k(sorted(r, reverse=True), k)
    if not dcg_max:
        return 0.
    return dcg_at_k(r, k) / dcg_max

score = []
for n in range(splits):
    result = pd.read_csv('predict%d.csv'%n)
    result['rel'] = result.apply(relevance,axis=1)
    score_n = 0
    for srch_id in result["SearchId"].unique():
        #use NDCG@38 as per Kaggle site
        score_n += ndcg_at_k(result.loc[result["SearchId"]==srch_id,"rel"].values,38)
    score.append(score_n / len(result["SearchId"].unique()))
print(score)
print("AVERAGED SCORE: %s"%np.mean(score))

[0.4760020961993411, 0.4766235035408516, 0.47522252433971174, 0.4774141403494979, 0.4762552127576981, 0.47598171981198145, 0.47683992258586944, 0.4778673001731456, 0.47843638233800967, 0.47843907176041184]
AVERAGED SCORE: 0.4769081873856519


In [13]:
for n,i in enumerate(score):
    print("SPLIT %d: %.16f"%(n,i))
print("AVERAGED SCORE: %.16f +/- %.16f"%(np.mean(score),1.96*np.std(score)))

SPLIT 0: 0.4760020961993411
SPLIT 1: 0.4766235035408516
SPLIT 2: 0.4752225243397117
SPLIT 3: 0.4774141403494979
SPLIT 4: 0.4762552127576981
SPLIT 5: 0.4759817198119815
SPLIT 6: 0.4768399225858694
SPLIT 7: 0.4778673001731456
SPLIT 8: 0.4784363823380097
SPLIT 9: 0.4784390717604118
AVERAGED SCORE: 0.4769081873856519 +/- 0.0020451505083519


# Exploration

##### show first few rows

In [3]:
df.head(50)

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,prop_log_historical_price,position,price_usd,promotion_flag,srch_destination_id,srch_length_of_stay,srch_booking_window,srch_adults_count,srch_children_count,srch_room_count,srch_saturday_night_bool,srch_query_affinity_score,...,comp1_rate_percent_diff,comp2_rate,comp2_inv,comp2_rate_percent_diff,comp3_rate,comp3_inv,comp3_rate_percent_diff,comp4_rate,comp4_inv,comp4_rate_percent_diff,comp5_rate,comp5_inv,comp5_rate_percent_diff,comp6_rate,comp6_inv,comp6_rate_percent_diff,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,click_bool,gross_bookings_usd,booking_bool
0,1,2013-04-04 08:32:15,12,187,,,219,893,3,3.5,1,2.83,0.0438,4.95,27,104.77,0,23246,1,0,4,0,1,1,,...,,0.0,0.0,,0.0,0.0,,,,,0.0,0.0,,,,,,,,0.0,0.0,,0,,0
1,1,2013-04-04 08:32:15,12,187,,,219,10404,4,4.0,1,2.2,0.0149,5.03,26,170.74,0,23246,1,0,4,0,1,1,,...,,,,,0.0,0.0,,,,,0.0,1.0,,,,,,,,0.0,0.0,,0,,0
2,1,2013-04-04 08:32:15,12,187,,,219,21315,3,4.5,1,2.2,0.0245,4.92,21,179.8,0,23246,1,0,4,0,1,1,,...,,0.0,0.0,,0.0,0.0,,,,,0.0,0.0,,,,,,,,0.0,0.0,,0,,0
3,1,2013-04-04 08:32:15,12,187,,,219,27348,2,4.0,1,2.83,0.0125,4.39,34,602.77,0,23246,1,0,4,0,1,1,,...,,-1.0,0.0,5.0,-1.0,0.0,5.0,,,,0.0,1.0,,,,,,,,-1.0,0.0,5.0,0,,0
4,1,2013-04-04 08:32:15,12,187,,,219,29604,4,3.5,1,2.64,0.1241,4.93,4,143.58,0,23246,1,0,4,0,1,1,,...,,0.0,0.0,,0.0,0.0,,,,,0.0,0.0,,,,,,,,0.0,0.0,,0,,0
5,1,2013-04-04 08:32:15,12,187,,,219,30184,4,4.5,1,2.77,0.1302,5.2,7,195.32,0,23246,1,0,4,0,1,1,,...,,,,,0.0,0.0,7.0,,,,0.0,0.0,,,,,,,,0.0,0.0,7.0,0,,0
6,1,2013-04-04 08:32:15,12,187,,,219,44147,3,3.5,1,2.2,0.0356,4.81,18,129.35,0,23246,1,0,4,0,1,1,,...,,0.0,0.0,,0.0,0.0,,,,,,,,,,,,,,0.0,0.0,,0,,0
7,1,2013-04-04 08:32:15,12,187,,,219,50984,2,0.0,0,1.61,,4.14,35,85.37,0,23246,1,0,4,0,1,1,,...,,,,,,,,,,,,,,,,,,,,,,,0,,0
8,1,2013-04-04 08:32:15,12,187,,,219,53341,4,4.0,1,2.56,0.1238,5.18,3,150.05,0,23246,1,0,4,0,1,1,,...,,0.0,0.0,,0.0,0.0,,,,,,,,,,,,,,0.0,0.0,6.0,0,,0
9,1,2013-04-04 08:32:15,12,187,,,219,56880,4,4.0,1,2.83,0.1028,5.15,10,280.69,0,23246,1,0,4,0,1,1,,...,,0.0,0.0,,0.0,0.0,,,,,0.0,0.0,,,,,,,,0.0,0.0,,0,,0


##### show column names

In [4]:
print(df.columns.values)

['srch_id' 'date_time' 'site_id' 'visitor_location_country_id'
 'visitor_hist_starrating' 'visitor_hist_adr_usd' 'prop_country_id'
 'prop_id' 'prop_starrating' 'prop_review_score' 'prop_brand_bool'
 'prop_location_score1' 'prop_location_score2' 'prop_log_historical_price'
 'position' 'price_usd' 'promotion_flag' 'srch_destination_id'
 'srch_length_of_stay' 'srch_booking_window' 'srch_adults_count'
 'srch_children_count' 'srch_room_count' 'srch_saturday_night_bool'
 'srch_query_affinity_score' 'orig_destination_distance' 'random_bool'
 'comp1_rate' 'comp1_inv' 'comp1_rate_percent_diff' 'comp2_rate'
 'comp2_inv' 'comp2_rate_percent_diff' 'comp3_rate' 'comp3_inv'
 'comp3_rate_percent_diff' 'comp4_rate' 'comp4_inv'
 'comp4_rate_percent_diff' 'comp5_rate' 'comp5_inv'
 'comp5_rate_percent_diff' 'comp6_rate' 'comp6_inv'
 'comp6_rate_percent_diff' 'comp7_rate' 'comp7_inv'
 'comp7_rate_percent_diff' 'comp8_rate' 'comp8_inv'
 'comp8_rate_percent_diff' 'click_bool' 'gross_bookings_usd'
 'booking_

##### show column types

In [5]:
df.dtypes

srch_id                          int64
date_time                       object
site_id                          int64
visitor_location_country_id      int64
visitor_hist_starrating        float64
visitor_hist_adr_usd           float64
prop_country_id                  int64
prop_id                          int64
prop_starrating                  int64
prop_review_score              float64
prop_brand_bool                  int64
prop_location_score1           float64
prop_location_score2           float64
prop_log_historical_price      float64
position                         int64
price_usd                      float64
promotion_flag                   int64
srch_destination_id              int64
srch_length_of_stay              int64
srch_booking_window              int64
srch_adults_count                int64
srch_children_count              int64
srch_room_count                  int64
srch_saturday_night_bool         int64
srch_query_affinity_score      float64
                         

##### show short description per column

In [12]:
df.describe()

Unnamed: 0,srch_id,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,prop_log_historical_price,position,price_usd,promotion_flag,srch_destination_id,srch_length_of_stay,srch_booking_window,srch_adults_count,srch_children_count,srch_room_count,srch_saturday_night_bool,srch_query_affinity_score,orig_destination_distance,random_bool,comp1_rate,comp1_inv,comp1_rate_percent_diff,comp2_rate,comp2_inv,comp2_rate_percent_diff,comp3_rate,comp3_inv,comp3_rate_percent_diff,comp4_rate,comp4_inv,comp4_rate_percent_diff,comp5_rate,comp5_inv,comp5_rate_percent_diff,comp6_rate,comp6_inv,comp6_rate_percent_diff,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,click_bool,gross_bookings_usd,booking_bool
count,4958347.0,4958347.0,4958347.0,251866.0,252988.0,4958347.0,4958347.0,4958347.0,4950983.0,4958347.0,4958347.0,3867999.0,4958347.0,4958347.0,4958347.0,4958347.0,4958347.0,4958347.0,4958347.0,4958347.0,4958347.0,4958347.0,4958347.0,317406.0,3350565.0,4958347.0,119930.0,129559.0,94439.0,2024672.0,2130269.0,556238.0,1534288.0,1650990.0,472797.0,307378.0,343663.0,131086.0,2222373.0,2360020.0,841099.0,240157.0,260976.0,96174.0,315348.0,356422.0,138515.0,1916654.0,1987503.0,614730.0,4958347.0,138390.0,4958347.0
mean,166366.6,9.953133,175.3405,3.374334,176.022659,173.9739,70079.18,3.180525,3.777777,0.6346994,2.872589,0.1303852,4.317913,16.85624,254.2096,0.2156198,14042.63,2.385427,37.47417,1.972637,0.3504918,1.110525,0.5022127,-24.146418,1301.234,0.2959004,0.479788,0.031059,244.229916,0.04418987,0.03931006,18.490732,0.0172901,0.05776897,27.071559,-0.016543,0.096653,175.3165,0.02267621,0.0514288,29.851903,0.128329,0.075957,17.250473,0.145969,0.083202,19.433267,-0.06089936,0.009962752,22.430384,0.04474858,386.283316,0.02791051
std,96112.23,7.64689,65.91625,0.692519,107.254493,68.34525,40609.92,1.051024,1.050329,0.4815144,1.531011,0.1594634,1.834869,10.42566,16001.24,0.4112517,8111.843,2.053243,51.99341,0.8570628,0.7325692,0.417229,0.4999952,15.743238,2023.951,0.4564465,0.641565,0.229688,1165.448634,0.4484236,0.2342388,240.113851,0.4575808,0.2948202,1012.919911,0.578718,0.337165,5757.74,0.5217575,0.2678363,1340.262382,0.559841,0.302668,31.160313,0.578202,0.316722,54.370221,0.4691723,0.2029142,895.965854,0.2067514,821.190577,0.1647165
min,1.0,1.0,1.0,1.41,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,1.0,0.0,1.0,0.0,1.0,0.0,-326.5675,0.01,0.0,-1.0,-1.0,2.0,-1.0,-1.0,2.0,-1.0,-1.0,2.0,-1.0,-1.0,2.0,-1.0,-1.0,2.0,-1.0,-1.0,2.0,-1.0,-1.0,2.0,-1.0,-1.0,2.0,0.0,0.0,0.0
25%,82936.0,5.0,100.0,2.92,109.81,100.0,35010.0,3.0,3.5,0.0,1.79,0.019,4.45,8.0,85.0,0.0,7101.0,1.0,4.0,2.0,0.0,1.0,0.0,-30.774775,139.8,0.0,0.0,0.0,7.0,0.0,0.0,7.0,0.0,0.0,7.0,0.0,0.0,7.0,0.0,0.0,7.0,0.0,0.0,6.0,0.0,0.0,7.0,0.0,0.0,7.0,0.0,124.0,0.0
50%,166507.0,5.0,219.0,3.45,152.24,219.0,69638.0,3.0,4.0,1.0,2.77,0.069,4.91,16.0,122.0,0.0,13541.0,2.0,17.0,2.0,0.0,1.0,1.0,-20.4513,386.6,0.0,1.0,0.0,10.0,0.0,0.0,11.0,0.0,0.0,11.0,0.0,0.0,11.0,0.0,0.0,12.0,0.0,0.0,11.0,0.0,0.0,12.0,0.0,0.0,11.0,0.0,218.4,0.0
75%,249724.0,14.0,219.0,3.93,213.49,219.0,105168.0,4.0,4.5,1.0,4.04,0.1805,5.31,26.0,184.96,0.0,21084.0,3.0,48.0,2.0,0.0,1.0,1.0,-13.350625,1500.67,1.0,1.0,0.0,16.0,0.0,0.0,20.0,0.0,0.0,18.0,0.0,0.0,19.0,0.0,0.0,21.0,0.0,0.0,18.0,1.0,0.0,20.0,0.0,0.0,17.0,0.0,429.79,0.0
max,332785.0,34.0,231.0,5.0,1958.7,230.0,140821.0,5.0,5.0,1.0,6.98,1.0,6.21,40.0,19726330.0,1.0,28416.0,57.0,492.0,9.0,9.0,8.0,1.0,-2.4941,11666.64,1.0,1.0,1.0,30389.0,1.0,1.0,168893.0,1.0,1.0,199266.0,1.0,1.0,1001584.0,1.0,1.0,607561.0,1.0,1.0,1620.0,1.0,1.0,9900.0,1.0,1.0,149400.0,1.0,159292.38,1.0


In [None]:
len(df["srch_id"].unique())

In [None]:
df.groupby("srch_id").count()["date_time"].max()

##### show correlations with the target booking_bool

In [14]:
df.corr()["booking_bool"]

srch_id                        0.000487
site_id                       -0.001354
visitor_location_country_id    0.002524
visitor_hist_starrating       -0.002044
visitor_hist_adr_usd           0.000602
prop_country_id                0.001227
prop_id                       -0.000508
prop_starrating                0.021206
prop_review_score              0.025800
prop_brand_bool                0.009991
prop_location_score1          -0.003273
prop_location_score2           0.066405
prop_log_historical_price     -0.000807
position                      -0.147918
price_usd                      0.000067
promotion_flag                 0.036047
srch_destination_id            0.000800
srch_length_of_stay           -0.024412
srch_booking_window           -0.019582
srch_adults_count             -0.005376
srch_children_count            0.003872
srch_room_count                0.007948
srch_saturday_night_bool       0.005478
srch_query_affinity_score      0.025524
orig_destination_distance     -0.003687


##### comp#_inv has some weird values

In [None]:
#what are -1 values? Expedia has no availability, but competitor does? Why would Expedia then show the hotel?
#(Create new feature based on availablility?)
print(df["comp1_inv"].value_counts())
print("Number of NaNs: %d"%df["comp1_inv"].isnull().sum())

##### Not all searches lead to a booking, but all do have a clicked item!

In [3]:
id_book = set(df.loc[df["booking_bool"]==1,"srch_id"].unique())
ids = set(df["srch_id"].unique())
print("Number of unique searches: %d"%len(ids))
print("Number of unique searches resulting in booking: %d"%len(id_book))
print("Number of unique searches without booking: %d"%len(ids-id_book))
id_click = set(df.loc[df["click_bool"]==1,"srch_id"].unique())
print("Number of unique searches with clicks: %d"%len(id_click))
print("Number of unique searches without clicks: %d"%len(ids-id_click))

Number of unique searches: 199795
Number of unique searches resulting in booking: 138390
Number of unique searches without booking: 61405
Number of unique searches with clicks: 199795
Number of unique searches without clicks: 0


##### Get N largest values from column

In [7]:
df.loc[np.argsort(df["price_usd"].values)[-1:-10:-1],["srch_id","price_usd"]]

Unnamed: 0,srch_id,price_usd
1168566,78107,19726328.0
680748,45559,11818011.0
3117007,209314,9381308.71
1168574,78107,5444467.0
2945135,197817,4973355.0
1168576,78107,4884239.0
1168562,78107,4339792.0
1168580,78107,4260887.0
4172824,279943,4216286.0


In [16]:
#this search seems to have incorrect price_usd values, as all values have no cents?
df.loc[df["srch_id"]==78107]

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,prop_log_historical_price,position,price_usd,promotion_flag,srch_destination_id,srch_length_of_stay,srch_booking_window,srch_adults_count,srch_children_count,srch_room_count,srch_saturday_night_bool,srch_query_affinity_score,orig_destination_distance,random_bool,comp1_rate,comp1_inv,comp1_rate_percent_diff,comp2_rate,comp2_inv,comp2_rate_percent_diff,comp3_rate,comp3_inv,comp3_rate_percent_diff,comp4_rate,comp4_inv,comp4_rate_percent_diff,comp5_rate,comp5_inv,comp5_rate_percent_diff,comp6_rate,comp6_inv,comp6_rate_percent_diff,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,click_bool,gross_bookings_usd,booking_bool
1168561,78107,2013-06-17 17:32:48,5,202,,,39,5131,4,3.5,1,4.26,0.0636,0.0,21,1494033.0,1,15122,4,7,2,1,1,0,,,0,,,,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,1.0,,,,,,,,,,,0,,0
1168562,78107,2013-06-17 17:32:48,5,202,,,39,7077,5,4.5,1,3.76,0.0323,0.0,27,4339792.0,0,15122,4,7,2,1,1,0,,,0,,,,1.0,0.0,9.0,0.0,0.0,,0.0,0.0,,1.0,0.0,9.0,,,,,,,,,,0,,0
1168563,78107,2013-06-17 17:32:48,5,202,,,39,21839,3,3.5,0,4.16,0.0572,0.0,29,1810876.0,1,15122,4,7,2,1,1,0,,,0,,,,0.0,0.0,,0.0,0.0,,,1.0,,0.0,0.0,,,,,,,,,,,0,,0
1168564,78107,2013-06-17 17:32:48,5,202,,,39,30422,5,4.5,1,4.33,0.1243,0.0,16,3235118.0,0,15122,4,7,2,1,1,0,,,0,,,,1.0,0.0,21.0,0.0,0.0,8.0,0.0,0.0,,0.0,0.0,,,,,,,,,,,0,,0
1168565,78107,2013-06-17 17:32:48,5,202,,,39,39047,5,4.5,1,4.36,0.1437,0.0,8,2461846.0,1,15122,4,7,2,1,1,0,,,0,,,,-1.0,0.0,15.0,-1.0,0.0,15.0,-1.0,0.0,15.0,-1.0,0.0,15.0,,,,,,,,,,0,,0
1168566,78107,2013-06-17 17:32:48,5,202,,,39,39677,5,5.0,1,2.4,0.0074,0.0,12,19726328.0,0,15122,4,7,2,1,1,0,,,0,,,,0.0,0.0,2.0,0.0,0.0,2.0,0.0,0.0,5.0,0.0,0.0,2.0,,,,,,,,,,0,,0
1168567,78107,2013-06-17 17:32:48,5,202,,,39,47308,4,4.0,0,4.33,0.0754,0.0,25,1509064.0,0,15122,4,7,2,1,1,0,,,0,,,,0.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,7.0,0.0,0.0,4.0,,,,,,,,,,0,,0
1168568,78107,2013-06-17 17:32:48,5,202,,,39,54873,5,4.5,0,4.19,0.1185,0.0,6,2741960.0,0,15122,4,7,2,1,1,0,,,0,,,,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,,,,,,,,,,0,,0
1168569,78107,2013-06-17 17:32:48,5,202,,,39,55399,4,3.5,1,4.04,0.03,0.0,13,1467638.0,1,15122,4,7,2,1,1,0,,,0,,,,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,,,,,,,,,,,0,,0
1168570,78107,2013-06-17 17:32:48,5,202,,,39,61111,4,4.0,1,2.64,0.0043,0.0,15,2589139.0,1,15122,4,7,2,1,1,0,,,0,,,,,,,,,,,,,,,,,,,,,,,,,0,,0


##### show histogram of all columns

In [None]:
df.hist(df.columns.values,figsize=(8,10*len(df.columns)),layout=(len(df.columns),1));