# Start of Data Exploration

In [1]:
# Imports
import sklearn as sk
import pandas as pd

In [2]:
train_data = pd.read_csv('data/training_set_VU_DM.csv')
original_columns = train_data.columns
train_data.head(5) # Show top 5

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,...,comp6_rate_percent_diff,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,click_bool,gross_bookings_usd,booking_bool
0,1,2013-04-04 08:32:15,12,187,,,219,893,3,3.5,...,,,,,0.0,0.0,,0,,0
1,1,2013-04-04 08:32:15,12,187,,,219,10404,4,4.0,...,,,,,0.0,0.0,,0,,0
2,1,2013-04-04 08:32:15,12,187,,,219,21315,3,4.5,...,,,,,0.0,0.0,,0,,0
3,1,2013-04-04 08:32:15,12,187,,,219,27348,2,4.0,...,,,,,-1.0,0.0,5.0,0,,0
4,1,2013-04-04 08:32:15,12,187,,,219,29604,4,3.5,...,,,,,0.0,0.0,,0,,0


# Manual Column exploration
---
## Main columns
- `search_id` seems to represent each individual 'user'.
- `booking_bool` is essentially the answer.

## Categorical features
The following features are categorical (to be onehot-encoded):

User-specific
- `site_id`: category of website Expedia used
- `visitor_location_country_id`: categories of which country user is from
- `srch_destination_id`: where did the user search from
- `srch_saturday_night_bool`: boolean if stay includes staturday

Hotel-specific:
- `prop_id`: categories of associated hotels
- `prop_brand_bool`: boolean if hotel is part of chain or not
- `promotion_flag`: displaying promotion or not

Expedia-specific vs competitors 1_8:
- `comp{i}_rate`: if expedia has a lower price, do +1, 0 if same, -1 price is higher, null if no competitive data
- `comp{i}_inv`: if competitor has no availability, +1, 0 if both have availability, null if no competitive data

## Numerical features

User-specific
- `visitor_hist_starrating`: average of previous stars of associated user
- `visitor_hist_adr_usd`: average price per night of hotels of associated user
- `srch_length_of_stay`: number of nights stays **searched** 
- `srch_booking_window`: number of days ahead the start of booking window **searched**
- `srch_adults_count`: number of adults **searched**
- `srch_children_count`: number of children **searched**
- `srch_room_count`: number of rooms **searched**
- `random_bool`: if sort was random at time of search
- `gross_booking_usd`: ❗Training-only❗ payment includign taxes, etc for hotel

Hotel-specific
- `prop_starrating`: star rating of hotel (1-5)
- `prop_review_score`: average review score of hotel (1-5)
- `prop_location_score_1`: score1 of hotel's location desirability
- `prop_location_score_2`: score2 of hotel's location desirability
- `prop_log_historical_price`: logarithm of average price of hotel lately (0 == not sold)
- `price_usd`: displayed price of hotel.
    - ❗ Important: Different countries have different conventions.
    - Value can change per night
- `srch_query_affinity_score`: log probability a hotel is clicked in internet searches

User-hotel coupled:
- `orig_destination_distance`: distance between hotel and customer at search-time (null means no distance calculated)

Expedia-specific vs competitors 1_8:
- `comp{i}_rate_percent_diff`: absolute difference between expedia and competitor's price, with null being no competitive data


## Unknown type
- `date_time`

# Initial data cleanup

## Impute missing value

In [3]:
# We will have to cleanup our data next up. Let's first impute the missing columns. 
# To do this we search for the columns with nans
na_cols = train_data.isna().any()
nan_cols = train_data.columns[na_cols]
nan_cols

Index(['visitor_hist_starrating', 'visitor_hist_adr_usd', 'prop_review_score',
       'prop_location_score2', 'srch_query_affinity_score',
       'orig_destination_distance', 'comp1_rate', 'comp1_inv',
       'comp1_rate_percent_diff', 'comp2_rate', 'comp2_inv',
       'comp2_rate_percent_diff', 'comp3_rate', 'comp3_inv',
       'comp3_rate_percent_diff', 'comp4_rate', 'comp4_inv',
       'comp4_rate_percent_diff', 'comp5_rate', 'comp5_inv',
       'comp5_rate_percent_diff', 'comp6_rate', 'comp6_inv',
       'comp6_rate_percent_diff', 'comp7_rate', 'comp7_inv',
       'comp7_rate_percent_diff', 'comp8_rate', 'comp8_inv',
       'comp8_rate_percent_diff', 'gross_bookings_usd'],
      dtype='object')

Aside from `comp{i}_rate` and `comp2_inv`, all of these columns are numerical features. We could, initially,
simply replace all these values with -1 for the moment.

❗ Important: Note, this is actually incorrect, but might work for the moment.

In [4]:
# Simple numerical impute: select numerical data, fill it with -1
imputed_numerical_data = train_data[nan_cols].filter(regex='[^comp\d_(rate|inv)$]')
imputed_numerical_data = imputed_numerical_data.fillna(-1)
train_data.update(imputed_numerical_data)

# Manual cleanup to ensure no problem with space
del imputed_numerical_data
train_data.head(5)

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,...,comp6_rate_percent_diff,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,click_bool,gross_bookings_usd,booking_bool
0,1,2013-04-04 08:32:15,12,187,-1.0,-1.0,219,893,3,3.5,...,-1.0,,,-1.0,0.0,0.0,-1.0,0,-1.0,0
1,1,2013-04-04 08:32:15,12,187,-1.0,-1.0,219,10404,4,4.0,...,-1.0,,,-1.0,0.0,0.0,-1.0,0,-1.0,0
2,1,2013-04-04 08:32:15,12,187,-1.0,-1.0,219,21315,3,4.5,...,-1.0,,,-1.0,0.0,0.0,-1.0,0,-1.0,0
3,1,2013-04-04 08:32:15,12,187,-1.0,-1.0,219,27348,2,4.0,...,-1.0,,,-1.0,-1.0,0.0,5.0,0,-1.0,0
4,1,2013-04-04 08:32:15,12,187,-1.0,-1.0,219,29604,4,3.5,...,-1.0,,,-1.0,0.0,0.0,-1.0,0,-1.0,0


In [5]:
# Simple naive categorical impute
na_cols = train_data.columns[train_data.isna().any()]
imputed_categorical_data = train_data[na_cols].fillna(-2)
train_data.update(imputed_categorical_data)

# Cleanup
del imputed_categorical_data
train_data.head(5)

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,...,comp6_rate_percent_diff,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,click_bool,gross_bookings_usd,booking_bool
0,1,2013-04-04 08:32:15,12,187,-1.0,-1.0,219,893,3,3.5,...,-1.0,-2.0,-2.0,-1.0,0.0,0.0,-1.0,0,-1.0,0
1,1,2013-04-04 08:32:15,12,187,-1.0,-1.0,219,10404,4,4.0,...,-1.0,-2.0,-2.0,-1.0,0.0,0.0,-1.0,0,-1.0,0
2,1,2013-04-04 08:32:15,12,187,-1.0,-1.0,219,21315,3,4.5,...,-1.0,-2.0,-2.0,-1.0,0.0,0.0,-1.0,0,-1.0,0
3,1,2013-04-04 08:32:15,12,187,-1.0,-1.0,219,27348,2,4.0,...,-1.0,-2.0,-2.0,-1.0,-1.0,0.0,5.0,0,-1.0,0
4,1,2013-04-04 08:32:15,12,187,-1.0,-1.0,219,29604,4,3.5,...,-1.0,-2.0,-2.0,-1.0,0.0,0.0,-1.0,0,-1.0,0


# Initial feature transformation

In [6]:
# Imports for feature transformation
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [7]:
#Onehot encode the categorical variables
oh_encoder = OneHotEncoder()
oh_columns = ['site_id', 'visitor_location_country_id', 'prop_country_id', 'prop_id', 'prop_brand_bool', 'promotion_flag', 
              'srch_destination_id', 'srch_saturday_night_bool', 'random_bool', 'booking_bool', 'click_bool'
             ]
#todo competitor columns

for column in oh_columns:
    train_data[column]=train_data[column].astype('category')


#encode the numerical values
num_scale_encoder = StandardScaler()
num_scale_columns = ['visitor_hist_starrating', 'visitor_hist_adr_usd', 'prop_starrating', 'prop_review_score', 
                     'prop_location_score1', 'prop_location_score2', 'prop_log_historical_price', 'price_usd', 
                     'srch_length_of_stay', 'srch_booking_window', 'srch_adults_count', 'srch_children_count',
                     'srch_room_count', 'srch_query_affinity_score', 'orig_destination_distance' 
                    ]

#we do a preselection of columns that we feel will become useful features after encoding
chosen_columns = ['prop_starrating', 'prop_review_score', 'prop_location_score1', 'prop_location_score2', 
                  'prop_log_historical_price', 'price_usd', 'srch_query_affinity_score',  'promotion_flag']

chosen_oh_cols = list(set(chosen_columns) & set(oh_columns))
chosen_num_cols = list(set(chosen_columns) & set(num_scale_columns))
df_transformer = ColumnTransformer([
    ('oh', oh_encoder, oh_columns),
    ('num', num_scale_encoder, num_scale_columns),
], remainder='drop')


# We fit this transformer on our training data, and transform our training data into this new format
encoded_X = df_transformer.fit_transform(train_data)

# Initial model feature selection

In [17]:
from sklearn.feature_selection import SelectFromModel, SelectKBest, RFE
from sklearn.svm import SVC

In [18]:
model = SVC()
feature_selector = SelectFromModel(model)

In [20]:
X_new = model.transform(encoded_X)
X_new.shape

AttributeError: 'LinearSVC' object has no attribute 'transform'