In [134]:
import pandas as pd
import numpy as np
import math
import sklearn
from sklearn.preprocessing import OneHotEncoder

In [181]:
# Load the tab-separated challenge data; the first row holds the column names.
data = pd.read_csv('./data/TH_data_challenge.tsv', sep='\t', header=0)
# Print the names of the columns.
names = data.columns.values.tolist()
print(names)

# Divide the features into two groups:
#   * real-valued features are standardised, clipped and bucketed into integer ids;
#   * every other feature (boolean, string, identifier, ...) gets one integer id
#     per distinct value.
# In both groups a missing value is encoded as id 0.
real_value_set = {'m_effective_daily_price', 'm_pricing_cleaning_fee', 'dim_lat', 'dim_lng',
                  'm_checkouts', 'm_reviews', 'dim_person_capacity', 'image_quality_score',
                  'm_total_overall_rating', 'm_professional_pictures',
                  'ds_night_day_of_week', 'ds_night_day_of_year', 'ds_checkin_gap', 'ds_checkout_gap',
                  'occ_occupancy_plus_minus_7_ds_night', 'occ_occupancy_plus_minus_14_ds_night',
                  'occ_occupancy_trailing_90_ds', 'm_minimum_nights', 'm_maximum_nights',
                  'price_booked_most_recent', 'p2_p3_click_through_score', 'p3_inquiry_score',
                  'listing_m_listing_views_2_6_ds_night_decay',
                  'general_market_m_unique_searchers_0_6_ds_night',
                  'general_market_m_contacts_0_6_ds_night',
                  'general_market_m_reservation_requests_0_6_ds_night',
                  'general_market_m_is_booked_0_6_ds_night', 'm_available_listings_ds_night',
                  'days_since_last_booking'}
id_value_set = {'id_listing_anon', 'id_user_anon', 'dim_market', 'dim_room_type', 'cancel_policy'}

# Real values further than this many standard deviations from the column mean
# are treated as outliers and clipped to the boundary.
OUTLIER_STD = 5

# Parse the label column: anything greater than zero is a positive example.
data[names[0]] = data[names[0]].map(lambda x: 1.0 if x > 0.0 else 0.0)
y = data[names[0]]

# One-hot encoder applied to the integer ids produced below. Sparse output and
# automatic category detection are the encoder's defaults, so only the
# non-default dtype is passed explicitly.
onehot_id = OneHotEncoder(dtype=np.float32, handle_unknown='error')

# Encode every feature column (the first three columns are the label and two
# date columns) as non-negative integer ids, in place.
for name in names[3:]:
    if name in real_value_set:
        # Standardise with statistics computed from this column only; pandas
        # skips NaN when computing them.
        column = data[name].astype(float)
        col_mean = column.mean()
        col_std = column.std(ddof=0)
        if not col_std > 0:
            # Constant or all-missing column: avoid dividing by zero / NaN.
            col_std = 1.0
        z_scores = ((column - col_mean) / col_std).clip(-OUTLIER_STD, OUTLIER_STD)
        # Shift to [0, 2*OUTLIER_STD] and truncate into unit-wide buckets with
        # ids 1..2*OUTLIER_STD+1 (the top id is reached only at the upper clip).
        # NaN is mapped to -1 first so that it ends up as id 0.
        data[name] = (z_scores + OUTLIER_STD).fillna(-1).astype(int) + 1
    else:
        # factorize marks missing values with -1, so adding 1 yields id 0 for
        # missing values and ids 1..k for the k distinct values.
        codes, _ = pd.factorize(data[name])
        data[name] = codes + 1

X = onehot_id.fit_transform(data[names[3:]])

['dim_is_requested', 'ds_night', 'ds', 'id_listing_anon', 'id_user_anon', 'm_effective_daily_price', 'm_pricing_cleaning_fee', 'dim_market', 'dim_lat', 'dim_lng', 'dim_room_type', 'dim_person_capacity', 'dim_is_instant_bookable', 'm_checkouts', 'm_reviews', 'days_since_last_booking', 'cancel_policy', 'image_quality_score', 'm_total_overall_rating', 'm_professional_pictures', 'dim_has_wireless_internet', 'ds_night_day_of_week', 'ds_night_day_of_year', 'ds_checkin_gap', 'ds_checkout_gap', 'occ_occupancy_plus_minus_7_ds_night', 'occ_occupancy_plus_minus_14_ds_night', 'occ_occupancy_trailing_90_ds', 'm_minimum_nights', 'm_maximum_nights', 'price_booked_most_recent', 'p2_p3_click_through_score', 'p3_inquiry_score', 'listing_m_listing_views_2_6_ds_night_decay', 'general_market_m_unique_searchers_0_6_ds_night', 'general_market_m_contacts_0_6_ds_night', 'general_market_m_reservation_requests_0_6_ds_night', 'general_market_m_is_booked_0_6_ds_night', 'm_available_listings_ds_night', 'kdt_score',



In [203]:
# Chronological split: rows whose night is before the cut-off go to training,
# rows on or after it go to testing. The column is compared against a string
# literal, so this presumably relies on ISO-formatted date strings.
split_date = '2015-12-01'
night = data['ds_night']
train_idx = np.flatnonzero((night < split_date).values)
test_idx = np.flatnonzero((night >= split_date).values)
print(train_idx.shape)
print(test_idx.shape)
# Select the same rows from the feature matrix and from the label series.
train_x, train_y = X[train_idx], y[train_idx]
test_x, test_y = X[test_idx], y[test_idx]

(168694,)
(15585,)


In [205]:
# Baseline model: L2-regularised logistic regression fitted with L-BFGS.
from sklearn.linear_model import LogisticRegression

# Hyperparameters are spelled out in full so the configuration is visible in one place.
lr_params = dict(
    C=1.0,
    class_weight=None,
    dual=False,
    fit_intercept=True,
    intercept_scaling=1,
    max_iter=1000,
    multi_class='ovr',
    n_jobs=1,
    penalty='l2',
    random_state=None,
    solver='lbfgs',
    tol=0.0001,
    verbose=0,
    warm_start=False,
)
model = LogisticRegression(**lr_params)
# fit() returns the estimator, so the fitted model is shown as the cell output.
model.fit(train_x, train_y)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=1000, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='lbfgs', tol=0.0001,
          verbose=0, warm_start=False)

In [207]:
# Evaluate the fitted model on the held-out rows.
from sklearn import metrics

# Accuracy is defined on hard class predictions; print it explicitly because
# only the last expression of a cell is displayed automatically.
y_pred = model.predict(test_x)
print('accuracy:', metrics.accuracy_score(test_y, y_pred))

# The ROC curve sweeps a decision threshold, so it needs a continuous score
# rather than 0/1 predictions. Column 1 of predict_proba is the probability of
# the second entry of model.classes_, i.e. the larger label value.
y_score = model.predict_proba(test_x)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(test_y, y_score)
metrics.auc(fpr, tpr)

In [210]:
from sklearn.neural_network import MLPClassifier

In [None]:
from sklearn.neural_network import MLPClassifier
# Try a multi-layer perceptron classifier.
# Training is slow on a personal computer, so expect this cell to take a long time.
# A fixed random_state makes the stochastic parts of training repeatable, so
# re-running the notebook reproduces the same fitted model.
model = MLPClassifier(random_state=0)
model.fit(train_x, train_y)