In [None]:
import numpy as np
import pandas as pd
from sklearn import svm
import random
from sklearn.model_selection import train_test_split as ts
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
import statistics

In [None]:
# Load the raw inputs from the working directory: the per-booking training
# rows, the training label file, and the per-booking test rows.
traindf = pd.read_csv('train.csv')
labeldf = pd.read_csv('train_label.csv')
testdf = pd.read_csv('test.csv')

In [None]:
def encoding(dataframe, train=True):
    """Turn a raw booking frame into a numeric feature matrix.

    The input is copied, so ``dataframe`` itself is never modified.

    Steps performed:
      * drop ``ID`` and ``arrival_date_year`` (and, when ``train`` is true,
        ``reservation_status`` / ``reservation_status_date``);
      * encode ``hotel`` as 1 for ``'City Hotel'`` and 0 otherwise;
      * collapse every ``country`` outside a fixed list into ``'Others'``;
      * one-hot encode the categorical columns. Columns with a known value
        list are encoded against that fixed list, so the resulting dummy
        columns do not depend on which values happen to occur in the data;
        values outside the list yield an all-zero row;
      * encode ``agent`` / ``company`` as 1 when a non-missing, truthy value is
        present and 0 otherwise;
      * replace remaining missing values with 0.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Raw rows. Must contain the columns referenced above; when ``train`` is
        true it must also contain ``adr`` and ``is_canceled``.
    train : bool, default True
        Whether the frame carries the outcome columns.

    Returns
    -------
    (features, adr, cancel) when ``train`` is true, where ``adr`` and
    ``cancel`` are the ``adr`` and ``is_canceled`` columns removed from the
    features; otherwise just ``features``.
    """
    months = ['January', 'February', 'March', 'April', 'May', 'June', 'July',
              'August', 'September', 'October', 'November', 'December']
    kept_countries = ['PRT', 'GBR', 'FRA', 'ESP', 'DEU', 'ITA', 'IRL', 'BRA',
                      'NLD', 'BEL', 'USA', 'CHE']
    room_types = list('ABCDEFGHIJKL')

    # (column, fixed category list or None) in the order the dummies are
    # appended to the frame. ``None`` means "use whatever values occur".
    # The country list is sorted so the columns come out in the same order
    # get_dummies would produce for a plain string column.
    dummy_plan = [
        ('arrival_date_month', months),
        ('meal', None),
        ('country', sorted(kept_countries + ['Others'])),
        ('market_segment', ['Online TA', 'Offline TA/TO', 'Groups', 'Direct',
                            'Corporate', 'Complementary', 'Aviation', 'Undefined']),
        ('distribution_channel', ['TA/TO', 'Direct', 'Corporate', 'GDS', 'Undefined']),
        ('reserved_room_type', room_types),
        ('assigned_room_type', room_types),
        ('deposit_type', None),
        ('customer_type', None),
    ]

    df = dataframe.copy()
    df = df.drop(columns=['ID', 'arrival_date_year'])
    if train:
        df = df.drop(columns=['reservation_status', 'reservation_status_date'])

    df['hotel'] = (df['hotel'] == 'City Hotel').astype(int)

    # Anything not in the kept list (including missing values) becomes 'Others'.
    df['country'] = df['country'].where(df['country'].isin(kept_countries), 'Others')

    # NaN is truthy in Python, so a bare ``1 if x else 0`` would flag missing
    # agents/companies as present; test for missingness explicitly.
    for column in ('agent', 'company'):
        df[column] = df[column].map(lambda x: 1 if pd.notna(x) and x else 0)

    for column, categories in dummy_plan:
        if categories is not None:
            # ``astype('category', categories=...)`` no longer exists in
            # pandas; pd.Categorical is the supported way to pin categories.
            df[column] = pd.Categorical(df[column], categories=categories)
        df = pd.get_dummies(df, columns=[column], dummy_na=False)

    # Outcome variables are split off so they cannot leak into the features.
    if train:
        adr = df['adr']
        cancel = df['is_canceled']
        df = df.drop(columns=['adr', 'is_canceled'])

    df = df.fillna(0)

    if train:
        return df, adr, cancel
    return df

In [None]:
def fromdatatolabel(dataset,labelset):
    """Convert per-booking predictions into one label per arrival date.

    For every date in ``labelset['arrival_date']`` (``'YYYY-MM-DD'`` strings)
    the bookings in ``dataset`` arriving on that date are selected, and

        (1 - is_canceled_pred) * (weekend nights + week nights) * adr_pred

    is summed over them. The sum is divided by 10000, floored, and clamped:
    values >= 10 become 9.0 and negative values become 0.0.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Needs ``arrival_date_year``, ``arrival_date_month`` (English month
        name), ``arrival_date_day_of_month``, ``stays_in_weekend_nights``,
        ``stays_in_week_nights``, ``adr_pred`` and ``is_canceled_pred``.
        It is only read, never modified.
    labelset : pandas.DataFrame
        Needs an ``arrival_date`` column. It is only read.

    Returns
    -------
    list
        One label per row of ``labelset``, in the same order. A date with no
        matching bookings yields 0.0.
    """
    mlist = 'January,February,March,April,May,June,July,August,September,October,November,December'.split(',')

    def boundary_const(x):
        # Clamp the floored value to the label range.
        if x >= 10:
            return 9.0
        if x < 0:
            return 0.0
        return x

    # The per-booking value does not depend on the date being processed, so it
    # is computed once here. This also avoids writing a new column onto a
    # filtered slice, which raises SettingWithCopyWarning.
    nights = dataset['stays_in_weekend_nights'] + dataset['stays_in_week_nights']
    revenue = (1 - dataset['is_canceled_pred']) * nights * dataset['adr_pred']

    years = dataset['arrival_date_year']
    months = dataset['arrival_date_month']
    days = dataset['arrival_date_day_of_month']

    predictoutcome = []
    for date_str in labelset['arrival_date']:
        y, m, d = date_str.split('-')
        on_date = (years == int(y)) & (months == mlist[int(m) - 1]) & (days == int(d))
        predictoutcome.append(boundary_const(np.floor(revenue[on_date].sum() / 10000)))
    return predictoutcome

In [None]:
# Encode the training rows into features plus the two targets (adr, cancel).
df, adr, cancel = encoding(traindf)

# Encode the test rows, then force exactly the training columns in the training
# order. Some one-hot columns depend on which values occur in each file, so
# without this the two matrices can disagree and predict() would fail or
# misalign features. Columns missing from the test file are filled with 0;
# columns absent from training are dropped.
df_test = encoding(testdf, False).reindex(columns=df.columns, fill_value=0)

In [None]:
# Cancellation classifier.
# max_features="auto" has been removed from scikit-learn; for a classifier it
# meant "sqrt", so "sqrt" is spelled out to keep the same behavior.
model_rf = RandomForestClassifier(
    n_estimators=100,
    oob_score=True,
    n_jobs=-1,
    random_state=50,
    max_features="sqrt",
    max_depth=30,
)
model_rf.fit(df, cancel)
# NOTE: this is accuracy on the training data itself, so it is optimistic;
# model_rf.oob_score_ (enabled above) is the out-of-bag estimate.
model_rf.score(df, cancel)

In [None]:
# ADR regressor. The seed is fixed (same value as the classifier cell) so a
# Restart & Run All reproduces the same forest; n_jobs=-1 only parallelizes
# training and does not change the fitted model.
model_rfreg = RandomForestRegressor(n_estimators=200, n_jobs=-1, random_state=50)
model_rfreg.fit(df, adr)
# NOTE: R^2 on the training data itself, so it is an optimistic figure.
model_rfreg.score(df, adr)

In [None]:
# Predict both targets for the encoded test rows and attach them to the raw
# test frame, where the per-date label computation reads them.
ypred_rf = model_rf.predict(df_test)
ypred_rfreg = model_rfreg.predict(df_test)
testdf = testdf.assign(adr_pred=ypred_rfreg, is_canceled_pred=ypred_rf)

In [None]:
# Read the dates that need a label, compute one label per date from the
# predictions stored on testdf, and display the resulting table.
labeldf_test = pd.read_csv('test_nolabel.csv')
labeldf_test = labeldf_test.assign(label=fromdatatolabel(testdf, labeldf_test))
labeldf_test