# Results
**___**
## Preprocessing.

In [205]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import PolynomialFeatures

df = pd.read_csv("train.csv")
df_label = pd.read_csv("train_label.csv")
df_test = pd.read_csv('test.csv')

# print(df.loc[0])

df_train = df.drop(columns=['is_canceled', 'adr', 'reservation_status', 'reservation_status_date'])
df_hidden = df[['is_canceled', 'adr']]

df_cat = pd.concat([df_train, df_test], ignore_index = True)
# print(df_cat.loc[112600])

In [206]:
# format date
sort_dict_m = {'January':'01', 'February':'02', 'March':'03', 'April':'04', 'May':'05', 'June':'06',
               'July':'07', 'August':'08', 'September':'09', 'October':'10', 'November':'11', 'December':'12'}
sort_dict_d = {1:'01', 2:'02', 3:'03', 4:'04', 5:'05', 6:'06', 7:'07', 8:'08', 9:'09'}
df_cat = df_cat.replace({'arrival_date_month':sort_dict_m})
df_cat = df_cat.replace({'arrival_date_day_of_month':sort_dict_d})
df_cat = df_cat.assign(arrival_date = lambda x: x["arrival_date_year"].astype(str)+'-'+x["arrival_date_month"]+'-'+x["arrival_date_day_of_month"].astype(str))
# print(df_cat.loc[112600])

In [207]:
df_encoded = pd.get_dummies(df_cat, columns=["hotel", "arrival_date_month", "meal", "country", "market_segment",
                                             "distribution_channel", "reserved_room_type", "assigned_room_type",
                                             "deposit_type", "customer_type", "agent", "company"], dummy_na = True)
# append 0 to missing values
idx = df_encoded[df_encoded['children'].isnull()].index.tolist()
df_encoded["children"][idx] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [208]:
df_train = df_encoded.iloc[:91531, :].reset_index(drop=True)
df_train = pd.concat([df_train, df_hidden], join = 'outer', axis = 1)
df_test = df_encoded.iloc[91531:, :].reset_index(drop=True)

print(df_train.info())

# calculating revenue per request with real is_cancelled and adr
df_train = df_train.assign(rev = lambda x: (x.is_canceled==0)*x.adr*(x.stays_in_weekend_nights+x.stays_in_week_nights))
print(df_train.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91531 entries, 0 to 91530
Columns: 955 entries, ID to adr
dtypes: float64(2), int64(16), object(2), uint8(935)
memory usage: 95.6+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91531 entries, 0 to 91530
Columns: 956 entries, ID to rev
dtypes: float64(3), int64(16), object(2), uint8(935)
memory usage: 96.3+ MB
None


___
## 5-fold Validation

In [None]:
#iscadr = df[['is_canceled', 'adr']]
#df_x = df_train.drop(columns=['ID', 'arrival_date', 'arrival_date_year', 'arrival_date_week_number', 'arrival_date_day_of_month'])

adr = df_train['adr']
isc = df_train['is_canceled']
df_x = df_train.drop(columns = ['ID', 'arrival_date', 'arrival_date_year', 'arrival_date_week_number', 
                                'arrival_date_day_of_month', 'is_canceled', 'adr', 'rev'])

# NAMES
# --------------------------------------------------
# df       : original training data
# df_train : dummy-coded training data with only columns that the test data also contains
# df_x     : df_train without columns containing separate date information (except for 'arrival_date_month')
#           (Train the models with this!)
# df_y     : labels of training data (rank column)
# df_test  : testing data
# df_x_test: testing data drop irrelevant variable
# iscadr   : is_canceled and adr columns (training data)
# rev      : revenue column (calculated with training data)

In [121]:
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
kf = KFold(n_splits=5, shuffle=True, random_state=0)    

### Logistic & Linear

In [122]:
count = 0
Ein_lr = [0,0,0,0,0]
Eval_lr = [0,0,0,0,0]
for train_index, val_index in kf.split(df_x):
    '''
    if count > 0: 
        break
    else: 
        count = count+1
    ''' 
    X_train = df_x.iloc[train_index, :].reset_index(drop=True)
    X_val = df_x.iloc[val_index, :].reset_index(drop=True)
    ADR_train =  adr.iloc[train_index].reset_index(drop=True)
    ADR_val = adr.iloc[val_index].reset_index(drop=True)
    ISC_train = isc.iloc[train_index].reset_index(drop=True)
    ISC_val = isc.iloc[val_index].reset_index(drop=True)
    
    # logistic isc
    lgc_isc = LogisticRegression(random_state=0).fit(X_train, ISC_train) 
    lgc_isc_ptrain = lgc_isc.predict(X_train) 
    lgc_isc_pval = lgc_isc.predict(X_val)
    print(lgc_isc.score(X_train, ISC_train))  
    
    # linear adr
    lr_adr = LinearRegression(normalize=True).fit(X_train, ADR_train)
    lr_adr_ptrain = lr_adr.predict(X_train) 
    lr_adr_pval = lr_adr.predict(X_val)
    print(lr_adr.score(X_train, ADR_train))  
    
    # concatenate
    DF_train = df_train.iloc[train_index, :].reset_index(drop=True)
    lgc_isc_ptrain = pd.DataFrame({'isc_pred': lgc_isc_ptrain})
    lr_adr_ptrain = pd.DataFrame({'adr_pred': lr_adr_ptrain})
    DF_train = pd.concat([DF_train, lgc_isc_ptrain, lr_adr_ptrain], join='outer', axis=1)
    
    DF_val = df_train.iloc[val_index, :].reset_index(drop=True)
    lgc_isc_pval = pd.DataFrame({'isc_pred': lgc_isc_pval})
    lr_adr_pval = pd.DataFrame({'adr_pred': lr_adr_pval})
    DF_val = pd.concat([DF_val, lgc_isc_pval, lr_adr_pval], join='outer', axis=1)
    
    # calculate rev per request for both training and validation set
    DF_train = DF_train.assign(rev_pred = lambda x: (x.isc_pred==0)*x.adr_pred*(x.stays_in_weekend_nights+x.stays_in_week_nights))
    #print(DF_train)
    DF_val = DF_val.assign(rev_pred = lambda x: (x.isc_pred==0)*x.adr_pred*(x.stays_in_weekend_nights+x.stays_in_week_nights))
    #print(DF_val)
    
    # calculate the daily revenue with predicted data
    DF_rev_train = DF_train[['arrival_date', 'rev_pred']]
    DF_rev_train = DF_rev_train.groupby(['arrival_date']).sum()
    DF_rev_val = DF_val[['arrival_date', 'rev_pred']]
    DF_rev_val = DF_rev_val.groupby(['arrival_date']).sum() 
    
    # calculate the daily revenue with real data
    df_rev_t = DF_train[['arrival_date', 'rev']]
    df_rev_t = df_rev_t.groupby(['arrival_date']).sum()
    df_rev_v = DF_val[['arrival_date', 'rev']]
    df_rev_v = df_rev_v.groupby(['arrival_date']).sum() 
    
    print('###############')
    #print(df_rev_t)
    print(df_rev_t[df_rev_t['rev'] < 10000])
    #print(DF_rev_train)
    print(DF_rev_train[DF_rev_train['rev_pred'] < 10000])
    print('###############') 
    #print(df_rev_v)
    print(df_rev_v[df_rev_v['rev'] < 10000])    
    #print(DF_rev_val)
    print(DF_rev_val[DF_rev_val['rev_pred'] < 10000])
    print('###############')
    
    # calculate MAE, the former param has to be the real data, the latter has to be the predicted data
    Ein = mean_absolute_error(df_rev_t, DF_rev_train)
    Ein_lr[count] = Ein
    print("Ein_MAE =", Ein)
    print('___')
    
    Eval = mean_absolute_error(df_rev_v, DF_rev_val)
    Eval_lr[count] = Eval
    print("Eval_MAE =", Eval)
    print('___')
    
    count = count+1
    
print(Ein_lr)
print(Eval_lr)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.8292499726865509
0.5492322324739733
###############
                      rev
arrival_date             
2015-07-07    9205.679721
2015-07-09    8602.786916
2015-07-15    9748.027040
2015-07-22    8842.248582
2015-07-28    6802.096679
...                   ...
2017-02-03    5895.863982
2017-02-07    7480.872820
2017-02-08    9285.059952
2017-02-21    9547.734715
2017-02-28    9128.794114

[129 rows x 1 columns]
               rev_pred
arrival_date           
2015-07-08    8857.3125
2015-07-09    8372.4375
2015-07-28    6872.3750
2015-10-27    8067.5625
2015-10-30    9228.1875
...                 ...
2017-02-01    8643.1875
2017-02-07    8828.1875
2017-02-08    7178.6250
2017-02-21    8801.2500
2017-02-28    7336.1250

[104 rows x 1 columns]
###############
                      rev
arrival_date             
2015-07-01    4246.392048
2015-07-02    3407.434578
2015-07-03    2361.628679
2015-07-04    3736.522908
2015-07-05    4896.395662
...                   ...
2017-03-27    6254.65551

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.8311369067941277
0.6422926640764199
###############
                      rev
arrival_date             
2015-07-07    8783.500218
2015-07-09    7953.542650
2015-07-22    8573.418233
2015-07-28    9772.290627
2015-09-29    9708.353743
...                   ...
2017-02-03    4616.361655
2017-02-07    7876.551651
2017-02-08    9827.943573
2017-02-28    8981.886300
2017-03-04    9276.752926

[125 rows x 1 columns]
               rev_pred
arrival_date           
2015-07-08    9509.2500
2015-07-09    7961.8125
2015-07-14    9886.6875
2015-07-28    9849.8125
2015-10-27    8594.8750
...                 ...
2017-01-31    4911.7500
2017-02-01    9191.3125
2017-02-07    8686.6875
2017-02-08    7442.5000
2017-02-28    7503.0625

[108 rows x 1 columns]
###############
                      rev
arrival_date             
2015-07-01    3764.615724
2015-07-02    3232.057456
2015-07-03    1883.684632
2015-07-04    2566.178710
2015-07-05    3397.137355
...                   ...
2017-03-27    4079.72462

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.8231341754865141
0.5524161194747544
###############
                      rev
arrival_date             
2015-07-07    8490.003071
2015-07-08    9937.948207
2015-07-09    8091.460173
2015-07-28    8072.780620
2015-10-14    6395.936093
...                   ...
2017-02-07    7227.128706
2017-02-08    7458.374651
2017-02-28    8703.896534
2017-03-01    5713.244539
2017-03-22    8800.643840

[129 rows x 1 columns]
                rev_pred
arrival_date            
2015-07-08    9139.62500
2015-07-09    7621.21875
2015-07-21    9671.62500
2015-07-28    8720.75000
2015-10-14    8105.03125
...                  ...
2017-01-24    7102.50000
2017-01-31    6595.81250
2017-02-01    9362.78125
2017-02-08    6742.31250
2017-02-28    7702.62500

[105 rows x 1 columns]
###############
                      rev
arrival_date             
2015-07-01    3132.701938
2015-07-02    1540.866480
2015-07-03    2686.399628
2015-07-04    4368.436953
2015-07-05    4422.216631
...                   ...
2017-03-27 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.8256469784909526
0.5493256416470023
###############
                      rev
arrival_date             
2015-07-07    8726.035139
2015-07-08    7745.132583
2015-07-09    7685.164286
2015-07-22    9914.020349
2015-07-28    8516.444300
...                   ...
2017-02-01    6097.499498
2017-02-07    8535.165519
2017-02-21    8990.879077
2017-02-28    8449.819542
2017-03-21    8570.643273

[124 rows x 1 columns]
               rev_pred
arrival_date           
2015-07-08    7437.1875
2015-07-09    6181.9375
2015-07-28    7885.0625
2015-10-17    9387.6875
2015-10-27    8000.0625
...                 ...
2017-01-24    8346.5000
2017-01-31    6930.6875
2017-02-01    9481.2500
2017-02-08    8638.3125
2017-02-28    7165.5625

[103 rows x 1 columns]
###############
                      rev
arrival_date             
2015-07-01    4137.623775
2015-07-02    5155.725454
2015-07-03    2548.549001
2015-07-04    3290.062661
2015-07-05    2978.315476
...                   ...
2017-03-27    5576.41566

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.8251963127347217
0.5502583922279506
###############
                      rev
arrival_date             
2015-07-03    9480.261941
2015-07-07    8518.056409
2015-07-09    8764.366300
2015-07-15    9916.376582
2015-07-22    9572.865958
...                   ...
2017-02-01    5355.711924
2017-02-03    5003.724605
2017-02-07    6023.078748
2017-02-08    8534.901981
2017-02-28    9065.007075

[131 rows x 1 columns]
                rev_pred
arrival_date            
2015-07-08    9714.71875
2015-07-09    8116.87500
2015-07-28    8540.96875
2015-10-17    8129.81250
2015-10-27    6592.50000
...                  ...
2017-01-25    8339.93750
2017-01-31    5883.37500
2017-02-07    8538.68750
2017-02-08    6521.87500
2017-02-28    8312.43750

[108 rows x 1 columns]
###############
                      rev
arrival_date             
2015-07-01    5029.853136
2015-07-02    3194.561309
2015-07-03    3486.452223
2015-07-04    3519.453025
2015-07-05    3897.393354
...                   ...
2017-03-27 

In [123]:
print(DF_val[['arrival_date', 'adr', 'adr_pred', 'rev', 'rev_pred']][DF_val['adr_pred'] < -1000])

      arrival_date         adr      adr_pred         rev      rev_pred
1622    2015-09-10   84.480566 -2.682757e+16  337.922265 -1.073103e+17
1860    2015-09-17  104.332818 -1.395514e+16  312.998453 -4.186542e+16
2202    2015-09-26  204.178104 -2.123999e+16  816.712416 -8.495994e+16
3533    2015-11-15   48.753572 -1.157446e+16  146.260717 -3.472338e+16
3689    2015-11-23   79.958118 -1.924764e+15  159.916235 -3.849529e+15
6571    2016-03-30   87.247953 -4.335525e+15  174.495905 -8.671051e+15
6750    2016-04-05   38.426739 -3.271335e+16   38.426739 -3.271335e+16
6756    2016-04-05   36.819389 -3.271335e+16   36.819389 -3.271335e+16
6820    2016-04-07   22.806856 -1.331289e+16   45.613711 -2.662579e+16
7667    2016-04-29   59.462565 -3.700942e+16  118.925130 -7.401884e+16
9823    2016-06-29  108.794752 -3.486069e+16  326.384257 -1.045821e+17
10240   2016-07-15  141.123394 -9.939418e+15  282.246789 -1.987884e+16
11188   2016-08-13   98.123011 -2.885998e+15  196.246022 -5.771997e+15
12189 

### Decision Tree

In [124]:
count = 0
Ein_dt = [0,0,0,0,0]
Eval_dt = [0,0,0,0,0]
for train_index, val_index in kf.split(df_x):
    '''
    if count > 0: 
        break
    else: 
        count = count+1
    '''
    X_train = df_x.iloc[train_index, :].reset_index(drop=True)
    X_val = df_x.iloc[val_index, :].reset_index(drop=True)
    ADR_train =  adr.iloc[train_index].reset_index(drop=True)
    ADR_val = adr.iloc[val_index].reset_index(drop=True)
    ISC_train = isc.iloc[train_index].reset_index(drop=True)
    ISC_val = isc.iloc[val_index].reset_index(drop=True)
    
    # Decision Tree isc
    dt_isc = DecisionTreeClassifier(random_state=0).fit(X_train, ISC_train)
    dt_isc_ptrain = dt_isc.predict(X_train) 
    dt_isc_pval = dt_isc.predict(X_val)    
    print(dt_isc.score(X_train, ISC_train)) 
    
    # Decision Tree adr
    dt_adr = DecisionTreeRegressor(random_state=0).fit(X_train, ADR_train)
    dt_adr_ptrain = dt_adr.predict(X_train) 
    dt_adr_pval = dt_adr.predict(X_val)  
    print(dt_adr.score(X_train, ADR_train))  
    
    # concatenate
    DF_train = df_train.iloc[train_index, :].reset_index(drop=True)
    dt_isc_ptrain = pd.DataFrame({'isc_pred': dt_isc_ptrain})
    dt_adr_ptrain = pd.DataFrame({'adr_pred': dt_adr_ptrain})
    DF_train = pd.concat([DF_train, dt_isc_ptrain, dt_adr_ptrain], join='outer', axis=1)
    
    DF_val = df_train.iloc[val_index, :].reset_index(drop=True)
    dt_isc_pval = pd.DataFrame({'isc_pred': dt_isc_pval})
    dt_adr_pval = pd.DataFrame({'adr_pred': dt_adr_pval})
    DF_val = pd.concat([DF_val, dt_isc_pval, dt_adr_pval], join='outer', axis=1)
    
    # calculate rev per request for both training and validation set
    DF_train = DF_train.assign(rev_pred = lambda x: (x.isc_pred==0)*x.adr_pred*(x.stays_in_weekend_nights+x.stays_in_week_nights))
    #print(DF_train)
    DF_val = DF_val.assign(rev_pred = lambda x: (x.isc_pred==0)*x.adr_pred*(x.stays_in_weekend_nights+x.stays_in_week_nights))
    #print(DF_val)
    
    # calculate the daily revenue with predicted data
    DF_rev_train = DF_train[['arrival_date', 'rev_pred']]
    DF_rev_train = DF_rev_train.groupby(['arrival_date']).sum()
    DF_rev_val = DF_val[['arrival_date', 'rev_pred']]
    DF_rev_val = DF_rev_val.groupby(['arrival_date']).sum() 
    
    # calculate the daily revenue with real data
    df_rev_t = DF_train[['arrival_date', 'rev']]
    df_rev_t = df_rev_t.groupby(['arrival_date']).sum()
    df_rev_v = DF_val[['arrival_date', 'rev']]
    df_rev_v = df_rev_v.groupby(['arrival_date']).sum() 
    
    print('###############')
    #print(df_rev_t)
    print(df_rev_t[df_rev_t['rev'] < 10000])
    #print(DF_rev_train)
    print(DF_rev_train[DF_rev_train['rev_pred'] < 10000])
    print('###############') 
    #print(df_rev_v)
    print(df_rev_v[df_rev_v['rev'] < 10000])    
    #print(DF_rev_val)
    print(DF_rev_val[DF_rev_val['rev_pred'] < 10000])
    print('###############')
    
    # calculate MAE, the former param has to be the real data, the latter has to be the predicted data
    Ein = mean_absolute_error(df_rev_t, DF_rev_train)
    Ein_dt[count] = Ein
    print("Ein_MAE =", Ein)
    print('___')
    
    Eval = mean_absolute_error(df_rev_v, DF_rev_val)
    Eval_dt[count] = Eval
    print("Eval_MAE =", Eval)
    print('___')
    
    count = count+1
    
print(Ein_dt)
print(Eval_dt)

0.9937315634218289
0.9826856797309558
###############
                      rev
arrival_date             
2015-07-07    9205.679721
2015-07-09    8602.786916
2015-07-15    9748.027040
2015-07-22    8842.248582
2015-07-28    6802.096679
...                   ...
2017-02-03    5895.863982
2017-02-07    7480.872820
2017-02-08    9285.059952
2017-02-21    9547.734715
2017-02-28    9128.794114

[129 rows x 1 columns]
                 rev_pred
arrival_date             
2015-07-07    9205.679721
2015-07-09    8602.786916
2015-07-15    9748.027040
2015-07-22    9416.324167
2015-07-28    6802.096679
...                   ...
2017-02-03    6186.461343
2017-02-07    7500.995621
2017-02-08    9301.785345
2017-02-21    9527.299665
2017-02-28    9197.824688

[129 rows x 1 columns]
###############
                      rev
arrival_date             
2015-07-01    4246.392048
2015-07-02    3407.434578
2015-07-03    2361.628679
2015-07-04    3736.522908
2015-07-05    4896.395662
...                   ..

### Adaboost

In [125]:
count = 0
Ein_ada = [0,0,0,0,0]
Eval_ada = [0,0,0,0,0]
for train_index, val_index in kf.split(df_x):
    '''
    if count > 0: 
        break
    else: 
        count = count+1
    '''
    X_train = df_x.iloc[train_index, :].reset_index(drop=True)
    X_val = df_x.iloc[val_index, :].reset_index(drop=True)
    ADR_train =  adr.iloc[train_index].reset_index(drop=True)
    ADR_val = adr.iloc[val_index].reset_index(drop=True)
    ISC_train = isc.iloc[train_index].reset_index(drop=True)
    ISC_val = isc.iloc[val_index].reset_index(drop=True)
    
    # Adaboost isc
    ada_isc = AdaBoostClassifier(DecisionTreeClassifier(max_depth=100), random_state=0, n_estimators=50).fit(X_train, ISC_train)
    ada_isc_ptrain = ada_isc.predict(X_train) 
    ada_isc_pval = ada_isc.predict(X_val) 
    print(ada_isc.score(X_train, ISC_train))
    
    # Adaboost adr
    ada_adr = AdaBoostRegressor(DecisionTreeRegressor(max_depth=100), random_state=0, n_estimators=50).fit(X_train, ADR_train)
    ada_adr_ptrain = ada_adr.predict(X_train) 
    ada_adr_pval = ada_adr.predict(X_val) 
    print(ada_adr.score(X_train, ADR_train))  
    
    # concatenate
    DF_train = df_train.iloc[train_index, :].reset_index(drop=True)
    ada_isc_ptrain = pd.DataFrame({'isc_pred': ada_isc_ptrain})
    ada_adr_ptrain = pd.DataFrame({'adr_pred': ada_adr_ptrain})
    DF_train = pd.concat([DF_train, ada_isc_ptrain, ada_adr_ptrain], join='outer', axis=1)
    
    DF_val = df_train.iloc[val_index, :].reset_index(drop=True)
    ada_isc_pval = pd.DataFrame({'isc_pred': ada_isc_pval})
    ada_adr_pval = pd.DataFrame({'adr_pred': ada_adr_pval})
    DF_val = pd.concat([DF_val, ada_isc_pval, ada_adr_pval], join='outer', axis=1)
    
    # calculate rev per request for both training and validation set
    DF_train = DF_train.assign(rev_pred = lambda x: (x.isc_pred==0)*x.adr_pred*(x.stays_in_weekend_nights+x.stays_in_week_nights))
    #print(DF_train)
    DF_val = DF_val.assign(rev_pred = lambda x: (x.isc_pred==0)*x.adr_pred*(x.stays_in_weekend_nights+x.stays_in_week_nights))
    #print(DF_val)
    
    # calculate the daily revenue with predicted data
    DF_rev_train = DF_train[['arrival_date', 'rev_pred']]
    DF_rev_train = DF_rev_train.groupby(['arrival_date']).sum()
    DF_rev_val = DF_val[['arrival_date', 'rev_pred']]
    DF_rev_val = DF_rev_val.groupby(['arrival_date']).sum() 
    
    # calculate the daily revenue with real data
    df_rev_t = DF_train[['arrival_date', 'rev']]
    df_rev_t = df_rev_t.groupby(['arrival_date']).sum()
    df_rev_v = DF_val[['arrival_date', 'rev']]
    df_rev_v = df_rev_v.groupby(['arrival_date']).sum() 
    
    print('###############')
    #print(df_rev_t)
    print(df_rev_t[df_rev_t['rev'] < 10000])
    #print(DF_rev_train)
    print(DF_rev_train[DF_rev_train['rev_pred'] < 10000])
    print('###############') 
    #print(df_rev_v)
    print(df_rev_v[df_rev_v['rev'] < 10000])    
    #print(DF_rev_val)
    print(DF_rev_val[DF_rev_val['rev_pred'] < 10000])
    print('###############')
    
    # calculate MAE, the former param has to be the real data, the latter has to be the predicted data
    Ein = mean_absolute_error(df_rev_t, DF_rev_train)
    Ein_ada[count] = Ein
    print("Ein_MAE =", Ein)
    print('___')
    
    Eval = mean_absolute_error(df_rev_v, DF_rev_val)
    Eval_ada[count] = Eval
    print("Eval_MAE =", Eval)
    print('___')
    
    count = count+1
    
print(Ein_ada)
print(Eval_ada)

0.9937315634218289
0.9706172748694991
###############
                      rev
arrival_date             
2015-07-07    9205.679721
2015-07-09    8602.786916
2015-07-15    9748.027040
2015-07-22    8842.248582
2015-07-28    6802.096679
...                   ...
2017-02-03    5895.863982
2017-02-07    7480.872820
2017-02-08    9285.059952
2017-02-21    9547.734715
2017-02-28    9128.794114

[129 rows x 1 columns]
                 rev_pred
arrival_date             
2015-07-07    9249.540635
2015-07-09    8661.721509
2015-07-15    9669.587026
2015-07-22    9435.603494
2015-07-28    6909.215496
...                   ...
2017-02-03    6377.621186
2017-02-07    7966.755179
2017-02-08    9455.643978
2017-02-21    9826.789602
2017-02-28    9936.830581

[121 rows x 1 columns]
###############
                      rev
arrival_date             
2015-07-01    4246.392048
2015-07-02    3407.434578
2015-07-03    2361.628679
2015-07-04    3736.522908
2015-07-05    4896.395662
...                   ..

### Random Forest

In [126]:
count = 0
Ein_rf = [0,0,0,0,0]
Eval_rf = [0,0,0,0,0]
for train_index, val_index in kf.split(df_x):
    '''
    if count > 0: 
        break
    else: 
        count = count+1
    '''
    X_train = df_x.iloc[train_index, :].reset_index(drop=True)
    X_val = df_x.iloc[val_index, :].reset_index(drop=True)
    ADR_train =  adr.iloc[train_index].reset_index(drop=True)
    ADR_val = adr.iloc[val_index].reset_index(drop=True)
    ISC_train = isc.iloc[train_index].reset_index(drop=True)
    ISC_val = isc.iloc[val_index].reset_index(drop=True)
    
    # Random Forest isc
    rf_isc = RandomForestClassifier(bootstrap=True, oob_score=True, random_state=0).fit(X_train, ISC_train)
    rf_isc_ptrain = rf_isc.predict(X_train) 
    rf_isc_pval = rf_isc.predict(X_val)    
    print(rf_isc.score(X_train, ISC_train)) 

    # Random Forest adr
    rf_adr = RandomForestRegressor (bootstrap=True, oob_score=True, random_state=0).fit(X_train, ADR_train)
    rf_adr_ptrain = rf_adr.predict(X_train) 
    rf_adr_pval = rf_adr.predict(X_val)  
    print(rf_adr.score(X_train, ADR_train)) 
    
    # concatenate
    DF_train = df_train.iloc[train_index, :].reset_index(drop=True)
    rf_isc_ptrain = pd.DataFrame({'isc_pred': rf_isc_ptrain})
    rf_adr_ptrain = pd.DataFrame({'adr_pred': rf_adr_ptrain})
    DF_train = pd.concat([DF_train, rf_isc_ptrain, rf_adr_ptrain], join='outer', axis=1)
    
    DF_val = df_train.iloc[val_index, :].reset_index(drop=True)
    rf_isc_pval = pd.DataFrame({'isc_pred': rf_isc_pval})
    rf_adr_pval = pd.DataFrame({'adr_pred': rf_adr_pval})
    DF_val = pd.concat([DF_val, rf_isc_pval, rf_adr_pval], join='outer', axis=1)
    
    # calculate rev per request for both training and validation set
    DF_train = DF_train.assign(rev_pred = lambda x: (x.isc_pred==0)*x.adr_pred*(x.stays_in_weekend_nights+x.stays_in_week_nights))
    #print(DF_train)
    DF_val = DF_val.assign(rev_pred = lambda x: (x.isc_pred==0)*x.adr_pred*(x.stays_in_weekend_nights+x.stays_in_week_nights))
    #print(DF_val)
    
    # calculate the daily revenue with predicted data
    DF_rev_train = DF_train[['arrival_date', 'rev_pred']]
    DF_rev_train = DF_rev_train.groupby(['arrival_date']).sum()
    DF_rev_val = DF_val[['arrival_date', 'rev_pred']]
    DF_rev_val = DF_rev_val.groupby(['arrival_date']).sum() 
    
    # calculate the daily revenue with real data
    df_rev_t = DF_train[['arrival_date', 'rev']]
    df_rev_t = df_rev_t.groupby(['arrival_date']).sum()
    df_rev_v = DF_val[['arrival_date', 'rev']]
    df_rev_v = df_rev_v.groupby(['arrival_date']).sum() 
    
    print('###############')
    #print(df_rev_t)
    print(df_rev_t[df_rev_t['rev'] < 10000])
    #print(DF_rev_train)
    print(DF_rev_train[DF_rev_train['rev_pred'] < 10000])
    print('###############') 
    #print(df_rev_v)
    print(df_rev_v[df_rev_v['rev'] < 10000])    
    #print(DF_rev_val)
    print(DF_rev_val[DF_rev_val['rev_pred'] < 10000])
    print('###############')
    
    # calculate MAE, the former param has to be the real data, the latter has to be the predicted data
    Ein = mean_absolute_error(df_rev_t, DF_rev_train)
    Ein_rf[count] = Ein
    print("Ein_MAE =", Ein)
    print('___')
    
    Eval = mean_absolute_error(df_rev_v, DF_rev_val)
    Eval_rf[count] = Eval
    print("Eval_MAE =", Eval)
    print('___')
    
    count = count+1
    
print(Ein_rf)
print(Eval_rf)

0.9937315634218289
0.9459301070711248
###############
                      rev
arrival_date             
2015-07-07    9205.679721
2015-07-09    8602.786916
2015-07-15    9748.027040
2015-07-22    8842.248582
2015-07-28    6802.096679
...                   ...
2017-02-03    5895.863982
2017-02-07    7480.872820
2017-02-08    9285.059952
2017-02-21    9547.734715
2017-02-28    9128.794114

[129 rows x 1 columns]
                 rev_pred
arrival_date             
2015-07-09    9056.758351
2015-07-15    9743.000073
2015-07-22    9886.525773
2015-07-28    7122.103259
2015-10-27    6280.694678
...                   ...
2017-02-03    8291.589105
2017-02-07    7690.923909
2017-02-08    9184.681592
2017-02-21    9936.936767
2017-02-28    9247.168742

[123 rows x 1 columns]
###############
                      rev
arrival_date             
2015-07-01    4246.392048
2015-07-02    3407.434578
2015-07-03    2361.628679
2015-07-04    3736.522908
2015-07-05    4896.395662
...                   ..

## Choose the best model

In [213]:
print('Ein_MAE_Linear =', Ein_lr)
print('Eval_MAE_Linear =', Eval_lr)

print('Ein_MAE_DecisionTree =', Ein_dt)
print('Eval_MAE_DecisionTree =', Eval_dt)

print('Ein_MAE_Adaboost =', Ein_ada)
print('Eval_MAE_Adaboost =', Eval_ada)

print('Ein_MAE_RandomForest =', Ein_rf)
print('Eval_MAE_RandomForest =', Eval_rf)

print(np.mean(Eval_lr))
print(np.mean(Eval_dt))
print(np.mean(Eval_ada))
print(np.mean(Eval_rf))

print(np.mean(Ein_lr))
print(np.mean(Ein_dt))
print(np.mean(Ein_ada))
print(np.mean(Ein_rf))

Ein_MAE_Linear = [3453.0081035124617, 3144.575776376567, 3430.373233326233, 3411.3427006987463, 3266.0072241278426]
Eval_MAE_Linear = [2749846250775009.5, 4224517107318028.0, 3097540521623283.0, 1694671122818891.5, 3220932658045285.0]
Ein_MAE_DecisionTree = [124.0202963149259, 113.85313849638592, 118.31756753171953, 116.1263604174923, 113.67057828447619]
Eval_MAE_DecisionTree = [848.5865297403019, 785.2475794011503, 791.8951348067537, 815.4456982456438, 728.5854836070142]
Ein_MAE_Adaboost = [396.04540202922385, 398.4494123646542, 494.5985812918945, 430.036024598759, 474.7207386958591]
Eval_MAE_Adaboost = [783.2470916786936, 684.0899261594501, 750.5998609210211, 770.4297658809252, 712.1136926462356]
Ein_MAE_RandomForest = [503.30897816141476, 505.5146194772411, 503.87464240634273, 496.7877515233557, 509.39945147350255]
Eval_MAE_RandomForest = [741.4348152372135, 663.4273464410204, 736.5991536024861, 730.0057857546253, 682.6792805313629]
2997501532116099.0
793.9520851601729
740.096067457

#### Hence, the best model selected by 5-fold validation is RandomForest

#### performance (measured by mae): 
RandomForest > Adaboost (max_depth = 100, i.e. initialized with no depth limit) > DecisionTree > LinearRegression

## Train model with the best hypothesis using all training data (RandomForest)

In [129]:
# use random forest classification to predict is_cancelled
rf_isc = RandomForestClassifier(bootstrap=True, oob_score=True, random_state=0).fit(df_x, isc)
print(rf_isc.predict(df_x))
print(rf_isc.score(df_x, isc))

# use random forest regression to predict adr
rf_adr = RandomForestRegressor (bootstrap=True, oob_score=True, random_state=0).fit(df_x, adr)
print(rf_adr.predict(df_x))
print(rf_adr.score(df_x, adr))

[0 0 0 ... 0 0 0]
0.9934885448645814
[23.30676378 57.3375625  76.02246361 ... 81.4042944  41.39682543
 49.8101906 ]
0.9503485612929312


In [183]:
# predict isc and adr for training set and test set
df_x_test = df_test.drop(columns = ['ID', 'arrival_date', 'arrival_date_year', 
                                    'arrival_date_week_number', 'arrival_date_day_of_month'])
#print(df_x)
#print(df_x_test)

rf_adr_ptrain = rf_adr.predict(df_x) 
rf_isc_ptrain = rf_isc.predict(df_x) 
rf_isc_ptest = rf_isc.predict(df_x_test)
rf_adr_ptest = rf_adr.predict(df_x_test)

# concatenate
rf_isc_ptrain = pd.DataFrame({'isc_pred': rf_isc_ptrain})
rf_adr_ptrain = pd.DataFrame({'adr_pred': rf_adr_ptrain})
df_train = pd.concat([df_train, rf_isc_ptrain, rf_adr_ptrain], join='outer', axis=1)

rf_isc_ptest = pd.DataFrame({'isc_pred': rf_isc_ptest})
rf_adr_ptest = pd.DataFrame({'adr_pred': rf_adr_ptest})
df_test = pd.concat([df_test, rf_isc_ptest, rf_adr_ptest], join='outer', axis=1)

# calculate rev per request for both training and test set
df_train = df_train.assign(rev_pred = lambda x: (x.isc_pred==0)*x.adr_pred*(x.stays_in_weekend_nights+x.stays_in_week_nights))
#print(df_train)
df_test = df_test.assign(rev_pred = lambda x: (x.isc_pred==0)*x.adr_pred*(x.stays_in_weekend_nights+x.stays_in_week_nights))
#print(df_test)

# calculate the daily revenue with predicted data for training and testing set
df_rev_train = df_train[['arrival_date', 'rev_pred']]
df_rev_train = df_rev_train.groupby(['arrival_date']).sum()
df_rev_test = df_test[['arrival_date', 'rev_pred']]
df_rev_test = df_rev_test.groupby(['arrival_date']).sum() 

# calculate the daily revenue with real data for training set
df_rev_t = df_train[['arrival_date', 'rev']]
df_rev_t = df_rev_t.groupby(['arrival_date']).sum()

print('###############')
#print(df_rev_t)
print(df_rev_t[df_rev_t['rev'] < 10000])
#print(df_rev_train)
print(df_rev_train[df_rev_train['rev_pred'] < 10000])
print('###############')    
#print(df_rev_test)
print(df_rev_test[df_rev_test['rev_pred'] < 10000])
print('###############')

       lead_time  stays_in_weekend_nights  stays_in_week_nights  adults  \
0            342                        0                     0       2   
1            257                        0                     2       1   
2            257                        0                     2       2   
3            257                        0                     2       2   
4            257                        0                     2       2   
...          ...                      ...                   ...     ...   
91526         19                        0                     2       2   
91527         28                        0                     2       2   
91528          2                        0                     1       2   
91529         30                        3                     7       2   
91530          1                        0                     1       1   

       children  babies  is_repeated_guest  previous_cancellations  \
0           0.0       0      

###############
                      rev
arrival_date             
2015-10-27    7235.040153
2015-10-30    9708.883835
2015-11-03    8196.691363
2015-11-04    7904.533787
2015-11-05    9720.225488
...                   ...
2017-01-24    9118.039321
2017-01-31    7582.440215
2017-02-01    6966.123151
2017-02-03    7778.413234
2017-02-07    9285.699361

[94 rows x 1 columns]
                 rev_pred
arrival_date             
2015-10-27    7697.637006
2015-11-03    8926.431233
2015-11-04    7906.322635
2015-11-09    8423.426529
2015-11-10    6387.045714
...                   ...
2017-01-17    7852.236615
2017-01-31    7678.621663
2017-02-01    7102.533259
2017-02-03    9944.185469
2017-02-07    9431.565963

[88 rows x 1 columns]
###############
Empty DataFrame
Columns: [rev_pred]
Index: []
###############


___
## Postprocessing.

### From **revenue** to **ranking**.

### Degree2

In [195]:
rank = df_label['label']

poly = PolynomialFeatures(2)    # default degree: 2, may change
rev_inter = poly.fit_transform(df_rev_train)
reg = LinearRegression(normalize=True).fit(rev_inter, rank)
rank_p = reg.predict(rev_inter)
print(reg.score(rev_inter, rank))

print("Ein_MAE_deg2 =", mean_absolute_error(rank, rank_p))
print('___')

df_test_rev_inter = poly.fit_transform(df_rev_test)
rank_pred = reg.predict(df_test_rev_inter)
for i in range(np.shape(rank_pred)[0]):
    if (rank_pred[i] < 0):
        rank_pred[i] = 0
    elif (rank_pred[i] > 9):
        rank_pred[i] = 9
    else:
        rank_pred[i] = round(rank_pred[i])
        
# Generate submission file
ans = df_rev_test[:]
ans['label'] = rank_pred
ans.drop(columns=['rev_pred'], inplace=True)
ans.to_csv('out_rf_deg2.csv')

0.9602845048624878
Ein_MAE_deg2 = 0.24932467776314518
___


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


### Degree1

In [196]:
rank = df_label['label']

rev_inter = df_rev_train
reg = LinearRegression(normalize=True).fit(rev_inter, rank)
rank_p = reg.predict(rev_inter)
print(reg.score(rev_inter, rank))

print("Ein_MAE_deg1 =", mean_absolute_error(rank, rank_p))
print('___')

df_test_rev_inter = df_rev_test
rank_pred = reg.predict(df_test_rev_inter)
for i in range(np.shape(rank_pred)[0]):
    if (rank_pred[i] < 0):
        rank_pred[i] = 0
    elif (rank_pred[i] > 9):
        rank_pred[i] = 9
    else:
        rank_pred[i] = round(rank_pred[i])
        
# Generate submission file
ans = df_rev_test[:]
ans['label'] = rank_pred
ans.drop(columns=['rev_pred'], inplace=True)
ans.to_csv('out_rf_deg1.csv')

0.9601977718856713
Ein_MAE_deg1 = 0.2499577711918996
___


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


## Train model with Adaboost

In [197]:
# use ada classification to predict is_cancelled
ada_isc = AdaBoostClassifier(DecisionTreeClassifier(max_depth=100), random_state=0, n_estimators=50).fit(df_x, isc)
print(ada_isc.predict(df_x))
print(ada_isc.score(df_x, isc))

# use ada regression to predict adr
ada_adr = AdaBoostRegressor(DecisionTreeRegressor(max_depth=100), random_state=0, n_estimators=50).fit(df_x, adr)
print(ada_adr.predict(df_x))
print(ada_adr.score(df_x, adr))

[0 0 0 ... 0 0 0]
0.9934885448645814
[-6.30516084 55.65833812 73.95051184 ... 90.81455355 38.1355651
 43.64298055]
0.967294897221243


In [202]:
# predict isc and adr for training set and test set
df_x_test = df_test.drop(columns = ['ID', 'arrival_date', 'arrival_date_year', 'arrival_date_week_number', 'arrival_date_day_of_month'])
#print(df_x)
#print(df_x_test)

ada_adr_ptrain = ada_adr.predict(df_x) 
ada_isc_ptrain = ada_isc.predict(df_x) 
ada_isc_ptest = ada_isc.predict(df_x_test)
ada_adr_ptest = ada_adr.predict(df_x_test)

# concatenate
ada_isc_ptrain = pd.DataFrame({'isc_pred': ada_isc_ptrain})
ada_adr_ptrain = pd.DataFrame({'adr_pred': ada_adr_ptrain})
df_train = pd.concat([df_train, ada_isc_ptrain, ada_adr_ptrain], join='outer', axis=1)

ada_isc_ptest = pd.DataFrame({'isc_pred': ada_isc_ptest})
ada_adr_ptest = pd.DataFrame({'adr_pred': ada_adr_ptest})
df_test = pd.concat([df_test, ada_isc_ptest, ada_adr_ptest], join='outer', axis=1)

# calculate rev per request for both training and test set
df_train = df_train.assign(rev_pred = lambda x: (x.isc_pred==0)*x.adr_pred*(x.stays_in_weekend_nights+x.stays_in_week_nights))
#print(df_train)
df_test = df_test.assign(rev_pred = lambda x: (x.isc_pred==0)*x.adr_pred*(x.stays_in_weekend_nights+x.stays_in_week_nights))
#print(df_test)

# calculate the daily revenue with predicted data for training and testing set
df_rev_train = df_train[['arrival_date', 'rev_pred']]
df_rev_train = df_rev_train.groupby(['arrival_date']).sum()
df_rev_test = df_test[['arrival_date', 'rev_pred']]
df_rev_test = df_rev_test.groupby(['arrival_date']).sum() 

# calculate the daily revenue with real data for training set
df_rev_t = df_train[['arrival_date', 'rev']]
df_rev_t = df_rev_t.groupby(['arrival_date']).sum()

print('###############')
#print(df_rev_t)
print(df_rev_t[df_rev_t['rev'] < 10000])
#print(df_rev_train)
print(df_rev_train[df_rev_train['rev_pred'] < 10000])
print('###############')    
#print(df_rev_test)
print(df_rev_test[df_rev_test['rev_pred'] < 10000])
print('###############')

###############
                      rev
arrival_date             
2015-10-27    7235.040153
2015-10-30    9708.883835
2015-11-03    8196.691363
2015-11-04    7904.533787
2015-11-05    9720.225488
...                   ...
2017-01-24    9118.039321
2017-01-31    7582.440215
2017-02-01    6966.123151
2017-02-03    7778.413234
2017-02-07    9285.699361

[94 rows x 1 columns]
                 rev_pred
arrival_date             
2015-10-27    7863.672622
2015-11-04    8102.111671
2015-11-09    8359.688231
2015-11-10    6780.426023
2015-11-11    7259.035730
...                   ...
2017-01-24    9949.457433
2017-01-31    7951.505766
2017-02-01    7128.011152
2017-02-03    8011.078933
2017-02-07    9710.993354

[86 rows x 1 columns]
###############
Empty DataFrame
Columns: [rev_pred]
Index: []
###############


In [203]:
rank = df_label['label']

poly = PolynomialFeatures(2)    # default degree: 2, may change
rev_inter = poly.fit_transform(df_rev_train)
reg = LinearRegression(normalize=True).fit(rev_inter, rank)
rank_p = reg.predict(rev_inter)
print(reg.score(rev_inter, rank))

print("Ein_MAE_deg2 =", mean_absolute_error(rank, rank_p))
print('___')

df_test_rev_inter = poly.fit_transform(df_rev_test)
rank_pred = reg.predict(df_test_rev_inter)
for i in range(np.shape(rank_pred)[0]):
    if (rank_pred[i] < 0):
        rank_pred[i] = 0
    elif (rank_pred[i] > 9):
        rank_pred[i] = 9
    else:
        rank_pred[i] = round(rank_pred[i])
        
# Generate submission file
ans = df_rev_test[:]
ans['label'] = rank_pred
ans.drop(columns=['rev_pred'], inplace=True)
ans.to_csv('out_ada_deg2.csv')

0.9607323858565674
Ein_MAE_deg2 = 0.24953663797845765
___


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [204]:
rank = df_label['label']

rev_inter = df_rev_train
reg = LinearRegression(normalize=True).fit(rev_inter, rank)
rank_p = reg.predict(rev_inter)
print(reg.score(rev_inter, rank))

print("Ein_MAE_deg1 =", mean_absolute_error(rank, rank_p))
print('___')

df_test_rev_inter = df_rev_test
rank_pred = reg.predict(df_test_rev_inter)
for i in range(np.shape(rank_pred)[0]):
    if (rank_pred[i] < 0):
        rank_pred[i] = 0
    elif (rank_pred[i] > 9):
        rank_pred[i] = 9
    else:
        rank_pred[i] = round(rank_pred[i])
        
# Generate submission file
ans = df_rev_test[:]
ans['label'] = rank_pred
ans.drop(columns=['rev_pred'], inplace=True)
ans.to_csv('out_ada_deg1.csv')

0.960610935080953
Ein_MAE_deg1 = 0.2499247363969234
___


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


## Train model with Decision Tree

In [209]:
# use ada classification to predict is_cancelled
dt_isc = DecisionTreeClassifier(random_state=0).fit(df_x, isc)
print(dt_isc.predict(df_x))
print(dt_isc.score(df_x, isc))

# use ada regression to predict adr
dt_adr = DecisionTreeRegressor(random_state=0).fit(df_x, adr)
print(dt_adr.predict(df_x))
print(dt_adr.score(df_x, adr))

[0 0 0 ... 0 0 0]
0.9934885448645814
[-6.30516084 57.46302333 76.24894718 ... 90.81455355 38.1355651
 58.19647044]
0.9814240327056286


In [210]:
# predict isc and adr for training set and test set
df_x_test = df_test.drop(columns = ['ID', 'arrival_date', 'arrival_date_year', 'arrival_date_week_number', 'arrival_date_day_of_month'])
#print(df_x)
#print(df_x_test)

dt_adr_ptrain = dt_adr.predict(df_x) 
dt_isc_ptrain = dt_isc.predict(df_x) 
dt_isc_ptest = dt_isc.predict(df_x_test)
dt_adr_ptest = dt_adr.predict(df_x_test)

# concatenate
dt_isc_ptrain = pd.DataFrame({'isc_pred': dt_isc_ptrain})
dt_adr_ptrain = pd.DataFrame({'adr_pred': dt_adr_ptrain})
df_train = pd.concat([df_train, dt_isc_ptrain, dt_adr_ptrain], join='outer', axis=1)

dt_isc_ptest = pd.DataFrame({'isc_pred': dt_isc_ptest})
dt_adr_ptest = pd.DataFrame({'adr_pred': dt_adr_ptest})
df_test = pd.concat([df_test, dt_isc_ptest, dt_adr_ptest], join='outer', axis=1)

# calculate rev per request for both training and test set
df_train = df_train.assign(rev_pred = lambda x: (x.isc_pred==0)*x.adr_pred*(x.stays_in_weekend_nights+x.stays_in_week_nights))
#print(df_train)
df_test = df_test.assign(rev_pred = lambda x: (x.isc_pred==0)*x.adr_pred*(x.stays_in_weekend_nights+x.stays_in_week_nights))
#print(df_test)

# calculate the daily revenue with predicted data for training and testing set
df_rev_train = df_train[['arrival_date', 'rev_pred']]
df_rev_train = df_rev_train.groupby(['arrival_date']).sum()
df_rev_test = df_test[['arrival_date', 'rev_pred']]
df_rev_test = df_rev_test.groupby(['arrival_date']).sum() 

# calculate the daily revenue with real data for training set
df_rev_t = df_train[['arrival_date', 'rev']]
df_rev_t = df_rev_t.groupby(['arrival_date']).sum()

print('###############')
#print(df_rev_t)
print(df_rev_t[df_rev_t['rev'] < 10000])
#print(df_rev_train)
print(df_rev_train[df_rev_train['rev_pred'] < 10000])
print('###############')    
#print(df_rev_test)
print(df_rev_test[df_rev_test['rev_pred'] < 10000])
print('###############')

###############
                      rev
arrival_date             
2015-10-27    7235.040153
2015-10-30    9708.883835
2015-11-03    8196.691363
2015-11-04    7904.533787
2015-11-05    9720.225488
...                   ...
2017-01-24    9118.039321
2017-01-31    7582.440215
2017-02-01    6966.123151
2017-02-03    7778.413234
2017-02-07    9285.699361

[94 rows x 1 columns]
                 rev_pred
arrival_date             
2015-10-27    7272.215273
2015-10-30    9731.636045
2015-11-03    8257.971435
2015-11-04    7885.035732
2015-11-05    9810.290762
...                   ...
2017-01-24    9548.368968
2017-01-31    7582.375017
2017-02-01    6965.289072
2017-02-03    8069.010595
2017-02-07    9305.822163

[94 rows x 1 columns]
###############
Empty DataFrame
Columns: [rev_pred]
Index: []
###############


In [211]:
rank = df_label['label']

poly = PolynomialFeatures(2)    # default degree: 2, may change
rev_inter = poly.fit_transform(df_rev_train)
reg = LinearRegression(normalize=True).fit(rev_inter, rank)
rank_p = reg.predict(rev_inter)
print(reg.score(rev_inter, rank))

print("Ein_MAE_deg2 =", mean_absolute_error(rank, rank_p))
print('___')

df_test_rev_inter = poly.fit_transform(df_rev_test)
rank_pred = reg.predict(df_test_rev_inter)
for i in range(np.shape(rank_pred)[0]):
    if (rank_pred[i] < 0):
        rank_pred[i] = 0
    elif (rank_pred[i] > 9):
        rank_pred[i] = 9
    else:
        rank_pred[i] = round(rank_pred[i])
        
# Generate submission file
ans = df_rev_test[:]
ans['label'] = rank_pred
ans.drop(columns=['rev_pred'], inplace=True)
ans.to_csv('out_dt_deg2.csv')

0.9617592021861299
Ein_MAE_deg2 = 0.24702515271067912
___


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [212]:
rank = df_label['label']

rev_inter = df_rev_train
reg = LinearRegression(normalize=True).fit(rev_inter, rank)
rank_p = reg.predict(rev_inter)
print(reg.score(rev_inter, rank))

print("Ein_MAE_deg1 =", mean_absolute_error(rank, rank_p))
print('___')

df_test_rev_inter = df_rev_test
rank_pred = reg.predict(df_test_rev_inter)
for i in range(np.shape(rank_pred)[0]):
    if (rank_pred[i] < 0):
        rank_pred[i] = 0
    elif (rank_pred[i] > 9):
        rank_pred[i] = 9
    else:
        rank_pred[i] = round(rank_pred[i])
        
# Generate submission file
ans = df_rev_test[:]
ans['label'] = rank_pred
ans.drop(columns=['rev_pred'], inplace=True)
ans.to_csv('out_dt_deg1.csv')

0.9616272163466836
Ein_MAE_deg1 = 0.24754808711018256
___


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [214]:
print(df_train.info())
print(df_train)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91531 entries, 0 to 91530
Columns: 959 entries, ID to rev_pred
dtypes: float64(5), int64(17), object(2), uint8(935)
memory usage: 98.4+ MB
None
          ID  lead_time  arrival_date_year  arrival_date_week_number  \
0          0        342               2015                        27   
1          1        257               2015                        27   
2          2        257               2015                        27   
3          3        257               2015                        27   
4          4        257               2015                        27   
...      ...        ...                ...                       ...   
91526  91526         19               2017                        13   
91527  91527         28               2017                        13   
91528  91528          2               2017                        13   
91529  91529         30               2017                        13   
91530  91530  