<a href="https://colab.research.google.com/github/Matancoo/IML.HUJI/blob/main/challenge/DATA_CHALLENGE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
import pickle
import re

# from kmodes.kprototypes import KPrototypes
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.metrics import confusion_matrix

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
from tensorflow.keras.callbacks import EarlyStopping


import torch
import torchvision
import torchvision.transforms as transforms
import random 
from torchvision import transforms, datasets
from torch.utils.data import DataLoader,random_split
from torch import nn
import torch.nn.functional as F
import torch.optim as optim

In [2]:
# #helper functions
# def split_train_test(X: pd.DataFrame, y: pd.Series, train_proportion: float = .25): 
    
#     """
#     Split given sample to a training- and testing sample

#     Parameters
#     ----------
#     X : DataFrame of shape (n_samples, n_features)
#         Data frame of samples and feature values.

#     y : Series of shape (n_samples, )
#         Responses corresponding samples in data frame.

#     train_proportion: Fraction of samples to be split as training set


#     """

# #we will first suffle and then take first frac.
#     n_samples = y.shape[0]
#     p = np.ceil(train_proportion * n_samples).astype(int)
#     full_set = pd.concat([X,y],axis=1).sample(frac=1)

#     train_X= full_set.iloc[:p, :]
#     test_X = full_set.iloc[p:,:]
#     train_y = train_X.pop(train_X.columns[-1])
#     test_y = test_X.pop(test_X.columns[-1])



    # return train_X,train_y,test_X,test_y



In [3]:



def load_data(filename: str):
    """
    Load Agoda booking cancellation dataset
    Parameters
    ----------
    filename: str
        Path to house prices dataset
    Returns
    -------
    Design matrix and response vector in either of the following formats:
    1) Single dataframe with last column representing the response
    2) Tuple of pandas.DataFrame and Series
    3) Tuple of ndarray of shape (n_samples, n_features) and ndarray of shape (n_samples,)
    """
    # Data Preprocessing
    data = pd.read_csv(filename).drop_duplicates()

    # reformat the feature names:   #TODO: doesnt work
    # def camel_to_snake(name):
    #     name = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
    #     return re.sub('([a-z0-9])([A-Z])', r'\1_\2', name).lower()
    # # Reformating the feature names: 
    # data.columns = data.columns.map(lambda x : x.strip())


   
    #feature engineering before datetime conversion
    data['is_cancelled'] = (~pd.isna(data.cancellation_datetime)) #new response
    


    # creating one hot representation
    data = pd.get_dummies(data, columns=['charge_option'])

    # Changing dates to datetime type: 
    data['cancellation_datetime'] = pd.to_datetime(data['cancellation_datetime'])
    data['booking_datetime'] = pd.to_datetime(data['booking_datetime'])
    data['checkout_date'] = pd.to_datetime(data['checkout_date'])
    data['hotel_live_date'] = pd.to_datetime(data['hotel_live_date'])
    data['checkin_date'] = pd.to_datetime(data['checkin_date'])

    #feature engineering after datetime conversion
    data['time_interval'] = abs(data.booking_datetime.dt.day - data.checkin_date.dt.day).astype(int)
    
    #nationalities --- take only relevant features

    # sample dropping
    data.dropna(subset = ['origin_country_code','origin_country_code'], inplace = True)
    
    # check that check-in is before the end_date
    end_date = pd.to_datetime('2018-12-13')
    data = data[data.checkin_date <= end_date]

    #feature dropping:
    data.drop(columns=['booking_datetime', 'checkin_date','checkout_date','cancellation_datetime','hotel_id','h_booking_id','h_customer_id','hotel_brand_code','hotel_area_code','hotel_live_date','hotel_city_code','cancellation_policy_code','original_payment_currency','original_payment_type','original_payment_method','language','origin_country_code','guest_nationality_country_name','customer_nationality','accommadation_type_name','hotel_country_code'],inplace=True)
#not final--> need to do k-prototype and further data featuring


    #replaceing nan with 0.0

    data['request_nonesmoke'].replace(to_replace=np.nan, value = 0.0, inplace= True)
    data['request_latecheckin'].replace(to_replace=np.nan, value=0.0, inplace = True)
    data['request_highfloor'].replace(to_replace=np.nan, value=0.0, inplace = True)
    data['request_largebed'].replace(to_replace=np.nan, value=0.0, inplace = True)
    data['request_twinbeds'].replace(to_replace=np.nan, value=0.0, inplace = True)
    data['request_airport'].replace(to_replace=np.nan, value=0.0, inplace = True)
    data['request_earlycheckin'].replace(to_replace=np.nan, value=0.0, inplace = True)

    # data.cancellation_datetime = data['cancellation_datetime'].apply(lambda x: x.date())
    # start_date = pd.to_datetime('2018-12-07')
    # end_date = pd.to_datetime('2018-12-13')
    # df['cancellation_datetime'] = (df.cancellation_datetime <= end_date) &  (df.cancellation_datetime > start_date)






    # Dropping samples that have a checkin-date after 2018-12-13
    #Turning objects to one-hot vectors:
    # labels = data.pop("cancellation_datetime")
    # features = data

    return data





def evaluate_and_export(estimator, X: np.ndarray, filename: str):
    """
    Export to specified file the prediction results of given estimator on given testset.
    File saved is in csv format with a single column named 'predicted_values' and n_samples rows containing
    predicted values.
    Parameters
    ----------
    estimator: BaseEstimator or any object implementing predict() method as in BaseEstimator (for example sklearn)
        Fitted estimator to use for prediction
    X: ndarray of shape (n_samples, n_features)
        Test design matrix to predict its responses
    filename:
        path to store file at
    """
    pd.DataFrame(estimator.predict(X), columns=["predicted_values"]).to_csv(filename, index=False)




Data Preprocessing

In [4]:
from pandas.io.parsers.readers import read_csv
#loading data:
data  = load_data('https://raw.githubusercontent.com/Matancoo/IML.HUJI/main/datasets/agoda_cancellation_train.csv')


In [5]:
data.dtypes


hotel_star_rating                float64
guest_is_not_the_customer          int64
no_of_adults                       int64
no_of_children                     int64
no_of_extra_bed                    int64
no_of_room                         int64
original_selling_amount          float64
is_user_logged_in                   bool
is_first_booking                    bool
request_nonesmoke                float64
request_latecheckin              float64
request_highfloor                float64
request_largebed                 float64
request_twinbeds                 float64
request_airport                  float64
request_earlycheckin             float64
hotel_chain_code                 float64
is_cancelled                        bool
charge_option_Pay Later            uint8
charge_option_Pay Now              uint8
charge_option_Pay at Check-in      uint8
time_interval                      int64
dtype: object

In [6]:
#visualize cancellation basded on hotel country location

# df= data[["hotel_country_code", 'is_cancelled']].groupby(["hotel_country_code"])['is_cancelled'].sum().reset_index()
# px.bar(df,x="hotel_country_code",y='is_cancelled')



In [7]:
#visualize cancellation basded on clients country location

# df= data[["customer_nationality", 'is_cancelled']].groupby(['customer_nationality'])['is_cancelled'].sum().reset_index()
# px.bar(df,x="customer_nationality",y='is_cancelled')

In [8]:
#visualizing time interval and booking cancellation

# x = data[['time_interval', 'is_cancelled']]
# px.bar(x,x='is_cancelled',y='time_interval')

In [9]:

# # returns a correlation matrix of each feature with all others.
# # data = df.select_dtypes(include=np.number)  # select only numerical features
# corr_matrix = data.corr(method='pearson').loc[:, ['is_cancelled']].sort_values('is_cancelled')
# corr_target = abs(corr_matrix)

response = data.pop('is_cancelled')


In [10]:
import torch.utils.data as data_utils

X_train, X_test, y_train, y_test = train_test_split(data, response)
BATCH_SIZE = 64

# Scaling the data


ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

trainloader = torch.utils.data.DataLoader(X_train, batch_size=BATCH_SIZE,
                                          shuffle=True, num_workers=2)
testloader = torch.utils.data.DataLoader(X_test, batch_size=BATCH_SIZE,
                                         shuffle=False, num_workers=2)


# Passing to DataLoader
#normalization of data before NN
# Creating our model's structure

class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120) #can we use flatten here to guess the input dimention to the linear layer?
        # self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(120, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = torch.flatten(x, 1) 
       
        x = F.relu(self.fc1(x))
        # x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x 
net = Net()


EPOCH = 15
train_losses =[]
test_losses =[]


for epoch in range(EPOCH):  # loop over the dataset multiple times

#training

    running_loss = 0.0
    for data in trainloader:
        print(data.shape)
        inputs, labels = data

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    #Train loss
        running_loss += loss.item()
    epoch_loss = running_loss/len(trainloader)
    train_losses.append(epoch_loss)


#testing

    running_loss = 0.0
    with torch.no_grad():
      for data in testloader:
        images, labels = data 

        outputs = net(images)
        loss = criterion(outputs,labels)
        running_loss += loss.item()

    #Test loss
    epoch_loss = running_loss/len(testloader)
    test_losses.append(epoch_loss)




#plotting training and testing losses
 
x = np.linspace(1,EPOCH,EPOCH).astype(int)
y_train = np.array(train_losses)
y_test = np.array(test_losses)

plt.plot(x,y_train, 'r')
plt.plot(x,y_test,'b')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.show()


print('Finished Training')
# Fitting the model
history = model.fit(X_train,
                    y_train, 
                    batch_size = 256,
                    validation_data =(X_test, y_test),
                    epochs = 15,
                    verbose = 0)






torch.Size([64, 21])


ValueError: ignored

In [None]:
# Check out our train loss and test loss over epochs.
train_loss = history.history['loss']
test_loss = history.history['val_loss']

# Visualizing our training and testing loss by epoch
plt.figure(figsize=(10, 5))
plt.plot(train_loss, label='Training Loss', color='#185fad')
plt.plot(test_loss, label='Testing Loss', color='orange')
plt.title('Training and Testing Loss by Epoch', fontsize = 20)
plt.xlabel('Epoch', fontsize = 11)
plt.ylabel('Binary Crossentropy', fontsize = 11)
plt.legend(fontsize = 11);

# Credit to GA CNN global lecture author for the graph code 

In [None]:

# if __name__ == '__main__':
#     np.random.seed(0)

#     # Load data
#     df, cancellation_labels = load_data("../datasets/agoda_cancellation_train.csv")
#     train_X, train_y, test_X, test_y = split_train_test(df, cancellation_labels)

#     # Fit model over data
#     estimator = AgodaCancellationEstimator().fit(train_X, train_y)

#     # Store model predictions over test set
#     evaluate_and_export(estimator, test_X, "id1_id2_id3.csv")