In [1]:
#unzip downloaded files

#install library
!pip install zipfile36

#import library
import zipfile

#unzipping
train_data = zipfile.ZipFile('train.csv.zip')
test_data = zipfile.ZipFile('test.csv.zip')

#extract both data
train_data.extractall()
test_data.extractall()

#close both data
train_data.close()
test_data.close()



In [2]:
#import all necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
#load datasets
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [4]:
train.head()

Unnamed: 0,date_time,site_name,posa_continent,user_location_country,user_location_region,user_location_city,orig_destination_distance,user_id,is_mobile,is_package,...,srch_children_cnt,srch_rm_cnt,srch_destination_id,srch_destination_type_id,is_booking,cnt,hotel_continent,hotel_country,hotel_market,hotel_cluster
0,2014-08-11 07:46:59,2,3,66,348,48862,2234.2641,12,0,1,...,0,1,8250,1,0,3,2,50,628,1
1,2014-08-11 08:22:12,2,3,66,348,48862,2234.2641,12,0,1,...,0,1,8250,1,1,1,2,50,628,1
2,2014-08-11 08:24:33,2,3,66,348,48862,2234.2641,12,0,0,...,0,1,8250,1,0,1,2,50,628,1
3,2014-08-09 18:05:16,2,3,66,442,35390,913.1932,93,0,0,...,0,1,14984,1,0,1,2,50,1457,80
4,2014-08-09 18:08:18,2,3,66,442,35390,913.6259,93,0,0,...,0,1,14984,1,0,1,2,50,1457,21


In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37670293 entries, 0 to 37670292
Data columns (total 24 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   date_time                  object 
 1   site_name                  int64  
 2   posa_continent             int64  
 3   user_location_country      int64  
 4   user_location_region       int64  
 5   user_location_city         int64  
 6   orig_destination_distance  float64
 7   user_id                    int64  
 8   is_mobile                  int64  
 9   is_package                 int64  
 10  channel                    int64  
 11  srch_ci                    object 
 12  srch_co                    object 
 13  srch_adults_cnt            int64  
 14  srch_children_cnt          int64  
 15  srch_rm_cnt                int64  
 16  srch_destination_id        int64  
 17  srch_destination_type_id   int64  
 18  is_booking                 int64  
 19  cnt                        int64  
 20  

In [6]:
print(train.shape)
print(test.shape)

(37670293, 24)
(2528243, 22)


There are nearly 37.6 million rows. We need to make the dataset smaller as we do not have enough compute

In [7]:
print(train['user_id'].nunique())

1198786


These 37.6 million rows represent 1.2 million users. We will downsample the dataset to 10000 randomly chosen users

In [8]:
userids = list(train['user_id'].unique())  #gets list of unique user_ids

In [9]:
import random
random.seed(10)
#choose 10k random users
userids_10k = random.choices(userids, k=10000)

In [10]:
#create train dataset with the 10k users
train_10k = train[train['user_id'].isin(userids_10k)] 

In [11]:
train_10k.head()

Unnamed: 0,date_time,site_name,posa_continent,user_location_country,user_location_region,user_location_city,orig_destination_distance,user_id,is_mobile,is_package,...,srch_children_cnt,srch_rm_cnt,srch_destination_id,srch_destination_type_id,is_booking,cnt,hotel_continent,hotel_country,hotel_market,hotel_cluster
1927,2014-02-27 07:24:03,2,3,66,174,23793,1780.4772,8654,1,0,...,2,1,5438,3,0,2,2,50,637,47
1928,2014-08-30 16:10:42,2,3,66,174,13277,133.1612,8654,1,0,...,1,1,8278,1,0,1,2,50,970,80
1929,2014-08-30 16:25:16,2,3,66,174,13277,124.8268,8654,1,0,...,1,1,8278,1,0,1,2,50,368,10
1930,2014-08-30 16:26:45,2,3,66,174,13277,112.2581,8654,1,0,...,1,1,8278,1,0,1,2,50,368,21
1931,2014-08-30 16:29:57,2,3,66,174,13277,117.6094,8654,1,0,...,1,1,8278,1,0,3,2,50,368,95


In [12]:
print(train_10k.shape)

(316422, 24)


This has reduced the training data set to 10000 users consisting of 316K rows

Now we want to remove the travel agents from the dataset so that it does not affect the single user preferences. We identify a travel agent as someone with more than 20 bookings.

In [13]:
#remove all non-bookings to make counting easier
train_10k_booking = train_10k[train_10k.is_booking != 0]
for user in userids_10k:
  #count the number of rows under a single user
  bookings = len(train_10k_booking.loc[train_10k_booking['user_id'] == user])
  if bookings >= 20:
    #remove the travel agent from dataset
    train_10k = train_10k[train_10k.user_id != user]

In [14]:
print(train_10k.shape)

(290825, 24)


#### Handling missing data

In [15]:
# Calculate total missing values per column
total_missing = train_10k.isnull().sum()

# Calculate % missing values per column
percent_missing = ((train_10k.isnull().sum()/train_10k.isnull().count())*100)

# Create a dataframe with the total and % missing values per column 
missing_data = pd.concat([total_missing, percent_missing], axis=1, keys=['Total Missing', 'Percent Missing'])
missing_data

Unnamed: 0,Total Missing,Percent Missing
date_time,0,0.0
site_name,0,0.0
posa_continent,0,0.0
user_location_country,0,0.0
user_location_region,0,0.0
user_location_city,0,0.0
orig_destination_distance,102986,35.411674
user_id,0,0.0
is_mobile,0,0.0
is_package,0,0.0


In [19]:
#find mean orig_destination_distance
print(train_10k['orig_destination_distance'].mean())

1972.884946094794


In [20]:
#fill missing values for orig_destination_distance with the column mean
train_10k['orig_destination_distance'] = train_10k['orig_destination_distance'].fillna(1972.884946094794)

In [21]:
#clean data with empty cells
train_10k = train_10k.dropna()

In [22]:
print(train_10k.shape)

(290413, 24)


In [23]:
train_10k.to_csv('train_downsampled.csv', index = False)