DATA ANALYTICS

In [26]:
import folium
import pandas as pd
import numpy as np
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut
import plotly
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [4]:
# Load the per-hotel metadata table and show summary statistics for its
# numeric columns.
hotel_info_csv = "../data/input/dataset/hotel_info.csv"
hotel_info = pd.read_csv(hotel_info_csv)
hotel_info.describe()

Unnamed: 0,hotel_id,chain_id,latitude,longitude
count,50000.0,50000.0,50000.0,50000.0
mean,58521.72292,29.03406,32.575649,-26.763387
std,84508.085533,38.389463,19.950481,79.134788
min,391.0,-1.0,-54.80967,-175.34411
25%,17190.75,-1.0,30.024908,-88.041282
50%,34053.5,-1.0,37.848425,-70.32867
75%,52113.25,73.0,43.283203,14.260675
max,481431.0,92.0,78.21489,178.42698


In [20]:
# Load the chain lookup table and preview its first ten rows.
chain_info_csv = "../data/input/dataset/chain_info.csv"
chain_info = pd.read_csv(chain_info_csv)
chain_info.head(10)

Unnamed: 0,chain_id,chain_name
0,-1,unknown
1,0,Best Western
2,1,Hyatt
3,2,Marriott
4,3,Hilton
5,4,Adagio
6,5,Gaylord
7,6,Nikko
8,7,Rosewood
9,9,Kempinski


In [23]:
# The file is read with header=None, so the column names are supplied here
# rather than taken from the first row of the CSV.
train_set_columns = ['image_id', 'hotel_id', 'url', 'source', 'timestamp']
train_set = pd.read_csv(
    '../data/input/dataset/train_set.csv',
    header=None,
    names=train_set_columns,
)

In [24]:
# Preview the first ten training rows.
train_set.head(10)

Unnamed: 0,image_id,hotel_id,url,source,timestamp
0,3485,18187,https://traffickcam.com/images/2016/10/2015090...,traffickcam,9/9/15 17:23
1,3486,18187,https://traffickcam.com/images/2016/10/2015090...,traffickcam,9/9/15 17:23
2,3663,73224,https://traffickcam.com/images/2016/10/2015091...,traffickcam,9/17/15 19:33
3,2586939,86350,https://traffickcam.com/images/2017/2/20160125...,traffickcam,1/25/16 19:12
4,2586950,1533,https://traffickcam.com/images/2017/2/20160125...,traffickcam,1/25/16 17:23
5,2586951,1533,https://traffickcam.com/images/2017/2/20160125...,traffickcam,1/25/16 17:23
6,2586952,1533,https://traffickcam.com/images/2017/2/20160125...,traffickcam,1/25/16 17:23
7,2586986,46120,https://traffickcam.com/images/2017/2/20160123...,traffickcam,1/23/16 19:03
8,2586989,46120,https://traffickcam.com/images/2017/2/20160123...,traffickcam,1/23/16 19:03
9,2586991,28582,https://traffickcam.com/images/2017/2/20160122...,traffickcam,1/22/16 22:33


In [27]:
# Attach hotel metadata and chain names to every training image.
# Both merges use pandas' default inner join, so images whose hotel_id (or
# whose hotel's chain_id) has no match in the lookup tables are dropped.
data_df = (
    train_set
    .merge(hotel_info, on="hotel_id")
    .merge(chain_info, on="chain_id")
    .astype({
        # Identifiers used as labels / lookup keys are kept as strings.
        "image_id": str,
        # hotel_id must NOT be narrowed to uint8: ids range far beyond 255, and
        # an integer cast silently wraps modulo 256, collapsing distinct hotels
        # into the same id. A 32-bit unsigned integer holds every id losslessly.
        "hotel_id": np.uint32,
        "chain_id": str,
    })
)

display(data_df.head())

Unnamed: 0,image_id,hotel_id,url,source,timestamp,hotel_name,chain_id,latitude,longitude,chain_name
0,3485,18187,https://traffickcam.com/images/2016/10/2015090...,traffickcam,9/9/15 17:23,Hilton Minneapolis,3,44.97338,-93.27331,Hilton
1,3486,18187,https://traffickcam.com/images/2016/10/2015090...,traffickcam,9/9/15 17:23,Hilton Minneapolis,3,44.97338,-93.27331,Hilton
2,2599229,18187,https://traffickcam.com/images/2017/7/20160627...,traffickcam,6/27/16 21:27,Hilton Minneapolis,3,44.97338,-93.27331,Hilton
3,2599231,18187,https://traffickcam.com/images/2017/7/20160627...,traffickcam,6/27/16 21:27,Hilton Minneapolis,3,44.97338,-93.27331,Hilton
4,2599232,18187,https://traffickcam.com/images/2017/7/20160627...,traffickcam,6/27/16 21:27,Hilton Minneapolis,3,44.97338,-93.27331,Hilton


In [74]:
import os
import glob

# Glob patterns matching every file in the downloaded train / test image folders
image_train_folder_path = "../data/images/train/*"
image_test_folder_path = "../data/images/test/*"


# Collect the paths of all downloaded images from both folders
image_train_filenames = glob.glob(image_train_folder_path)
image_test_filenames = glob.glob(image_test_folder_path)
image_filenames = image_train_filenames + image_test_filenames


# An image id is the file name without its directory and extension.
# os.path is used instead of splitting on "." and "/" so that files without an
# extension and platforms with a different path separator are handled correctly.
# A set gives constant-time membership checks.
image_ids = {
    os.path.splitext(os.path.basename(filename))[0]
    for filename in image_filenames
}

# 1 where the row's image has been downloaded, 0 otherwise (vectorised lookup).
# NOTE(review): the comparison is against string file stems, so this assumes
# data_df["image_id"] holds strings - confirm the upstream cast is in place.
image_available = data_df["image_id"].isin(image_ids).to_numpy(dtype=np.int8)

# Add the image_available column to the data_df dataframe
data_df["image_available"] = image_available
display(data_df[data_df["image_available"] == 1])




Unnamed: 0,image_id,hotel_id,url,source,timestamp,hotel_name,chain_id,latitude,longitude,chain_name,image_available
1,3486,18187,https://traffickcam.com/images/2016/10/2015090...,traffickcam,9/9/15 17:23,Hilton Minneapolis,3,44.97338,-93.27331,Hilton,1
2,2599229,18187,https://traffickcam.com/images/2017/7/20160627...,traffickcam,6/27/16 21:27,Hilton Minneapolis,3,44.97338,-93.27331,Hilton,1
3,2599231,18187,https://traffickcam.com/images/2017/7/20160627...,traffickcam,6/27/16 21:27,Hilton Minneapolis,3,44.97338,-93.27331,Hilton,1
4,2599232,18187,https://traffickcam.com/images/2017/7/20160627...,traffickcam,6/27/16 21:27,Hilton Minneapolis,3,44.97338,-93.27331,Hilton,1
5,2612969,18187,https://traffickcam.com/images/2017/8/20160708...,traffickcam,7/8/16 4:59,Hilton Minneapolis,3,44.97338,-93.27331,Hilton,1
...,...,...,...,...,...,...,...,...,...,...,...
1122841,7582023,396,https://i.travelapi.com/hotels/4000000/3700000...,travel_website,2019-12-20 17:48:29,Tamanu Beach,14,-18.84213,-159.78794,Aman,1
1122842,7582024,396,https://i.travelapi.com/hotels/4000000/3700000...,travel_website,2019-12-20 17:48:29,Tamanu Beach,14,-18.84213,-159.78794,Aman,1
1122843,7582025,396,https://i.travelapi.com/hotels/4000000/3700000...,travel_website,2019-12-20 17:48:29,Tamanu Beach,14,-18.84213,-159.78794,Aman,1
1122844,7582026,396,https://i.travelapi.com/hotels/4000000/3700000...,travel_website,2019-12-20 17:48:29,Tamanu Beach,14,-18.84213,-159.78794,Aman,1


In [143]:
# Per-chain summary: number of distinct hotels, distinct images and distinct
# values of the image_available flag. Named aggregation yields the flat
# "<column>_nunique" names directly, so no MultiIndex flattening is needed.
# Rows are sorted ascending by hotel count and then reversed (largest first).
chain_group_df = (
    data_df.groupby("chain_name")
    .agg(
        hotel_id_nunique=("hotel_id", "nunique"),
        image_id_nunique=("image_id", "nunique"),
        image_available_nunique=("image_available", "nunique"),
    )
    .reset_index()
    .sort_values("hotel_id_nunique")[::-1]
)

# NOTE(review): image_available_nunique is the number of DISTINCT flag values
# per chain plus an integer jitter in [0, 9]; it is not a count of available
# images. The jitter is kept because this column is used downstream as a
# marker size, but it should be replaced by a real, suitably scaled count.
# A fixed seed makes the table and the chart identical across re-runs.
jitter_rng = np.random.default_rng(0)
jitter = jitter_rng.uniform(0, 10, len(chain_group_df)).astype(np.uint8)

# .assign builds a new frame instead of writing into a slice of the sorted one.
chain_group_df = chain_group_df.assign(
    image_available_nunique=chain_group_df["image_available_nunique"] + jitter
)
display(chain_group_df)

Unnamed: 0,chain_name,hotel_id_nunique,image_id_nunique,image_available_nunique
92,unknown,27519,596261,8
37,Holiday Inn,2377,52575,4
33,Hampton,1394,28151,6
7,Best Western,998,22217,8
14,Comfort Inn,885,18623,10
...,...,...,...,...
19,Curio,6,124,7
30,Gaylord,4,152,8
89,Wingate,1,30,6
28,Four Points,1,15,1


In [147]:
# Bubble chart of hotel count per chain, limited to the first 50 rows of
# chain_group_df (the frame's existing row order decides which chains those are).
chain_group_df = chain_group_df[0:50]

# Outer, translucent bubble: size and colour both encode the total image count.
fig = px.scatter(chain_group_df, x="chain_name", y="hotel_id_nunique",
                 size="image_id_nunique",
                 color="image_id_nunique",
                 opacity=0.2,
                 hover_name=None,
                 log_y=True, size_max=75)

# Hover text for the outer bubbles. This runs before the second trace is added,
# so it only touches the px.scatter trace. Number formats use d3-format syntax
# ("d" for integers); the printf-style "%d" is not a valid specifier.
fig.update_traces(hovertemplate="Chain: %{x} <br>Hotel count: %{y:d}<br>Total image count: %{marker.size:d}")

# Add a second, opaque circle for the available images. Its marker size comes
# from image_available_nunique and its colour from the total image count, so the
# hover text reads size/colour to report "available/total" the right way round.
fig.add_scatter(x=chain_group_df["chain_name"], y=chain_group_df["hotel_id_nunique"],
                mode="markers",
                marker=dict(size=chain_group_df["image_available_nunique"],
                            color=chain_group_df["image_id_nunique"],
                            opacity=1),
                hovertemplate="Chain: %{x} <br>Hotel count: %{y:d}<br>Total image count: %{marker.color:d}" +
                              "<br>Available image count: %{marker.size}/%{marker.color}",
                showlegend=False)

fig.update_yaxes(title_text="Hotel count")
# The x axis plots chain_name, so it is labelled as a name rather than an id.
fig.update_xaxes(title_text="Chain name", tickmode='linear', tickangle=90)
fig.update_layout(title="Hotel and image count per chain", coloraxis=dict(colorbar=dict(title="Image count")))
fig.show()

In [139]:
# Number of images per hotel, ordered from the largest count to the smallest.
images_per_hotel = data_df.groupby(["hotel_id"]).size()
ascending_counts = images_per_hotel.to_frame("image_count").sort_values("image_count")
group_df = ascending_counts.iloc[::-1].reset_index()

In [140]:
# Histogram of images per hotel, with a marginal box plot above it.
fig = px.histogram(group_df, x="image_count", nbins=100, marginal="box", height=500)
fig.update_layout(title="Distribution of image count per hotel")
# Only the histogram bars get the custom hover text; the marginal box trace
# keeps its default hover. "d" is the d3-format integer specifier ("%d" is not
# valid d3-format syntax).
fig.update_traces(hovertemplate="Image count: %{x} <br>Hotel count: %{y:d}",
                  selector=dict(type="histogram"))
fig.update_yaxes(title_text="Hotel count")
fig.update_xaxes(title_text="Image count")
fig.show()
