# Forecasting Tourism Demand in Singapore

## Importing Libraries

In [1264]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.subplots as make_subplots
import plotly.graph_objects as go

## Data Collection and Preprocessing

### Importing Dataset

In [1265]:
# Each raw input is declared next to the call that loads it, so the path and
# the resulting frame can be read together.

# Visitor-arrival tables
inter_arr = 'data/raw/international_visitor_arrivals.csv'
inter_arrival_df = pd.read_csv(inter_arr)

inter_arr_stay_days = 'data/raw/international_visitor_arrivals_stay_days.csv'
inter_arrival_stay_days_df = pd.read_csv(inter_arr_stay_days)

inter_arr_age = 'data/raw/international_visitor_arrivals_age.csv'
inter_arrival_age_df = pd.read_csv(inter_arr_age)

# Hotel and weather tables
hotel_stats = 'data/raw/hotel_statistics.csv'
hotel_stats_df = pd.read_csv(hotel_stats)

weather = 'data/raw/weather.csv'
weather_df = pd.read_csv(weather)

# Purpose of visit is the only Excel workbook among the inputs
purpose_visit = 'data/raw/purpose_of_visit.xlsx'
purpose_visit_df = pd.read_excel(purpose_visit)

# Holiday calendars are split across three files
holiday_14_22 = 'data/raw/holidays_2014_2022.csv'
holiday_14_22_df = pd.read_csv(holiday_14_22)

holiday_23 = 'data/raw/holidays_2023.csv'
holiday_23_df = pd.read_csv(holiday_23)

holiday_24 = 'data/raw/holidays_2024.csv'
holiday_24_df = pd.read_csv(holiday_24)

# GDP
gdp = 'data/raw/gdp.csv'
gdp_df = pd.read_csv(gdp)

### Converting wide to long format and Cleaning

In [1266]:
# Wide to Long convertor function (monthly columns)
def wide2Long(_df, value_name, series_name, start_date='2018-01-01'):
    """Melt a wide table with one column per month into a date-indexed long table.

    Parameters
    ----------
    _df : pandas.DataFrame
        Wide frame with a 'DataSeries' column; every other column label must
        parse with the format "%Y%b" (e.g. "2018Jan").
    value_name : str
        Name given to the melted value column.
    series_name : str
        New name for the 'DataSeries' column.
    start_date : str or datetime-like, default '2018-01-01'
        Earliest date (inclusive) to keep.

    Returns
    -------
    pandas.DataFrame
        Long frame indexed by 'date' with columns [series_name, value_name],
        restricted to dates on or after ``start_date``. Row order follows the
        melt order of the input.
    """
    long_df = _df.melt(id_vars='DataSeries', var_name='date', value_name=value_name)
    long_df['date'] = pd.to_datetime(long_df['date'], format="%Y%b")
    long_df = long_df.set_index('date').rename(columns={'DataSeries': series_name})

    # Filter by value rather than with a label slice: a label slice follows the
    # index order, so on an input whose columns run newest-first it would keep
    # the rows *before* the cut-off instead of those after it.
    return long_df[long_df.index >= pd.Timestamp(start_date)]

def wide2Long_2(_df, value_name, series_name, start_year=2015):
    """Melt a wide table with one column per year into a year-indexed long table.

    Parameters
    ----------
    _df : pandas.DataFrame
        Wide frame with a 'DataSeries' column; every other column label must
        parse with the format "%Y".
    value_name : str
        Name given to the melted value column.
    series_name : str
        New name for the 'DataSeries' column.
    start_year : int, default 2015
        Earliest year (inclusive) to keep.

    Returns
    -------
    pandas.DataFrame
        Long frame indexed by integer year ('date') with columns
        [series_name, value_name], limited to ``start_year`` onwards and
        ordered from the most recent year to the oldest.
    """
    long_df = _df.melt(id_vars='DataSeries', var_name='date', value_name=value_name)
    long_df['date'] = pd.to_datetime(long_df['date'], format="%Y").dt.year
    long_df = long_df.set_index('date').rename(columns={'DataSeries': series_name})

    # The index holds integer years, so compare against an integer instead of
    # slicing with a string label.
    long_df = long_df[long_df.index >= int(start_year)]

    # Reorder the rows together with their labels. Assigning a sorted copy of
    # the index back onto the frame would leave the rows where they are and
    # attach each year to the wrong values.
    return long_df.sort_index(ascending=False, kind='stable')

In [1267]:
# International Visitor Arrival
inter_arrival_df = wide2Long(inter_arrival_df, 'visitor_arrivals', 'region')
# assign() builds a new frame, so the label clean-up never writes into a slice
# of another frame.
inter_arrival_df = inter_arrival_df.assign(
    region=inter_arrival_df['region'].str.replace(
        'Total International Visitor Arrivals By Inbound Tourism Markets', 'Total'
    )
)

# International Visitor Arrival Stay Days
inter_arrival_stay_days_df = wide2Long(inter_arrival_stay_days_df, 'visitor_arrivals', 'stay_days')
inter_arrival_stay_days_df = inter_arrival_stay_days_df.assign(
    stay_days=inter_arrival_stay_days_df['stay_days'].str.replace(
        'Total International Visitor Arrivals', 'Total'
    )
)

# International Visitor Arrival Age
inter_arrival_age_df = wide2Long(inter_arrival_age_df, 'visitor_arrivals', 'age')

# International Visitor Arrival Gender
# The age table also carries 'Males' / 'Females' rows; split them into their
# own frame. rename() without inplace returns a new frame, which avoids
# mutating a filtered selection (SettingWithCopyWarning).
is_gender_row = inter_arrival_age_df['age'].isin(['Males', 'Females'])
inter_arrival_gender_df = inter_arrival_age_df[is_gender_row].rename(columns={'age': 'gender'})
inter_arrival_age_df = inter_arrival_age_df[~is_gender_row]

# Hotel Statistics
hotel_stats_df = wide2Long(hotel_stats_df, 'hotels_data', 'hotels_info')

# Weather
weather_df = wide2Long(weather_df, 'weather_data', 'weather_info')

# Purpose of Visit
purpose_visit_df = wide2Long_2(purpose_visit_df, 'count', 'purpose')


# GDP
gdp_df = wide2Long_2(gdp_df, 'dollar', 'gdp')
gdp_df = gdp_df.drop(columns=['gdp'])

# Holidays
# Parse the date column of each yearly file, then stack them under a fresh
# 0..n-1 index.
holiday_frames = [holiday_14_22_df, holiday_23_df, holiday_24_df]
for holiday_frame in holiday_frames:
    holiday_frame['date'] = pd.to_datetime(holiday_frame['date'], format='%Y-%m-%d')
holidays = pd.concat(holiday_frames, ignore_index=True)


### Converting Data Types

In [1268]:
# astype({...}) returns a new frame each time, so no column is assigned into a
# filtered selection of another frame (which triggers SettingWithCopyWarning).
inter_arrival_stay_days_df = inter_arrival_stay_days_df.astype({'visitor_arrivals': 'int'})

# Drop the 'Not Stated' age rows before casting the counts to integers.
inter_arrival_age_df = (
    inter_arrival_age_df[inter_arrival_age_df['age'] != 'Not Stated']
    .astype({'visitor_arrivals': 'int'})
)

inter_arrival_gender_df = inter_arrival_gender_df.astype({'visitor_arrivals': 'int'})

weather_df = weather_df.astype({'weather_data': 'float'})

### Making Date Index


In [1269]:
def _with_date_index(frame, fmt=None):
    """Return a new frame indexed by its 'date' values parsed as datetimes.

    Works whether 'date' is currently the index or an ordinary column.
    ``fmt`` is forwarded to ``pd.to_datetime`` (e.g. '%Y' for yearly data).
    """
    out = frame.reset_index()
    out['date'] = pd.to_datetime(out['date'], format=fmt)
    return out.set_index('date')


# Rebinding a loop variable (`for df in frames: df = df.reset_index()`) leaves
# the original frames untouched, so each frame is reassigned explicitly.
inter_arrival_stay_days_df = _with_date_index(inter_arrival_stay_days_df)
inter_arrival_age_df = _with_date_index(inter_arrival_age_df)
hotel_stats_df = _with_date_index(hotel_stats_df)
weather_df = _with_date_index(weather_df)

# On the first run `holidays` has a positional index, which reset_index()
# turns into an 'index' column; errors='ignore' keeps the cell re-runnable
# once that column is gone.
holidays = _with_date_index(holidays).drop(columns='index', errors='ignore')

datasets1 = [
    inter_arrival_stay_days_df,
    inter_arrival_age_df,
    hotel_stats_df,
    weather_df,
    holidays,
]

# Yearly tables carry integer years; parse them as January 1st of that year.
gdp_df = _with_date_index(gdp_df, fmt='%Y')
purpose_visit_df = _with_date_index(purpose_visit_df, fmt='%Y')

## EDA

In [1270]:
# Keep only the aggregate ('Total') series; it is reused by later cells.
inter_arrival_df_total = inter_arrival_df.loc[inter_arrival_df['region'].eq('Total')].copy()

# Average the counts per date and expose the date as a column for plotting.
inter_arrival_df_total_ = (
    inter_arrival_df_total
    .groupby('date')['visitor_arrivals']
    .mean()
    .reset_index()
)

fig = px.line(
    data_frame=inter_arrival_df_total_,
    x='date',
    y='visitor_arrivals',
    title='Visitor Arrivals over Time',
    labels={'date':'Date', 'visitor_arrivals': 'Visitors Count'},
)
fig.show()

In [1271]:
# Mean arrivals per region, largest first, without the aggregate 'Total' row;
# the five highest are kept.
inter_arrival_df_per_country = (
    inter_arrival_df
    .groupby('region')['visitor_arrivals']
    .mean()
    .sort_values(ascending=False)
    .drop(index='Total', errors='ignore')
    .head(5)
    .to_frame()
    .reset_index()
)

fig = px.bar(
    data_frame=inter_arrival_df_per_country,
    x='region',
    y='visitor_arrivals',
    title='Top 5 Country by Visitor Arrivals',
    labels={'region': 'Country', 'visitor_arrivals': 'Visitors Count'},
)
fig.show()


In [1272]:
# Mean arrivals per age group, leaving out the aggregate 'Total' row.
inter_arrival_age_df_group = (
    inter_arrival_age_df
    .groupby('age')['visitor_arrivals']
    .mean()
    .drop(index='Total', errors='ignore')
    .to_frame()
    .reset_index()
)

fig = px.bar(
    data_frame=inter_arrival_age_df_group,
    x='age',
    y='visitor_arrivals',
    title='Visitor Arrivals by Age Group',
    labels={'age':'Age', 'visitor_arrivals': 'Visitors Count'},
)
fig.show()

In [1273]:
# Mean arrivals per length-of-stay bucket, largest first. The 'Total' and
# 'Average Length Of Stay' rows are removed before plotting.
inter_arrival_stay_days_df_group = (
    inter_arrival_stay_days_df
    .groupby('stay_days')['visitor_arrivals']
    .mean()
    .sort_values(ascending=False)
    .drop(index=['Total', 'Average Length Of Stay'], errors='ignore')
    .to_frame()
    .reset_index()
)

fig = px.bar(
    data_frame=inter_arrival_stay_days_df_group,
    x='stay_days',
    y='visitor_arrivals',
    title='Visitor Arrivals by Stay Days',
    labels={'stay_days':'Stay Days', 'visitor_arrivals': 'Visitors Count'},
)
fig.show()

In [1274]:
# Expose the date index as a regular column so it can be referenced by name.
gdp_df_ = gdp_df.reset_index()

fig = px.line(
    data_frame=gdp_df_,
    x='date',
    y='dollar',
    title='GDP of Singapore',
    labels={'date':'Date', 'dollar':'Dollar'},
)
fig.show()

In [1275]:
# Share of each visit purpose; the aggregate 'Total' row is left out of the pie.
purpose_visit_df_ = purpose_visit_df.loc[purpose_visit_df['purpose'].ne('Total')].reset_index()

fig = px.pie(
    data_frame=purpose_visit_df_,
    values='count',
    names='purpose',
    title='Purpose of Visit per Arrivals',
)
fig.show()

In [1276]:
# Pair total arrivals with the 'Highest Daily Rainfall Total' series on date.
weather_temp_df = weather_df.loc[weather_df['weather_info'].eq('Highest Daily Rainfall Total')]
inter_arrival_total_df = inter_arrival_df.loc[inter_arrival_df['region'].eq('Total')]
merged_df = pd.merge(inter_arrival_total_df, weather_temp_df, on="date")

fig = px.scatter(
    data_frame=merged_df,
    x='weather_data',
    y='visitor_arrivals',
    title='Visitor Arrivals Vs. Daily Rainfall',
    labels={'weather_data': 'Daily Rainfall', 'visitor_arrivals': 'Visitor Arrivals'},
)
fig.show()


In [1277]:
# Select the 'Hotel Room Revenue' series and expose the date as a column.
hotel_stats_df_ = hotel_stats_df.loc[hotel_stats_df['hotels_info'].eq('Hotel Room Revenue')].reset_index()

fig = px.line(
    data_frame=hotel_stats_df_,
    x='date',
    y='hotels_data',
    title='Hotel Revenue over Time',
    labels={'date': 'Year', 'hotels_data': 'Revenue'},
)
fig.show()

In [1278]:
# Join the occupancy-rate series with total arrivals on their shared date index.
hotel_stats_occupy_df = hotel_stats_df[hotel_stats_df["hotels_info"] == "Standard Average Hotel Occupancy Rate"]
merged_data = pd.merge(hotel_stats_occupy_df, inter_arrival_df_total, left_index=True, right_index=True)

# Title spelling corrected ("Occupancy") and axis labels added so the axes do
# not show raw column names, in line with the other figures in this notebook.
fig = px.scatter(
    merged_data,
    x='visitor_arrivals',
    y='hotels_data',
    title='Hotel Occupancy Rate Vs. Visitor Arrivals',
    labels={'visitor_arrivals': 'Visitor Arrivals', 'hotels_data': 'Occupancy Rate'},
)
fig.show()

In [1279]:
holiday_dates = holidays.index

# Arrivals are monthly observations stamped on the first day of the month,
# while holidays are individual calendar days. Comparing the raw timestamps
# would only flag holidays that fall on the 1st of a month, so both sides are
# compared at month granularity instead.
holiday_months = holiday_dates.to_period('M')
arrival_months = inter_arrival_df_total.index.to_period('M')
holiday_arrivals = inter_arrival_df_total.loc[arrival_months.isin(holiday_months)]

fig = go.Figure()

# Full arrivals series
fig.add_trace(
    go.Scatter(
        x=inter_arrival_df_total.index,
        y=inter_arrival_df_total['visitor_arrivals'],
        marker=dict(color='blue'),
        name='Regular Arrivals'
    )
)

# Months that contain at least one holiday, overlaid as red markers
fig.add_trace(
    go.Scatter(
        x=holiday_arrivals.index,
        y=holiday_arrivals['visitor_arrivals'],
        mode='markers',
        marker=dict(color='red', size=8),
        name='Holiday Arrivals'
    )
)

fig.update_layout(
    title='Visitor Arrivals Over Time',
    xaxis_title = 'Date',
    yaxis_title = 'Arrivals',
    legend_title = 'Type of Arrivals'
)

fig.show()

In [1280]:
# Inner join of total arrivals and GDP on their date indexes.
# NOTE(review): GDP rows are stamped January 1st of each year and arrivals are
# stamped on the first of each month, so this join appears to keep only the
# January observation of every year - confirm that is intended.
merged_gdp = inter_arrival_df_total.merge(gdp_df, left_index=True, right_index=True)

arrivals_series = merged_gdp['visitor_arrivals']
gdp_series = merged_gdp['dollar']

mean_arrivals, std_arrivals = arrivals_series.mean(), arrivals_series.std()
mean_gdp, std_gdp = gdp_series.mean(), gdp_series.std()

# z-score both series so they can share one axis
merged_gdp = merged_gdp.assign(
    standardized_visitor_arrivals=(arrivals_series - mean_arrivals) / std_arrivals,
    standardized_gdp=(gdp_series - mean_gdp) / std_gdp,
)

fig = go.Figure()

for standardized_col, trace_label in (
    ('standardized_visitor_arrivals', 'Visitor Arrivals'),
    ('standardized_gdp', 'GDP'),
):
    fig.add_trace(
        go.Scatter(x=merged_gdp.index, y=merged_gdp[standardized_col], name=trace_label),
    )

fig.update_layout(
    title='Visitor Arrivals Vs. GDP',
    xaxis_title='Year',
    yaxis_title='Standardized Value',
)

fig.show()

## Preparing for modeling

In [1281]:
# Tag every breakdown with the dimension it describes and drop aggregate rows.
# assign() returns a new frame, so no column is written into a filtered
# selection of another frame (SettingWithCopyWarning).
inter_arrival_gender_df = inter_arrival_gender_df.assign(category='gender')

inter_arrival_age_df = (
    inter_arrival_age_df[inter_arrival_age_df['age'] != 'Total']
    .assign(category='age_bracket')
)

# 'Average Length Of Stay' is excluded from the arrivals-by-stay-days chart
# because it is not an arrival count; it is excluded here as well so it does
# not end up in the 'visitor_arrivals' column of the modeling table.
stay_rows_to_drop = ['Total', 'Average Length Of Stay']
inter_arrival_stay_days_df = (
    inter_arrival_stay_days_df[~inter_arrival_stay_days_df['stay_days'].isin(stay_rows_to_drop)]
    .assign(category='stay_days')
)

inter_arrival_df = (
    inter_arrival_df[inter_arrival_df['region'] != 'Total']
    .assign(category='region')
)

purpose_visit_df = (
    purpose_visit_df[purpose_visit_df['purpose'] != 'Total']
    .assign(category='purpose_of_visit')
)

In [1282]:
# Give every breakdown the same column layout: the dimension label goes to
# "value" and the measure to "visitor_arrivals".
inter_arrival_df = inter_arrival_df.rename({"region": "value"}, axis="columns")
inter_arrival_stay_days_df = inter_arrival_stay_days_df.rename({"stay_days": "value"}, axis="columns")
inter_arrival_age_df = inter_arrival_age_df.rename({"age": "value"}, axis="columns")
inter_arrival_gender_df = inter_arrival_gender_df.rename({"gender": "value"}, axis="columns")

# Purpose of visit also needs its measure column renamed.
purpose_visit_df = purpose_visit_df.rename(
    {"purpose": "value", "count": "visitor_arrivals"},
    axis="columns",
)

In [1283]:
# Collapse the daily holiday calendar to one row per month.
# Names are de-duplicated and sorted *before* being joined, so a holiday name
# that itself contains ", " is kept intact instead of being split apart.
holidays = (
    holidays.reset_index()
    .assign(year_month=lambda frame: frame['date'].dt.to_period('M'))
    .groupby('year_month')['holiday']
    .agg(lambda names: ', '.join(sorted(set(names))))
    .reset_index()
)

# Stamp each month on its first day so it can be joined on 'date' later.
holidays['date'] = holidays['year_month'].dt.to_timestamp()
holidays = holidays.drop(columns='year_month')

In [1284]:
def merge_dfs(merge_df_, df):
    """Left-join a date-indexed yearly table onto ``merge_df_`` by calendar year.

    Parameters
    ----------
    merge_df_ : pandas.DataFrame
        Left table; must contain a 'year' column. Its own 'date' column, if
        present, is kept unchanged.
    df : pandas.DataFrame
        Right table with a datetime 'date' index (or a 'date' column after
        ``reset_index``). Its year is derived from 'date' and used as the key.

    Returns
    -------
    pandas.DataFrame
        ``merge_df_`` with the columns of ``df`` appended; rows without a
        matching year get missing values. Neither input is modified.
    """
    yearly = df.reset_index()
    yearly['year'] = yearly['date'].dt.year
    # Only the year is needed as a key; dropping the right-hand 'date' up front
    # avoids the date_x / date_y suffixes on the result.
    yearly = yearly.drop(columns='date')

    # Merge onto the frame that was passed in, not onto a module-level variable.
    return merge_df_.merge(yearly, on='year', how='left')
    
# Stack every breakdown into one long table and derive the calendar year used
# as the key for the yearly GDP join.
merge_df = (
    pd.concat([
        inter_arrival_gender_df,
        inter_arrival_age_df,
        inter_arrival_stay_days_df,
        inter_arrival_df,
        purpose_visit_df,
    ])
    .reset_index()
    .assign(year=lambda frame: frame['date'].dt.year)
)

# Attach GDP by year and expose it under the name 'gdp'.
merge_df = merge_dfs(merge_df, gdp_df).rename(columns={'dollar': 'gdp'})

# Attach the monthly holiday list; months without one are labelled explicitly.
merge_df = (
    merge_df
    .merge(holidays, on='date', how='left')
    .assign(holiday=lambda frame: frame['holiday'].fillna('No Holiday'))
)

In [1292]:
# Show five randomly chosen rows of the long-format weather table.
weather_df.sample(n=5)

Unnamed: 0_level_0,weather_info,weather_data
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2024-06-01,Number Of Rainy Days,18.0
2024-02-01,Air Temperature Means Daily Minimum,25.8
2021-02-01,Air Temperature Means Daily Minimum,24.8
2022-04-01,Air Temperature Absolute Extremes Minimum,23.1
2024-07-01,Air Temperature Means Daily Maximum,32.3
