# Forecasting Tourism Demand in Singapore

## Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Data Collection and Preprocessing

### Importing Dataset

In [40]:
# Each raw file path is kept in its own variable and loaded into the matching
# *_df frame immediately, so a path and its reader sit side by side.

# International visitor arrivals (three CSV breakdowns)
inter_arr = 'data/raw/international_visitor_arrivals.csv'
inter_arrival_df = pd.read_csv(inter_arr)

inter_arr_stay_days = 'data/raw/international_visitor_arrivals_stay_days.csv'
inter_arrival_stay_days_df = pd.read_csv(inter_arr_stay_days)

inter_arr_age = 'data/raw/international_visitor_arrivals_age.csv'
inter_arrival_age_df = pd.read_csv(inter_arr_age)

# Hotel statistics and weather (CSV)
hotel_stats = 'data/raw/hotel_statistics.csv'
hotel_stats_df = pd.read_csv(hotel_stats)

weather = 'data/raw/weather.csv'
weather_df = pd.read_csv(weather)

# Purpose of visit and money spent (Excel workbooks)
purpose_visit = 'data/raw/purpose_of_visit.xlsx'
purpose_visit_df = pd.read_excel(purpose_visit)

spent = 'data/raw/money_spent.xlsx'
spent_df = pd.read_excel(spent)

# Holiday calendars, one CSV per period
holiday_14_22 = 'data/raw/holidays_2014_2022.csv'
holiday_14_22_df = pd.read_csv(holiday_14_22)

holiday_23 = 'data/raw/holidays_2023.csv'
holiday_23_df = pd.read_csv(holiday_23)

holiday_24 = 'data/raw/holidays_2024.csv'
holiday_24_df = pd.read_csv(holiday_24)

### Converting wide to long format and cleaning

In [54]:
# Wide-to-long converter for tables with one column per "%Y%b" period
def wide2Long(_df, value_name, series_name, start_date='2015-01-01'):
    """Melt a wide table into long format indexed by date.

    Parameters
    ----------
    _df : pandas.DataFrame
        Wide table with a 'DataSeries' label column; every other column
        header must parse with the "%Y%b" format (e.g. "2024Nov").
    value_name : str
        Name given to the melted value column.
    series_name : str
        New name for the 'DataSeries' column.
    start_date : str or datetime-like, default '2015-01-01'
        Earliest date to keep (inclusive).

    Returns
    -------
    pandas.DataFrame
        Long frame indexed by 'date' with columns [series_name, value_name],
        restricted to dates on or after ``start_date``. Rows keep the order
        produced by ``melt``. The input frame is not modified.

    Raises
    ------
    KeyError
        If ``_df`` has no 'DataSeries' column.
    ValueError
        If a column header does not match the "%Y%b" format.
    """
    long_df = _df.melt(id_vars='DataSeries', var_name='date', value_name=value_name)
    long_df['date'] = pd.to_datetime(long_df['date'], format="%Y%b")
    long_df = long_df.set_index('date').rename(columns={'DataSeries': series_name})

    # Filter by comparison instead of a label slice: the melted index is not
    # sorted ascending, and a string slice on such a DatetimeIndex fails when
    # the boundary date itself is not present in the data.
    on_or_after_start = long_df.index >= pd.Timestamp(start_date)

    # Return an independent copy so callers can overwrite columns on the result.
    return long_df[on_or_after_start].copy()

def wide2Long_2(_df, value_name, series_name, start_year=2010):
    """Melt a wide table with one column per year into long format.

    Parameters
    ----------
    _df : pandas.DataFrame
        Wide table with a 'DataSeries' label column; every other column
        header must parse with the "%Y" format (e.g. "2019").
    value_name : str
        Name given to the melted value column.
    series_name : str
        New name for the 'DataSeries' column.
    start_year : int, default 2010
        Earliest year to keep (inclusive).

    Returns
    -------
    pandas.DataFrame
        Long frame indexed by integer year ('date') with columns
        [series_name, value_name], restricted to ``start_year`` onward and
        ordered newest year first. Rows within a year keep their melted order.
        The input frame is not modified.

    Raises
    ------
    KeyError
        If ``_df`` has no 'DataSeries' column.
    ValueError
        If a column header does not match the "%Y" format.
    """
    long_df = _df.melt(id_vars='DataSeries', var_name='date', value_name=value_name)
    long_df['date'] = pd.to_datetime(long_df['date'], format="%Y").dt.year
    long_df = long_df.set_index('date').rename(columns={'DataSeries': series_name})

    # The index holds integer years, so compare against an integer; a string
    # label slice such as '2010': does not match an integer index.
    long_df = long_df[long_df.index >= start_year]

    # Sort rows together with their labels. Assigning a separately sorted
    # index back onto the frame would attach years to the wrong rows whenever
    # the data was not already in descending order. mergesort is stable, so
    # the series order inside each year is preserved.
    return long_df.sort_index(ascending=False, kind='mergesort')

In [42]:
# ---- Tables with one column per month: melt, then shorten the "Total ..." label ----

# Visitor arrivals by inbound market
inter_arrival_df = wide2Long(inter_arrival_df, 'visitor_arrivals', 'region')
inter_arrival_df['region'] = inter_arrival_df['region'].str.replace(
    'Total International Visitor Arrivals By Inbound Tourism Markets',
    'Total',
)

# Visitor arrivals by length of stay
inter_arrival_stay_days_df = wide2Long(inter_arrival_stay_days_df, 'visitor_arrivals', 'stay_days')
inter_arrival_stay_days_df['stay_days'] = inter_arrival_stay_days_df['stay_days'].str.replace(
    'Total International Visitor Arrivals',
    'Total',
)

# Visitor arrivals by age; the same table also carries the 'Males' / 'Females'
# rows, which are split off into their own gender frame.
inter_arrival_age_df = wide2Long(inter_arrival_age_df, 'visitor_arrivals', 'age')
is_gender_row = inter_arrival_age_df['age'].isin(['Males','Females'])
inter_arrival_gender_df = inter_arrival_age_df[is_gender_row]
inter_arrival_age_df = inter_arrival_age_df[~is_gender_row]

# Hotel statistics and weather
hotel_stats_df = wide2Long(hotel_stats_df, 'hotels_data', 'hotels_info')
weather_df = wide2Long(weather_df, 'weather_data', 'weather_info')

# ---- Tables with one column per year ----

# Purpose of visit
purpose_visit_df = wide2Long_2(purpose_visit_df, 'count', 'purpose')

# Travel spend: the series label column is dropped, leaving only the value column
spent_df = wide2Long_2(spent_df, 'dollar(millions)', 'travel').drop(columns=['travel'])

# ---- Holidays: parse the date column of each file in place, then stack them ----
for holiday_frame in (holiday_14_22_df, holiday_23_df, holiday_24_df):
    holiday_frame['date'] = pd.to_datetime(holiday_frame['date'], format='%Y-%m-%d')

holidays = pd.concat(
    [holiday_14_22_df, holiday_23_df, holiday_24_df],
    ignore_index=True,
)


### Converting Data Types

In [60]:
# Inspect column dtypes of both arrival tables. The stored output of this cell
# lists a frame with a 'region' column followed by one with 'stay_days', so
# both frames are inspected here, in that order.
inter_arrival_df.info()
inter_arrival_stay_days_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2618 entries, 2024-11-01 to 2015-01-01
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   region            2618 non-null   object
 1   visitor_arrivals  2618 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 61.4+ KB
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1904 entries, 2024-11-01 to 2015-01-01
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   stay_days         1904 non-null   object 
 1   visitor_arrivals  1904 non-null   float64
dtypes: float64(1), object(1)
memory usage: 44.6+ KB
