# Data Preparation

In [2]:
import pandas as pd
import plotly.express as px
import matplotlib as plt

In [3]:
# Python and library versions differ from machine to machine, so the same code
# can emit different warnings. Silence all of them to keep the notebook output
# free of distracting, annoying messages.
import warnings
warnings.filterwarnings(action="ignore")

## Licence of use

In [4]:
# Read the Irish passenger statistics from "TAM07.csv" and display the frame.
df_irl = pd.read_csv(filepath_or_buffer="TAM07.csv")
df_irl

Unnamed: 0,Statistic Label,Month,Airports in Ireland,Country,Direction,Flight Type,UNIT,VALUE
0,Passengers,2023 January,Dublin,All Countries,Arrival,Scheduled,Thousand,1056.4
1,Passengers,2023 January,Dublin,All Countries,Departure,Scheduled,Thousand,1014.5
2,Passengers,2023 January,Dublin,Ireland (domestic),Arrival,Scheduled,Thousand,5.7
3,Passengers,2023 January,Dublin,Ireland (domestic),Departure,Scheduled,Thousand,3.7
4,Passengers,2023 January,Dublin,Austria,Arrival,Scheduled,Thousand,8.7
...,...,...,...,...,...,...,...,...
1057,Passengers,2023 September,Dublin,United Arab Emirates,Departure,Scheduled,Thousand,26.3
1058,Passengers,2023 September,Dublin,Other Asian countries (4),Arrival,Scheduled,Thousand,16.4
1059,Passengers,2023 September,Dublin,Other Asian countries (4),Departure,Scheduled,Thousand,14.8
1060,Passengers,2023 September,Dublin,Oceania and Polar regions (1),Arrival,Scheduled,Thousand,0.0


In [5]:
# Read the Hong Kong daily passenger traffic CSV and display the frame.
df_hkg = pd.read_csv(filepath_or_buffer="statistics_on_daily_passenger_traffic.csv")
df_hkg

Unnamed: 0,Date,Control Point,Arrival / Departure,Hong Kong Residents,Mainland Visitors,Other Visitors,Total,Unnamed: 7
0,01-01-2021,Airport,Arrival,341,0,9,350,
1,01-01-2021,Airport,Departure,803,17,28,848,
2,01-01-2021,Express Rail Link West Kowloon,Arrival,0,0,0,0,
3,01-01-2021,Express Rail Link West Kowloon,Departure,0,0,0,0,
4,01-01-2021,Hung Hom,Arrival,0,0,0,0,
...,...,...,...,...,...,...,...,...
33501,26-12-2023,Harbour Control,Departure,0,0,17,17,
33502,26-12-2023,Kai Tak Cruise Terminal,Arrival,0,0,0,0,
33503,26-12-2023,Kai Tak Cruise Terminal,Departure,0,0,0,0,
33504,26-12-2023,Macau Ferry Terminal,Arrival,12269,834,2995,16098,


## Making sure that my dataset is ready to be explored - EDA
    - df.info() # this single command shows the shape (df.shape) and the column types (df.dtypes);
        the shape of each DataFrame can also be seen just above, where the DataFrames were created.
    

In [30]:
# Summarise the Irish frame: row count, column names, non-null counts and dtypes.
# NOTE(review): the saved output below (472 rows, including "real_number") does not
# match the preview shown at load time (row labels up to 1060, no "real_number"),
# so it looks like it was captured from a later kernel state - re-run after
# Restart & Run All to confirm.
df_irl.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 472 entries, 0 to 471
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Month                472 non-null    object 
 1   Airports in Ireland  472 non-null    object 
 2   Country              472 non-null    object 
 3   Direction            472 non-null    object 
 4   Flight Type          472 non-null    object 
 5   VALUE                472 non-null    float64
 6   real_number          472 non-null    int64  
dtypes: float64(1), int64(1), object(5)
memory usage: 25.9+ KB


In [13]:
# Summarise the Hong Kong frame: row count, column names, non-null counts and dtypes.
# The saved output below reports 0 non-null values for the "Unnamed: 7" column.
df_hkg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33506 entries, 0 to 33505
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Date                 33506 non-null  object 
 1   Control Point        33506 non-null  object 
 2   Arrival / Departure  33506 non-null  object 
 3   Hong Kong Residents  33506 non-null  int64  
 4   Mainland Visitors    33506 non-null  int64  
 5   Other Visitors       33506 non-null  int64  
 6   Total                33506 non-null  int64  
 7   Unnamed: 7           0 non-null      float64
dtypes: float64(1), int64(4), object(3)
memory usage: 2.0+ MB


## Data Cleaning
 

In [6]:
# Verify whether the Irish data contains fully duplicated (inconsistent) rows.
# The printed value is the shape tuple (rows, columns), so its first entry is
# the number of duplicated rows.
duplicate_rows_df = df_irl.loc[df_irl.duplicated(keep="first")]
print("number of duplicate rows: ", duplicate_rows_df.shape)

number of duplicate rows:  (0, 8)


In [7]:
# Same duplicate check for the Hong Kong data; the printed value is the shape
# tuple (rows, columns) of the frame holding the duplicated rows.
duplicate_rows_df1 = df_hkg.loc[df_hkg.duplicated(keep="first")]
print("number of duplicate rows: ", duplicate_rows_df1.shape)

number of duplicate rows:  (0, 8)


In [8]:
# Cleaning my dataset - these 2 columns are not needed - Irish DF.
# The result of drop() is assigned back instead of mutating the frame in place,
# and errors="ignore" skips labels that are already gone, so re-running this
# cell no longer raises a KeyError.
to_drop = ["Statistic Label", "UNIT"]
df_irl = df_irl.drop(columns=to_drop, errors="ignore")

In [9]:
# Cleaning my dataset - this column is not needed - HK DF.
# The result of drop() is assigned back instead of mutating the frame in place,
# and errors="ignore" skips the label when it is already gone, so re-running
# this cell no longer raises a KeyError.
df_hkg = df_hkg.drop(columns="Unnamed: 7", errors="ignore")

In [10]:
# Convert VALUE (reported in thousands) into whole passenger counts so the
# figures are easier to read and plot.
# The product is rounded before the integer cast because astype(int) truncates
# toward zero and binary floats are inexact: 32.3 * 1000 evaluates to
# 32299.999999999996, which would otherwise become 32299 instead of 32300.
df_irl["real_number"] = (df_irl["VALUE"] * 1000).round().astype(int)

In [11]:
# Keep only the rows whose "Control Point" text contains "Airport" and store
# them, re-indexed from 0, in a new DataFrame.
df_hkg1 = (
    df_hkg
    .loc[df_hkg["Control Point"].str.contains("Airport")]
    .reset_index(drop=True)
)

In [12]:
# Display the airport-only frame built in the previous cell.
df_hkg1

Unnamed: 0,Date,Control Point,Arrival / Departure,Hong Kong Residents,Mainland Visitors,Other Visitors,Total
0,01-01-2021,Airport,Arrival,341,0,9,350
1,01-01-2021,Airport,Departure,803,17,28,848
2,02-01-2021,Airport,Arrival,363,10,10,383
3,02-01-2021,Airport,Departure,940,22,33,995
4,03-01-2021,Airport,Arrival,880,4,36,920
...,...,...,...,...,...,...,...
2175,24-12-2023,Airport,Departure,44507,6719,14920,66146
2176,25-12-2023,Airport,Arrival,30783,6998,20491,58272
2177,25-12-2023,Airport,Departure,33244,6654,17880,57778
2178,26-12-2023,Airport,Arrival,47795,5423,18787,72005


In [13]:
# Parse the day-month-year strings in "Date" into real datetimes.
# NOTE(review): errors="coerce" turns any unparseable date into NaT, and the
# astype(int) casts below would then fail on the missing values - confirm that
# every date in the file follows the dd-mm-YYYY format.
df_hkg1["Date"] = pd.to_datetime(df_hkg1["Date"], format="%d-%m-%Y", errors="coerce")

# Add one integer column per calendar component of the parsed date.
for date_part in ("day", "month", "year"):
    df_hkg1[date_part] = getattr(df_hkg1["Date"].dt, date_part).astype(int)

In [14]:
# Display the frame again to check the parsed "Date" and the new day/month/year columns.
df_hkg1

Unnamed: 0,Date,Control Point,Arrival / Departure,Hong Kong Residents,Mainland Visitors,Other Visitors,Total,day,month,year
0,2021-01-01,Airport,Arrival,341,0,9,350,1,1,2021
1,2021-01-01,Airport,Departure,803,17,28,848,1,1,2021
2,2021-01-02,Airport,Arrival,363,10,10,383,2,1,2021
3,2021-01-02,Airport,Departure,940,22,33,995,2,1,2021
4,2021-01-03,Airport,Arrival,880,4,36,920,3,1,2021
...,...,...,...,...,...,...,...,...,...,...
2175,2023-12-24,Airport,Departure,44507,6719,14920,66146,24,12,2023
2176,2023-12-25,Airport,Arrival,30783,6998,20491,58272,25,12,2023
2177,2023-12-25,Airport,Departure,33244,6654,17880,57778,25,12,2023
2178,2023-12-26,Airport,Arrival,47795,5423,18787,72005,26,12,2023


In [15]:
# Keep the rows dated from 2023-01-01 to 2023-09-30, both ends included.
# NOTE(review): presumably chosen to match the Irish data, whose preview runs
# from 2023 January to 2023 September - confirm.
df_hkg_filtered = df_hkg1[df_hkg1["Date"].between("2023-01-01", "2023-09-30")]

In [16]:
# Display the rows kept by the date-range filter.
df_hkg_filtered

Unnamed: 0,Date,Control Point,Arrival / Departure,Hong Kong Residents,Mainland Visitors,Other Visitors,Total,day,month,year
1460,2023-01-01,Airport,Arrival,24925,1687,2456,29068,1,1,2023
1461,2023-01-01,Airport,Departure,14591,1648,4644,20883,1,1,2023
1462,2023-01-02,Airport,Arrival,28258,1624,1857,31739,2,1,2023
1463,2023-01-02,Airport,Departure,13367,1422,6713,21502,2,1,2023
1464,2023-01-03,Airport,Arrival,22993,1663,2459,27115,3,1,2023
...,...,...,...,...,...,...,...,...,...,...
2001,2023-09-28,Airport,Departure,27189,18098,12176,57463,28,9,2023
2002,2023-09-29,Airport,Arrival,19472,10744,13086,43302,29,9,2023
2003,2023-09-29,Airport,Departure,34032,16384,9648,60064,29,9,2023
2004,2023-09-30,Airport,Arrival,19131,10952,11050,41133,30,9,2023
