# Data Preparation

Let's begin the data preparation for the whole project, which consists of:
    - collecting
    - cleaning
    - transforming
    - manipulating

the raw data in order to support a sound analysis.

In [None]:
import pandas as pd
import plotly.express as px
import matplotlib as plt

In [None]:
# Different Python and library versions on each machine can emit different warnings,
# so all warnings are silenced here to avoid distracting and annoying output.
# NOTE(review): this also hides deprecation warnings from every library used below.
import warnings
warnings.filterwarnings("ignore")

## Licence of use

Working with the Dublin airport dataset:
The dataset was collected from data.gov.ie - <https://data.gov.ie/dataset/tam07-passengers-freight-and-commercial-flights>
and is licensed under Creative Commons Attribution 4.0 (CC BY 4.0) <https://creativecommons.org/licenses/by/4.0/>

Working with the Hong Kong airport dataset:
The dataset was collected from data.gov.hk - <https://www.immd.gov.hk/opendata/eng/transport/immigration_clearance/statistics_on_daily_passenger_traffic.csv>
under the open data licence of DATA.GOV.HK <https://data.gov.hk/en/terms-and-conditions>

In [None]:
# Load the Irish dataset from the local file "TAM07.csv" and display it.
# Presumably this is the TAM07 export from data.gov.ie referenced in the licence section — confirm.
df_irl = pd.read_csv("TAM07.csv")
df_irl

In [None]:
# Load the Hong Kong daily passenger traffic dataset from the local CSV file and display it.
df_hkg = pd.read_csv("statistics_on_daily_passenger_traffic.csv")
df_hkg

## Making sure that my dataset is ready to be explored - EDA (Exploratory Data Analysis) - Understanding the data and patterns

    - df.info() # this single command shows both the shape (df.shape) and the column types (df.dtypes);
        the shape of each DataFrame is also visible just above, where the DataFrames were created.
    

In [None]:
df_irl.info()

In [None]:
df_hkg.info()

## Data Cleaning
 

In [None]:
# Verify whether the Irish dataset contains fully duplicated (inconsistent) rows
duplicate_rows_df = df_irl[df_irl.duplicated()]
# shape is a (rows, columns) tuple; only its first element is the number of duplicate rows
print("number of duplicate rows: ", duplicate_rows_df.shape[0])

In [None]:
# Verify whether the Hong Kong dataset contains fully duplicated rows
duplicate_rows_df1 = df_hkg[df_hkg.duplicated()]
# shape is a (rows, columns) tuple; only its first element is the number of duplicate rows
print("number of duplicate rows: ", duplicate_rows_df1.shape[0])

In [None]:
# Parse the "Month" labels (year followed by the full month name) into timestamps
month_start = pd.to_datetime(df_irl['Month'], format='%Y %B')
df_irl['Month'] = month_start

# Split the parsed timestamp into separate numeric year and month columns
df_irl['year'] = month_start.dt.year
df_irl['month'] = month_start.dt.month

In [None]:
# Scale VALUE to absolute numbers for easier reading and plotting
# (presumably VALUE is expressed in thousands — confirm against the UNIT column).
# Round before casting: astype(int) truncates towards zero, and a floating-point
# product can land just below the intended whole number, which would lose one unit.
df_irl["real_number"] = (df_irl["VALUE"] * 1000).round().astype(int)

In [None]:
# Remove the columns that are not used in the rest of the notebook
to_drop = ["Statistic Label", "UNIT", "Flight Type", "Month"]
df_irl = df_irl.drop(columns=to_drop)

In [None]:
df_irl

In [None]:
# Parse the dates; values that do not match DD-MM-YYYY become NaT instead of raising
df_hkg['Date'] = pd.to_datetime(df_hkg['Date'], format='%d-%m-%Y', errors='coerce')

# Rows whose date could not be parsed have no day/month/year, and astype(int)
# raises on missing values, so drop them before extracting the integer parts.
df_hkg = df_hkg.dropna(subset=['Date']).copy()

# Create new columns for day, month, and year as integers
df_hkg['day'] = df_hkg['Date'].dt.day.astype(int)
df_hkg['month'] = df_hkg['Date'].dt.month.astype(int)
df_hkg['year'] = df_hkg['Date'].dt.year.astype(int)

In [None]:
# Keep only the rows whose "Control Point" contains "Airport".
# na=False counts a missing control point as "no match"; without it a NaN in the
# column ends up in the mask and boolean indexing raises.
df_hkg = df_hkg[df_hkg["Control Point"].str.contains("Airport", na=False)].reset_index(drop=True)

In [None]:
# Keep the rows dated from 1 January 2023 to 30 September 2023 (both ends included)
in_period = df_hkg['Date'].between('2023-01-01', '2023-09-30')
df_hkg = df_hkg[in_period]

In [None]:
# Remove the columns that are not used in the rest of the notebook.
# df_hkg is the result of a boolean filter at this point; dropping in place on such
# a slice can raise SettingWithCopyWarning, so build a new frame and rebind the name.
to_drop = ["Unnamed: 7", "Date", "Control Point"]
df_hkg = df_hkg.drop(columns=to_drop)

In [None]:
df_hkg

In [None]:
# `df_hkg1` is never defined in this notebook (NameError on Run All);
# the airport-filtered data lives in `df_hkg`.
df_hkg

# Data preparation for sentiment analysis - Dublin and Hong Kong airports

In order to collect reviews about the Dublin and Hong Kong airports, the PRAW library is going to be used to collect comments from Reddit.

Note: The PRAW library is an officially supported way to interact with the Reddit API, and Reddit's Terms of Service allow its use for non-commercial purposes.

In [None]:
import praw
from datetime import datetime as dt
from dotenv import load_dotenv
from os import getenv

In [None]:
load_dotenv()

def revw_reddit(subreddit, ask_query, limit=400):
    """Search a subreddit and return the matching submissions as a DataFrame.

    The Reddit credentials are read from environment variables: APP_ID,
    APP_SECRET (or the legacy name AP_SECRET), APP_NAME, REDDIT_USERNAME
    and REDDIT_PASSWORD.

    Parameters
    ----------
    subreddit : str
        Name of the subreddit to search, e.g. "Dublin".
    ask_query : str
        Search query, e.g. "airport".
    limit : int, optional
        Maximum number of submissions requested from the search (default 400).

    Returns
    -------
    pandas.DataFrame
        One row per submission with the columns 'title' and 'text'
        (the submission's selftext).
    """
    # The secret was read from "AP_SECRET", which looks like a typo next to
    # APP_ID / APP_NAME. Prefer APP_SECRET but fall back to the old name so
    # existing .env files keep working.
    client_secret = getenv("APP_SECRET") or getenv("AP_SECRET")

    reddit = praw.Reddit(
        client_id=getenv("APP_ID"),
        client_secret=client_secret,
        user_agent=f"{getenv('APP_NAME')} (by u/{getenv('REDDIT_USERNAME')})",
        username=getenv("REDDIT_USERNAME"),
        password=getenv("REDDIT_PASSWORD")
    )

    # Use a separate name for the result instead of overwriting the `subreddit` argument
    submissions = reddit.subreddit(subreddit).search(ask_query, limit=limit)

    # Titles and texts are the basis of the sentiment analysis
    records = [
        {'title': submission.title, 'text': submission.selftext}
        for submission in submissions
    ]

    # Passing the columns explicitly keeps both of them when the search returns nothing
    return pd.DataFrame(records, columns=['title', 'text'])

In [None]:
load_dotenv() #TEST of reading my .env document

## Collecting reviews from Reddit - Dublin airport

In [None]:
# Search the "Dublin" subreddit for submissions matching "airport" and collect them into a DataFrame
df_dublin_reviews = revw_reddit("Dublin","airport")

In [None]:
df_dublin_reviews

## Collecting reviews from Reddit - Hong Kong airport

In [None]:
# Search the "HongKong" subreddit for submissions matching "airport" and collect them into a DataFrame
df_hkg_reviews = revw_reddit("HongKong","airport")

In [None]:
df_hkg_reviews

### Preparing the datasets

In [None]:
#DUBLIN

# Merge title and text in order to have more words to analyse
df_dublin_reviews['text'] = df_dublin_reviews['title'] + ' ' + df_dublin_reviews['text']
df_dublin_reviews = df_dublin_reviews.drop(columns=['title'])

# Remove special characters: everything except word characters, whitespace and full stops.
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a literal
# string by default, so without it nothing would be removed. The raw string keeps
# \w and \s from being read as (invalid) string escapes.
df_dublin_reviews['text'] = df_dublin_reviews['text'].str.replace(r'[^\w\s\.]', '', regex=True)

# Drop missing values and duplicates
df_dublin_reviews = df_dublin_reviews.dropna()
df_dublin_reviews = df_dublin_reviews.drop_duplicates()

# Export the dataset as CSV so it can be shared
df_dublin_reviews.to_csv('df_dublin_reviews.csv', index = False)


In [None]:
#HONG KONG

# Merge title and text in order to have more words to analyse
df_hkg_reviews['text'] = df_hkg_reviews['title'] + ' ' + df_hkg_reviews['text']
df_hkg_reviews = df_hkg_reviews.drop(columns=['title'])

# Remove special characters: everything except word characters, whitespace and full stops.
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a literal
# string by default, so without it nothing would be removed. The raw string keeps
# \w and \s from being read as (invalid) string escapes.
df_hkg_reviews['text'] = df_hkg_reviews['text'].str.replace(r'[^\w\s\.]', '', regex=True)

# Drop missing values and duplicates
df_hkg_reviews = df_hkg_reviews.dropna()
df_hkg_reviews = df_hkg_reviews.drop_duplicates()

# Export the dataset as CSV so it can be shared
df_hkg_reviews.to_csv('df_hkg_reviews.csv', index = False)


In [19]:
 df_irl

Unnamed: 0,Airports in Ireland,Country,Direction,VALUE,year,month,real_number
0,Dublin,All Countries,Arrival,1056.4,2023,1,1056400
1,Dublin,All Countries,Departure,1014.5,2023,1,1014500
2,Dublin,Ireland (domestic),Arrival,5.7,2023,1,5700
3,Dublin,Ireland (domestic),Departure,3.7,2023,1,3700
4,Dublin,Austria,Arrival,8.7,2023,1,8700
...,...,...,...,...,...,...,...
1057,Dublin,United Arab Emirates,Departure,26.3,2023,9,26300
1058,Dublin,Other Asian countries (4),Arrival,16.4,2023,9,16400
1059,Dublin,Other Asian countries (4),Departure,14.8,2023,9,14800
1060,Dublin,Oceania and Polar regions (1),Arrival,0.0,2023,9,0


In [None]:
df_hkg

In [None]:
# Just in case if needs an update
#!pip install dash --upgrade
#!pip install plotly --upgrade

In [38]:
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output
import pandas as pd
import plotly.graph_objects as go

# Load the prepared Irish dataset.
# NOTE(review): 'df_irl.csv' is not written anywhere in the visible part of this
# notebook — confirm it is exported before this cell runs.
df_irl = pd.read_csv('df_irl.csv')  # Replace with the actual filename or path

# Create a Dash app
app = dash.Dash(__name__)

# Initial month for default display
initial_month = df_irl['month'].unique()[0]

# Select the rows of the initial month once, so that `locations` and `z` come from
# the same rows. Passing the unfiltered Country column next to a month-filtered
# z column pairs countries with values from different rows.
initial_rows = df_irl[df_irl['month'] == initial_month]

# Create an interactive choropleth map with Plotly Graph Objects
fig = go.Figure()

# Add initial frame
# NOTE(review): the colour bar title mentions arrivals and a log scale, but the data
# is neither filtered by Direction nor log-transformed here — confirm the intent.
fig.add_trace(
    go.Choropleth(
        locations=initial_rows['Country'],
        z=initial_rows['real_number'],
        hoverinfo='location+z',
        locationmode='country names',
        colorscale='RdBu',  # Change the color scale here
        colorbar=dict(title='Number of Arrivals (log scale)'),
    )
)

# Add colorbar for reference
# NOTE(review): the trace above sets its own colorbar and does not reference a shared
# coloraxis, so this may have no visible effect — confirm.
fig.update_layout(coloraxis_colorbar=dict(title="Number of Arrivals (log scale)"))

# Update layout for larger graph
fig.update_layout(
    height=800,  # Change the height as per your preference
)

# Define layout for the app
app.layout = html.Div([
    dcc.Graph(id='air-traffic-map', figure=fig),
])

# Define callback that rebuilds the animated choropleth (one frame per month)
@app.callback(
    Output('air-traffic-map', 'figure'),
    [Input('air-traffic-map', 'relayoutData')]
)
def update_choropleth_map(relayout_data):
    """Return the choropleth figure with one animation frame per month.

    The callback is triggered by the graph's own ``relayoutData``; the value
    itself is not used.

    Parameters
    ----------
    relayout_data : dict or None
        Relayout event data of the 'air-traffic-map' graph (ignored).

    Returns
    -------
    plotly.graph_objects.Figure
        A copy of the base figure with frames, play/pause buttons and a
        month slider attached.
    """
    months = df_irl['month'].unique()

    frames = []
    for frame_month in months:
        # Take locations and z from the same month subset so that every country
        # is paired with its own value (the unfiltered Country column would not
        # line up with the month-filtered values).
        month_rows = df_irl[df_irl['month'] == frame_month]
        frame_data = go.Choropleth(
            locations=month_rows['Country'],
            z=month_rows['real_number'],
            hoverinfo='location+z',
            locationmode='country names',
            colorscale='RdBu',  # Change the color scale here
            colorbar=dict(title='Number of Arrivals (log scale)'),
        )
        frames.append(go.Frame(data=[frame_data], name=str(frame_month)))

    # Work on a copy: callbacks should not modify objects defined outside their
    # scope, and the module-level `fig` stays the untouched base figure.
    animated_fig = go.Figure(fig)

    # Update layout properties
    animated_fig.update_layout(
        updatemenus=[{
            'buttons': [
                {
                    'args': [None, {'frame': {'duration': 500, 'redraw': True}, 'fromcurrent': True}],
                    'label': 'Play',
                    'method': 'animate',
                },
                {
                    'args': [[None], {'frame': {'duration': 0, 'redraw': True}, 'mode': 'immediate', 'transition': {'duration': 0}}],
                    'label': 'Pause',
                    'method': 'animate',
                },
            ],
            'direction': 'left',
            'pad': {'r': 10, 't': 87},
            'showactive': False,
            'type': 'buttons',
            'x': 0.1,
            'xanchor': 'right',
            'y': 0,
            'yanchor': 'top',
        }],
        sliders=[{
            'active': 0,
            'yanchor': 'top',
            'xanchor': 'left',
            'currentvalue': {
                'font': {'size': 16},
                'prefix': 'Month:',
                'visible': True,
                'xanchor': 'right',
            },
            'transition': {'duration': 300, 'easing': 'cubic-in-out'},
            # One slider step per month; the step argument must match the frame name
            'steps': [{'args': [[str(month_label)], {'frame': {'duration': 300, 'redraw': True}, 'mode': 'immediate', 'transition': {'duration': 300}}],
                       'label': str(month_label),
                       'method': 'animate'} for month_label in months],
        }],
    )

    animated_fig.frames = frames

    return animated_fig

# Run the app
if __name__ == '__main__':
    # `run_server` is deprecated in favour of `run` and newer Dash releases may no
    # longer accept it, while older releases only provide `run_server`.
    # Use `run` when it exists and fall back otherwise.
    start_server = getattr(app, 'run', None) or app.run_server
    start_server(debug=True, port=8051)


In [None]:
# TODO: clean up the large, skewed numbers