## Import dataset

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore') # Suppress warnings

# Read csv file into pandas dataframe 
# url = https://data.gov.ie/dataset/tmq01-visits-to-and-from-ireland/resource/d018a88d-6944-4728-aa62-cd05074ffa5e
df = pd.read_csv("TMQ11-Overseas-Trips-to-and-from-Ireland.csv")

## Explore Imported Dataset

#### Dataset describes travel to and from Ireland by routes between the years of 1985 and 2008.

In [2]:
# Prints out the first few rows
df.head(144)

Unnamed: 0,STATISTIC,Statistic Label,C02169V02615,Trips,TLIST(Q1),Quarter,UNIT,VALUE
0,TMQ11,Overseas Trips to and from Ireland,-,All Trips,20081,2008Q1,Thousand,3304.7
1,TMQ11,Overseas Trips to and from Ireland,-,All Trips,20082,2008Q2,Thousand,4210.2
2,TMQ11,Overseas Trips to and from Ireland,-,All Trips,20083,2008Q3,Thousand,4900.7
3,TMQ11,Overseas Trips to and from Ireland,-,All Trips,20084,2008Q4,Thousand,3300.8
4,TMQ11,Overseas Trips to and from Ireland,-,All Trips,20091,2009Q1,Thousand,2941.9
...,...,...,...,...,...,...,...,...
139,TMQ11,Overseas Trips to and from Ireland,4,Overseas Trips to Ireland by Non-Residents,20184,2018Q4,Thousand,2412.8
140,TMQ11,Overseas Trips to and from Ireland,4,Overseas Trips to Ireland by Non-Residents,20191,2019Q1,Thousand,2026.7
141,TMQ11,Overseas Trips to and from Ireland,4,Overseas Trips to Ireland by Non-Residents,20192,2019Q2,Thousand,3021.8
142,TMQ11,Overseas Trips to and from Ireland,4,Overseas Trips to Ireland by Non-Residents,20193,2019Q3,Thousand,3334.4


In [3]:
# Dataframe is composed of 8 Columns and 960 Rows
df.shape

(144, 8)

In [4]:
df.describe()

Unnamed: 0,TLIST(Q1),VALUE
count,144.0,144.0
mean,20137.5,2560.463889
std,34.65918,1144.399775
min,20081.0,1084.8
25%,20109.25,1720.075
50%,20137.5,2181.9
75%,20165.75,3226.475
max,20194.0,6108.4


#### Dataset contains categorical and quantitative variables 
#### Categorical = 
#### Quantitative = 

In [5]:
df.dtypes

STATISTIC           object
Statistic Label     object
C02169V02615        object
Trips               object
TLIST(Q1)            int64
Quarter             object
UNIT                object
VALUE              float64
dtype: object

## Edit Dataframe - Data Cleaning

### Remove, Rename and Transform columns  

In [6]:
# Remove 'UNIT' column as 'VALUE' column heading will be amended to reflect units. 
# Remove 'ROUTE' column as 'Route of Travel' contains the same information in human readable language.
# Remove 'STATISTIC' column as this data is contained within 'Quarter' and 'Statistic Label' columns in human readable language.
tourism_df = df.drop(columns=['UNIT', 'ROUTE', 'STATISTIC'])

# Rename 'TLIST(Q1)' column to Year
tourism_df.rename(columns={"TLIST(Q1)": "Year"}, inplace=True)

# Last character in 'Year' column refers to quarter (1-4) 
# Remove last character from by dividing by 10 and saving as integer to remove non-whole numbers. e.g. 19854 -> 1985.4 = 1985
tourism_df["Year"] = tourism_df["Year"].transform(lambda x: x / 10).astype(int)

# The'Quarter' column contains the year and Q before the number designating the quarter. e.g. 1985Q4
# Transform 'Quarter' column, reassigning last character of each entry as integer reflecting the quarter. e.g. 1985Q4 = 4
tourism_df["Quarter"] = tourism_df["Quarter"].transform(lambda x: str(x)[-1]).astype(int)

# Rename 'Statistic Label' column for Inbound/Outbound
tourism_df.rename(columns={"Statistic Label": "Inbound/Outbound"}, inplace=True)

# Rename Column for Travellers x 1,000
tourism_df.rename(columns={"VALUE": "Travellers (x 1,000)"}, inplace=True)

KeyError: "['ROUTE'] not found in axis"

## Explore Edited Dataframe

In [None]:
# Select cross channel travel by both sea and air for each quarter and year
cross_channel_df = tourism_df[tourism_df.isin(['Air Cross Channel', 'Sea Cross Channel'])['Route of Travel']==True]
pd.set_option('display.max_rows', cross_channel_df.shape[0]+1)
cross_channel_df

In [None]:
# Combine Air and Sea cross channel travel into Cross channel travel dataframe
combined_cross_channel = cross_channel_df.groupby(['Year', 'Quarter', 'Inbound/Outbound'], as_index=False).sum()
combined_cross_channel['Route of Travel'] = 'Cross Channel'
combined_cross_channel

In [None]:
tourism_df = tourism_df.drop(
    tourism_df[tourism_df.isin(['Air Cross Channel', 'Sea Cross Channel'])['Route of Travel']==True].index)

In [None]:
# Prints out the first few rows
pd.set_option('display.max_rows', tourism_df.shape[0]+1)
tourism_df

In [None]:
tourism_df = pd.concat([tourism_df, combined_cross_channel], axis=0, ignore_index=True)
pd.set_option('display.max_rows', tourism_df.shape[0]+1)
tourism_df

In [None]:
# Assess how many Series and Rows in tidied dataframe
tourism_df.shape

In [None]:
tourism_df.dtypes

In [None]:
tourism_df.describe()

## Create Seperate Dataframe of Inbound Travel to Ireland

In [None]:
# Create new seperate dataframes for inbound Irish travel
intourism_df = tourism_df[tourism_df['Inbound/Outbound'] == 'Visitors to Ireland']
intourism_df = intourism_df.drop(columns=['Inbound/Outbound'])
#intourism_df = intourism_df.drop(intourism_df[intourism_df['Route of Travel'] == 'All Routes'].index)

In [None]:
(intourism_df.head(5))

In [None]:
intourism_df.shape

In [None]:
intourism_df.describe()

In [7]:
# 5 categories for Routes of Travel with column 0 = Sum of columns 1-4
intourism_df['Route of Travel'].value_counts() 

NameError: name 'intourism_df' is not defined

In [8]:
# Create dataframe with values for each quarter of the year summed to give travel into Ireland per year by each route 
in_grouped = intourism_df.groupby(['Year', 'Route of Travel'], as_index=False).sum().drop(columns=['Quarter'])
in_grouped

NameError: name 'intourism_df' is not defined

In [9]:
# Assess demographics of tourists travelling to Ireland from different regions over time
plt.figure(figsize=(12, 6))

ax = sns.barplot(data=in_grouped, x="Year", y="Travellers (x 1,000)", hue='Route of Travel')
ax.tick_params(axis='x', labelrotation=45)

NameError: name 'in_grouped' is not defined

<Figure size 1200x600 with 0 Axes>

## Create Seperate Dataframe of Outbound Travel from Ireland

In [10]:
# Create new seperate dataframes for inbound Irish travel
outtourism_df = tourism_df[tourism_df['Inbound/Outbound'] == 'Visits Abroad by Irish Residents']
outtourism_df = outtourism_df.drop(columns=['Inbound/Outbound'])
#outtourism_df = outtourism_df.drop(outtourism_df[outtourism_df['Route of Travel'] == 'All Routes'].index)
#outtourism_df = outtourism_df.sum(outtourism_df[outtourism_df['Route of Travel'] == 'All Routes'].index)

NameError: name 'tourism_df' is not defined

In [11]:
(outtourism_df.tail(5))

NameError: name 'outtourism_df' is not defined

In [12]:
outtourism_df.shape

NameError: name 'outtourism_df' is not defined

In [13]:
outtourism_df.describe()

NameError: name 'outtourism_df' is not defined

In [14]:
# 5 categories for Routes of Travel with column 0 = Sum of columns 1-4
outtourism_df['Route of Travel'].value_counts() 

NameError: name 'outtourism_df' is not defined

In [15]:
# Create dataframe with values for each quarter of the year summed to give travel from Ireland per year by each route 
out_grouped = outtourism_df.groupby(['Year', 'Route of Travel'], as_index=False).sum().drop(columns=['Quarter'])
out_grouped

NameError: name 'outtourism_df' is not defined

In [16]:
# Assess demographics of tourists travelling from Ireland to different regions over time
plt.figure(figsize=(12, 6))

ax = sns.barplot(data=out_grouped, x="Year", y="Travellers (x 1,000)", hue='Route of Travel')
ax.tick_params(axis='x', labelrotation=45)

NameError: name 'out_grouped' is not defined

<Figure size 1200x600 with 0 Axes>

## Travel to Ireland by Quarter
Q1: January – March, Q2: April – June, Q3: July – September, Q4: October – December

In [17]:
# Assess demographics of tourists travelling to Ireland from different regions per Quarter
plt.figure(figsize=(12, 6))

ax = sns.barplot(data=intourism_df, x="Route of Travel", y="Travellers (x 1,000)", hue='Quarter')
ax.tick_params(axis='x', labelrotation=45)

NameError: name 'intourism_df' is not defined

<Figure size 1200x600 with 0 Axes>

In [18]:
# Assess demographics of tourists travelling to Ireland from different regions per Quarter
plt.figure(figsize=(12, 6))

ax = sns.barplot(data=intourism_df, x="Quarter", y="Travellers (x 1,000)", hue='Route of Travel')
ax.tick_params(axis='x', labelrotation=45)

NameError: name 'intourism_df' is not defined

<Figure size 1200x600 with 0 Axes>

## Travel from Ireland by Quarter

In [19]:
# Assess demographics of tourists travelling from Ireland to different regions per Quarter
plt.figure(figsize=(12, 6))

ax = sns.barplot(data=outtourism_df, x="Route of Travel", y="Travellers (x 1,000)", hue='Quarter')
ax.tick_params(axis='x', labelrotation=45)

NameError: name 'outtourism_df' is not defined

<Figure size 1200x600 with 0 Axes>

In [20]:
# Assess demographics of tourists travelling from Ireland to different regions per Quarter
plt.figure(figsize=(12, 6))

ax = sns.barplot(data=outtourism_df, x="Quarter", y="Travellers (x 1,000)", hue='Route of Travel')
ax.tick_params(axis='x', labelrotation=45)

NameError: name 'outtourism_df' is not defined

<Figure size 1200x600 with 0 Axes>

## Lineplot of Tourism out of Ireland

In [21]:
# Assess demographics of tourists travelling from Ireland to different regions per Quarter

plt.figure(figsize=(12, 6))

ax = sns.lineplot(data=outtourism_df, x="Year", y="Travellers (x 1,000)", hue='Route of Travel')
ax.tick_params(axis='x', labelrotation=45)

NameError: name 'outtourism_df' is not defined

<Figure size 1200x600 with 0 Axes>

In [22]:
# Assess demographics of tourists travelling from Ireland to different regions per Quarter
plt.figure(figsize=(12, 6))

ax = sns.lineplot(data=out_grouped, x="Year", y="Travellers (x 1,000)", hue='Route of Travel')
ax.tick_params(axis='x', labelrotation=45)

NameError: name 'out_grouped' is not defined

<Figure size 1200x600 with 0 Axes>

## Lineplot of Tourism into Ireland

In [23]:
# Assess demographics of tourists travelling from Ireland to different regions per Quarter
plt.figure(figsize=(12, 6))

ax = sns.lineplot(data=in_grouped, x="Year", y="Travellers (x 1,000)", hue='Route of Travel')
ax.tick_params(axis='x', labelrotation=45)

NameError: name 'in_grouped' is not defined

<Figure size 1200x600 with 0 Axes>

In [24]:
# Assess demographics of tourists travelling from Ireland to different regions per Quarter
plt.figure(figsize=(12, 6))

ax = sns.lineplot(data=intourism_df, x="Year", y="Travellers (x 1,000)", hue='Route of Travel')
ax.tick_params(axis='x', labelrotation=45)

NameError: name 'intourism_df' is not defined

<Figure size 1200x600 with 0 Axes>