In [None]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import display
from statsmodels.tsa.arima.model import ARIMA


In [None]:
# Folder that holds the CSV exports. The default is the original hard-coded
# location, so behaviour is unchanged unless COURSEWORK_DATA_DIR is set.
DATA_DIR = Path(os.environ.get('COURSEWORK_DATA_DIR', r'C:\Users\User\Desktop\COURSEWORK'))
ARRIVALS_COL = 'International tourism, number of arrivals'

df_arrivals = pd.read_csv(DATA_DIR / 'international-tourist-arrivals.csv')
df_departures = pd.read_csv(DATA_DIR / 'international-tourist-departures-per-1000.csv')
df_world_region = pd.read_csv(DATA_DIR / 'international-tourist-arrivals-by-world-region.csv')
df_air_passengers = pd.read_csv(DATA_DIR / 'air-passengers-carried.csv')

# Remove every row that has a missing value in ANY column, then parse the
# Year column (when present) as a four-digit year into timestamps.
for df in [df_arrivals, df_departures, df_world_region, df_air_passengers]:
    df.dropna(inplace=True)
    if 'Year' in df.columns:
        df['Year'] = pd.to_datetime(df['Year'], format='%Y')

# One row per (Entity, calendar year) with the summed arrivals; the 'World'
# aggregate is excluded so it cannot win the "most popular" ranking.
df_combined = (
    df_arrivals
    .groupby(['Entity', df_arrivals['Year'].dt.year])[ARRIVALS_COL]
    .sum()
    .reset_index()
)
df_combined = df_combined[df_combined['Entity'] != 'World']

# Per-entity z-score of yearly arrivals, used as an outlier filter.
df_combined['Z-score'] = df_combined.groupby('Entity')[ARRIVALS_COL].transform(
    lambda x: (x - x.mean()) / x.std()
)
# Keep rows within 2.5 standard deviations of the entity mean.
# NOTE: a NaN z-score (entity with a single year, or with constant arrivals so
# that std is 0) compares False here, so such entities are dropped entirely.
df_combined = df_combined[np.abs(df_combined['Z-score']) < 2.5]

# Rank entities by arrivals summed over all of their remaining years.
total_arrivals = df_combined.groupby('Entity')[ARRIVALS_COL].sum()
top_country = total_arrivals.idxmax()
bottom_country = total_arrivals.idxmin()


In [None]:
def forecast_country_arima(df, country_name, forecast_horizon=10):
    """Fit an ARIMA(1, 1, 1) model to one entity's yearly arrivals and forecast ahead.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Entity', 'Year' and
        'International tourism, number of arrivals'. 'Year' is expected to be
        numeric (it is converted with ``int`` to build the future years).
    country_name : str
        Value of the 'Entity' column to select.
    forecast_horizon : int, default 10
        Number of yearly steps to forecast; must be at least 1.

    Returns
    -------
    tuple
        ``(country_data, future_years, forecast)`` where ``country_data`` is the
        per-year summed history (columns 'Year' and the arrivals column),
        ``future_years`` is a list of ``forecast_horizon`` consecutive integer
        years starting right after the last observed year, and ``forecast`` is
        whatever ``model_fit.forecast`` returns for that many steps.

    Raises
    ------
    ValueError
        If ``forecast_horizon`` is smaller than 1 or if ``df`` has no rows for
        ``country_name``.

    Notes
    -----
    Observations are passed to ARIMA in year order without re-indexing, so any
    missing years in the history are treated as if the series were contiguous.
    """
    if forecast_horizon < 1:
        raise ValueError(f'forecast_horizon must be >= 1, got {forecast_horizon!r}')

    value_col = 'International tourism, number of arrivals'
    # groupby sorts by 'Year', so the last row is the most recent observation.
    country_data = (
        df.loc[df['Entity'] == country_name]
        .groupby('Year')[value_col]
        .sum()
        .reset_index()
    )
    if country_data.empty:
        # Fail early with a readable message instead of an IndexError below.
        raise ValueError(f'No arrivals data found for entity {country_name!r}')

    y = country_data[value_col]
    model_fit = ARIMA(y, order=(1, 1, 1)).fit()

    last_year = int(country_data['Year'].iloc[-1])
    future_years = list(range(last_year + 1, last_year + 1 + forecast_horizon))
    forecast = model_fit.forecast(steps=forecast_horizon)

    return country_data, future_years, forecast


In [None]:
def _draw_forecast_panel(ax, history, forecast_years, forecast_values, title, history_color, forecast_color):
    """Draw observed arrivals and forecast bars for one country on ``ax``."""
    ax.bar(history['Year'], history['International tourism, number of arrivals'], label='Actual Data', color=history_color)
    ax.bar(forecast_years, forecast_values, label='Forecast', color=forecast_color)
    ax.set_title(title)
    ax.set_xlabel('Year')
    ax.set_ylabel('Number of Tourists')
    ax.legend()


# Two side-by-side panels: left = entity with the largest total arrivals,
# right = entity with the smallest.
fig, axs = plt.subplots(1, 2, figsize=(15, 6))

country_data_top, future_top, predictions_top = forecast_country_arima(df_combined, top_country)
_draw_forecast_panel(
    axs[0],
    country_data_top,
    future_top,
    predictions_top,
    f'Forecast for Most Popular Country: {top_country}',
    'blue',
    'red',
)

country_data_bottom, future_bottom, predictions_bottom = forecast_country_arima(df_combined, bottom_country)
_draw_forecast_panel(
    axs[1],
    country_data_bottom,
    future_bottom,
    predictions_bottom,
    f'Forecast for Least Popular Country: {bottom_country}',
    'gray',
    'orange',
)

plt.tight_layout()
plt.show()


In [None]:
# Raw aviation-fatalities table, read from the notebook's working directory.
fatalities_csv = 'global-fatalities-from-aviation-accidents-and-hijackings.csv'
df_fatalities = pd.read_csv(fatalities_csv)

def preprocess_fatalities(df, year_col, value_cols):
    """Aggregate fatality columns into one total per year.

    Steps: drop rows with a missing year or missing value column, drop exact
    duplicate rows, parse the year as a four-digit year (unparsable years are
    discarded), sum ``value_cols`` row-wise, keep only rows whose total is
    strictly positive, and finally sum the totals per year.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; it is not modified.
    year_col : str
        Name of the column holding the year.
    value_cols : list of str
        Columns whose row-wise sum is the fatality total.

    Returns
    -------
    pandas.DataFrame
        Columns 'Year' (int64) and 'Fatalities', one row per year, sorted by
        year. Years whose total is zero or negative do not appear.
    """
    value_cols = list(value_cols)
    rows = df.dropna(subset=[year_col] + value_cols).drop_duplicates()

    # errors='coerce' turns unparsable years into NaT, which makes .dt.year a
    # float column; those rows are filtered out and the rest cast back to int
    # so callers can safely use the year in range() and as an axis value.
    years = pd.to_datetime(rows[year_col], format='%Y', errors='coerce').dt.year
    totals = rows[value_cols].sum(axis=1)
    keep = years.notna() & (totals > 0)

    return (
        totals[keep]
        .groupby(years[keep].astype('int64'))
        .sum()
        .rename('Fatalities')
        .rename_axis('Year')
        .reset_index()
    )

# Yearly fatality totals (hijackings + commercial airliners).
cleaned_df = preprocess_fatalities(
    df_fatalities,
    'Year',
    ['Hijacking fatalities', 'Fatalities from commercial airliners'],
)

yearly_fatalities = cleaned_df['Fatalities']
highest_total = yearly_fatalities.max()
lowest_total = yearly_fatalities.min()

# When several years tie, the first matching row is reported.
worst_year = cleaned_df.loc[yearly_fatalities == highest_total, 'Year'].values[0]
worst_value = int(highest_total)

best_year = cleaned_df.loc[yearly_fatalities == lowest_total, 'Year'].values[0]
best_value = int(lowest_total)


In [None]:
def _highlight_color(year):
    """Return the bar colour for ``year``: green for best, red for worst, gray otherwise."""
    if year == best_year:
        return 'green'
    if year == worst_year:
        return 'red'
    return 'gray'


plt.figure(figsize=(14, 7))
colors = [_highlight_color(year) for year in cleaned_df['Year']]
plt.bar(cleaned_df['Year'], cleaned_df['Fatalities'], color=colors)

# Dashed vertical markers double as legend entries for the two extremes.
for marker_label, marker_year, marker_color in (
    ('Best Year', best_year, 'green'),
    ('Worst Year', worst_year, 'red'),
):
    plt.axvline(x=marker_year, color=marker_color, linestyle='--', label=f'{marker_label}: {marker_year}')

plt.title('Aviation Fatalities: All Years with Highlighted Best and Worst Years')
plt.xlabel('Year')
plt.ylabel('Number of Fatalities')
plt.legend()
plt.grid(axis='y')
plt.show()


In [None]:
# Last calendar year (inclusive) covered by the forecast; also used in the title.
FORECAST_END_YEAR = 2030

# int() keeps range() working even if the Year column ended up as float
# (which happens when year parsing coerced at least one value to NaN).
last_observed_year = int(cleaned_df['Year'].iloc[-1])
forecast_years = list(range(last_observed_year + 1, FORECAST_END_YEAR + 1))
if not forecast_years:
    # Fail before fitting: there is nothing to forecast.
    raise ValueError(
        f'Data already reaches {last_observed_year}; nothing to forecast up to {FORECAST_END_YEAR}'
    )

# ARIMA(1, 1, 1) on the yearly totals. Rows are used in order as an evenly
# spaced series, so years absent from cleaned_df are not represented as gaps.
y = cleaned_df['Fatalities']
model = ARIMA(y, order=(1, 1, 1))
model_fit = model.fit()

forecast = model_fit.forecast(steps=len(forecast_years))

forecast_df = pd.DataFrame({'Year': forecast_years, 'Fatalities': forecast})

plt.figure(figsize=(14, 7))
plt.bar(forecast_df['Year'], forecast_df['Fatalities'], color='blue', label='Forecast Fatalities')
plt.title(f'Aviation Fatalities Forecast to {FORECAST_END_YEAR} (ARIMA Model)')
plt.xlabel('Year')
plt.ylabel('Predicted Number of Fatalities')
plt.legend()
plt.grid(axis='y')
plt.show()


In [None]:
def _format_forecast(years, values):
    """Render year/value pairs as 'YYYY: 1,234, YYYY: 5,678' (values truncated to int)."""
    return ', '.join(f'{year}: {int(value):,}' for year, value in zip(years, values))


top_forecast_text = _format_forecast(future_top, predictions_top)
bottom_forecast_text = _format_forecast(future_bottom, predictions_bottom)

# Plain-text recap of the tourism forecasts and the aviation extremes.
output_description = f"""
Most popular country: {top_country}
Least popular country: {bottom_country}

Key findings from tourism graphs:
1. The forecast for the most popular country ({top_country}) shows:
{top_forecast_text}.
2. For the least popular country ({bottom_country}), the forecast indicates:
{bottom_forecast_text}.

Key findings from aviation graphs:
- Best Year: {best_year}, Fatalities: {best_value}
- Worst Year: {worst_year}, Fatalities: {worst_value}
"""

print(output_description)


In [None]:
# Force the arrivals column to a numeric dtype; values that cannot be parsed
# become NaN instead of raising.
arrivals_column = 'International tourism, number of arrivals'
df_combined[arrivals_column] = pd.to_numeric(df_combined[arrivals_column], errors='coerce')


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from IPython.core.display import HTML
%matplotlib inline
sns.set_style('darkgrid')
plt.rcParams['font.size'] = 14
plt.rcParams['figure.figsize'] = (17, 5)
plt.rcParams['figure.facecolor'] = '#00000000'
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
pd.set_option('display.max_rows', None)

import pandas as pd
# Source tables, read from the working directory in this fixed order:
# df1 air passengers carried, df2 passengers per fatality, df3 fatal accidents
# and hijacking incidents, df4 fatal accidents per million flights,
# df5 global aviation fatalities, df6 arrivals by world region, df7 arrivals.
csv_names = (
    'air-passengers-carried.csv',
    'air-passengers-per-fatality.csv',
    'fatal-airliner-accidents-hijacking-incidents.csv',
    'fatal-airliner-accidents-per-million-flights.csv',
    'global-fatalities-from-aviation-accidents-and-hijackings.csv',
    'international-tourist-arrivals-by-world-region.csv',
    'international-tourist-arrivals.csv',
)
df1, df2, df3, df4, df5, df6, df7 = map(pd.read_csv, csv_names)

passengers_col = 'Air transport, passengers carried'
# Hard-coded row labels of df1. They presumably point at aggregate
# (region / income-group) rows for 2019 -- TODO confirm against the CSV; any
# change in the file's row order silently invalidates them.
passenger_rows_to_skip = [8045, 4124, 4847, 2936, 7789, 1959, 2297, 5476, 2348, 4218, 3872, 4800, 6707, 6911]
passenger_region_rows = [1959, 2297, 5476, 2348, 3872, 4800, 6707, 6911]
passenger_aggregates = [
    'East Asia and Pacific', 'Europe and Central Asia', 'European Union',
    'High income', 'Latin America and Caribbean', 'Low and middle income',
    'Low income', 'Lower middle income', 'Middle East and North Africa',
    'Middle income', 'North America', 'South Asia', 'Sub-Saharan Africa',
    'Upper middle income', 'World',
]

# 2019 rows ranked by passengers carried, with the listed rows removed.
atpc_tr = (
    df1[df1.Year == 2019]
    .sort_values(passengers_col, ascending=False)
    .drop(index=passenger_rows_to_skip)
)
atpc_ranked = atpc_tr.reset_index(drop=True).drop(columns='Code')
atpchd = atpc_ranked.head(20)
atpctl = atpc_ranked.tail(20).sort_values(passengers_col, ascending=True)

# Rows picked by label, and the 'World' series ordered by year.
atpcre = df1.loc[passenger_region_rows].reset_index(drop=True).drop(columns='Code')
atpcwd = df1[df1.Entity == 'World'].sort_values('Year').reset_index(drop=True).drop(columns='Code')

# Entity x Year matrix; aggregates removed, gaps filled with 0, rows ordered
# by entity name.
atpcdf_tr = df1.pivot_table(values=passengers_col, index=['Entity'], columns='Year')
atpcrac = atpcdf_tr.drop(index=passenger_aggregates).fillna(0)
atpcrac = atpcrac.sort_values(list(atpcrac.columns)).sort_index()

tourism_col = 'International tourism, number of arrivals'
# Hard-coded row labels of df7. They presumably point at aggregate
# (region / income-group) rows for 2019 -- TODO confirm against the CSV; any
# change in the file's row order silently invalidates them.
tourism_rows_to_skip = [1328, 1518, 1543, 1953, 2463, 2625, 2644, 2669, 2972, 2997, 3394, 4203, 4306, 4834, 4984]
tourism_region_rows = [1328, 1518, 1543, 2463, 2972, 3394, 4203, 4306]
tourism_aggregates = [
    'East Asia and Pacific', 'Europe and Central Asia', 'European Union',
    'High income', 'Latin America and Caribbean', 'Low and middle income',
    'Low income', 'Lower middle income', 'Middle East and North Africa',
    'Middle income', 'North America', 'South Asia', 'Sub-Saharan Africa',
    'Upper middle income', 'World',
]

# 2019 rows with the listed rows removed, then ranked by arrivals.
itnoa_tr = df7[df7.Year == 2019].drop(index=tourism_rows_to_skip)
itnoa_ranked = (
    itnoa_tr
    .sort_values(tourism_col, ascending=False)
    .reset_index(drop=True)
    .drop(columns='Code')
)
itnoahd = itnoa_ranked.head(20)
itnoatl = itnoa_ranked.tail(20).sort_values(tourism_col, ascending=True)

# Rows picked by label, and the 'World' series in file order.
itnoare = df7.loc[tourism_region_rows].reset_index(drop=True).drop(columns='Code')
itnoawd = df7[df7.Entity == 'World'].reset_index(drop=True).drop(columns='Code')

# Entity x Year matrix; aggregates removed, gaps filled with 0, rows ordered
# by entity name.
itnoadf_tr = df7.pivot_table(values=tourism_col, index=['Entity'], columns='Year')
itnoarac = itnoadf_tr.drop(index=tourism_aggregates).fillna(0)
itnoarac = itnoarac.sort_values(list(itnoarac.columns)).sort_index()

# Copies of df2..df6 without the 'Code' column (the originals are untouched).
mppfwd, hifacawd, fapmcfwd, hfffcawd, itare = (
    frame.drop(columns='Code') for frame in (df2, df3, df4, df5, df6)
)

# Embed the Flourish bar-chart race. display() is needed because Jupyter only
# renders a bare expression when it is the last statement of a cell, and more
# code follows this line in the same cell.
display(HTML('''<div class="flourish-embed flourish-bar-chart-race" data-src="visualisation/13199250"><script src="https://public.flourish.studio/resources/embed.js"></script></div>'''))

def _finish_figure(title, xlabel, rotate_ticks=False):
    """Label the current matplotlib axes (y axis is always passengers) and show the figure."""
    plt.xlabel(xlabel, fontsize=16)
    plt.ylabel('Passengers Carried', fontsize=16)
    if rotate_ticks:
        plt.xticks(rotation=90)
    plt.title(title, fontsize=20)
    plt.show()


# Top 20 entities as bars.
sns.barplot(x='Entity', y='Air transport, passengers carried', data=atpchd)
_finish_figure('Air Transport, Passengers Carried [TOP 20 COUNTRIES]', 'Countries', rotate_ticks=True)

# Bottom 20 entities as a marked line, smallest first.
plt.plot(atpctl['Entity'], atpctl['Air transport, passengers carried'], 'o-b', lw=3, ms=10)
_finish_figure('Air Transport, Passengers Carried [BOTTOM 20 COUNTRIES]', 'Countries', rotate_ticks=True)

# Share per entity in the region table (interactive plotly pie).
atpcrefig = px.pie(
    atpcre,
    values='Air transport, passengers carried',
    names='Entity',
    title='Air Transport, Passengers Carried [REGION]',
)
atpcrefig.show()

# World total over time.
plt.plot(atpcwd['Year'], atpcwd['Air transport, passengers carried'], '-m', lw=3, ms=10)
_finish_figure('Air Transport, Passengers Carried [WORLD]', 'Year')

# Embed the Flourish bar-chart race. display() renders it whether or not
# further statements follow in the cell; a bare expression is only shown when
# it is the cell's final statement.
display(HTML('''<div class="flourish-embed flourish-bar-chart-race" data-src="visualisation/13199323"><script src="https://public.flourish.studio/resources/embed.js"></script></div>'''))
