In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
from matplotlib import style

In [None]:
# Load the airline passenger records from the CSV next to the notebook
# and preview the first rows.
DATA_PATH = 'Air line Dataset.csv'
df = pd.read_csv(DATA_PATH)

df.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Print the frame's summary report (columns, dtypes, non-null counts).
df.info()

In [None]:
# Parse "Departure Date" into datetimes, order the rows chronologically and
# renumber the index from 0 so it follows the new order.
df = (
    df.assign(**{"Departure Date": pd.to_datetime(df["Departure Date"])})
      .sort_values("Departure Date")
      .reset_index(drop=True)
)

df

In [None]:
# df.info() writes its report straight to stdout and returns None, so it is
# called on its own: wrapping it in print() would also emit a stray "None".
df.info()
print()

# Number of rows that are exact copies of an earlier row.
print("Nº duplicated:", df.duplicated().sum())

In [None]:
#We observe that the data for this analysis contains no duplicated rows. We checked this with the "duplicated" method.

# Analysis and Visualization.

#To develop this section, we consider the following questions:

#Distribution of passengers with age.
#Distribution of Flight statuses.
#Which are the top 5 Continents and countries with the most visits?
#Top 10 nationalities that travel from flights.
#What is the flight status throughout the year based on months and quarters?
#Passengers age over time by flight status.
#Number of passengers and airports in different continents.

In [None]:
# Ages of the passengers: summary statistics of the "Age" column.
df["Age"].describe()

In [None]:
# Age brackets. pd.cut closes every interval on the right, so together with
# include_lowest=True below they are [0, 12], (12, 30], (30, 50] and (50, 100];
# ages outside 0-100 become NaN.
bins = [0, 12, 30, 50, 100]

# One category name per interval in `bins`.
labels = ['Children', 'Youth', 'Adults', 'Seniors']

# Add an "Age Group" column by assigning every value of "Age" to its bracket
# with pd.cut. include_lowest=True keeps age 0 (infants under one year) in
# 'Children'; without it the first interval is (0, 12], so age 0 silently
# becomes NaN and disappears from the later counts.
# (This explanation used to be a trailing string literal, which the notebook
# echoed back as the cell's output.)
df['Age Group'] = pd.cut(df['Age'], bins=bins, labels=labels, include_lowest=True)

In [None]:
# Number of passengers in each age bracket, drawn as a bar chart.
group_counts = df['Age Group'].value_counts()

fig, ax = plt.subplots(figsize=(8, 5))
group_counts.plot(kind='bar', rot=0, ax=ax)
ax.set_xlabel("Age Group")
ax.set_ylabel("Count")
ax.set_title("Distribution by Age Group")
plt.show()


In [None]:
#We can see that the "Seniors" group has a significant presence in this dataframe, indicating that the majority of people in this database are between 50 and 100 years old.


In [None]:
# Flight statuses: number of records for each distinct status.
df['Flight Status'].value_counts()

In [None]:
# Share of each flight status as a pie chart.
# The slice labels are read from the index of the counts so that every wedge
# is named after the status it actually measures: value_counts() orders by
# frequency while unique() orders by first appearance, so pairing the two
# could attach the wrong name to a slice.
values = df['Flight Status'].value_counts()
labels = values.index

fig, ax = plt.subplots()
ax.pie(values, labels=labels, autopct='%1.1f%%')
plt.show()

In [None]:
# From the results obtained, we can see that the flight statuses: "Cancelled, Delayed, On time," follow a similar distribution, with one flight status not significantly standing out compared to the others.

In [None]:
#Which are the top 5 countries with the most visits?
# value_counts() sorts by frequency, so head(5) keeps the five largest.
country_visits = df["Country Name"].value_counts().head(5)

fig, ax = plt.subplots(figsize=(10, 6))
country_visits.plot(kind='bar', ax=ax, rot=45)
ax.set_title('The five most visited countries')
ax.set_xlabel('Country')
ax.set_ylabel('Nº of visits')
plt.show()

In [None]:
#Which continent is the most visited?
# Record count per continent, most frequent first.
continents_visits = df["Continents"].value_counts()

fig, ax = plt.subplots(figsize=(10, 6))
continents_visits.plot(kind='bar', ax=ax, rot=45)
ax.set_title('The most visited Continents')
ax.set_xlabel('Continents')
ax.set_ylabel('Nº of visits')
plt.show()

In [None]:
#Top 10 nationalities.
# Keep the ten most frequent values of the "Nationality" column.
nationality = df["Nationality"].value_counts().head(10)

fig, ax = plt.subplots(figsize=(10, 6))
nationality.plot(kind="bar", ax=ax, rot=45)
ax.set_title("Top 10 nationalities")
ax.set_xlabel("Nationality")
ax.set_ylabel("Count")
plt.show()


In [None]:
# What is the flight status throughout the year?
# Store the calendar month number of every departure in a "month" column;
# the per-month charts group on it.
df["month"] = df["Departure Date"].dt.month

In [None]:
# Preview the first rows to check the derived "month" column.
df.head()

In [None]:
# One bar chart per flight status showing the number of flights per departure month.
status_unique = df['Flight Status'].unique()

# Subplots: one row per status. squeeze=False makes plt.subplots return an
# array of axes even for a single row; by default a 1x1 grid comes back as a
# bare Axes object, on which axs[i] would fail.
num_rows = len(status_unique)
fig, axs = plt.subplots(num_rows, 1, figsize=(9, 5 * num_rows), sharex=False,
                        squeeze=False, gridspec_kw={'hspace': 0.5})
axs = axs.ravel()  # (num_rows, 1) grid -> flat sequence indexed by row

# Labels
labels = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']

# To iterate through the status and create separate graphs
for i, status in enumerate(status_unique):
    ax = axs[i]
    f_status = df[df['Flight Status'] == status]
    # Rows of this status counted per month number.
    f_month = f_status.groupby(f_status['month'])['Flight Status'].count()
    bars = ax.bar(f_month.index, f_month.values)
    ax.set_title(f'Flight {status} per Month')
    ax.set_xlabel('Month')
    ax.set_ylabel('Nº of Flight')

    # To set up X-axis labels to display the months.
    ax.set_xticks(range(1, 13))  # Set label positions
    ax.set_xticklabels(labels, rotation=45, ha='right')  # Set labels and rotation

    # Add value labels to the bars: centred horizontally, 3 points above the top.
    for bar in bars:
        height = bar.get_height()
        ax.annotate(f'{height}', xy=(bar.get_x() + bar.get_width() / 2, height),
                    xytext=(0, 3), textcoords='offset points', ha='center', va='bottom')

plt.show()

In [None]:
# Store the calendar quarter number of every departure in a "quarter" column;
# the per-quarter charts group on it.
df["quarter"] = df["Departure Date"].dt.quarter

In [None]:
# Preview the first rows to check the derived "quarter" column.
df.head()

In [None]:
# One bar chart per flight status showing the number of flights per departure quarter.
status_unique = df['Flight Status'].unique()

# Subplots: one row per status. squeeze=False makes plt.subplots return an
# array of axes even for a single row; by default a 1x1 grid comes back as a
# bare Axes object, on which axs[i] would fail.
num_rows = len(status_unique)
fig, axs = plt.subplots(num_rows, 1, figsize=(9, 5 * num_rows), sharex=False,
                        squeeze=False, gridspec_kw={'hspace': 0.5})
axs = axs.ravel()  # (num_rows, 1) grid -> flat sequence indexed by row

# Labels
labels = ['Q1', 'Q2', 'Q3', 'Q4']

# To iterate through the status and create separate graphs
for i, status in enumerate(status_unique):
    ax = axs[i]
    f_status = df[df['Flight Status'] == status]
    # Rows of this status counted per quarter number.
    f_quarter = f_status.groupby(f_status['quarter'])['Flight Status'].count()

    bars = ax.bar(f_quarter.index, f_quarter.values)
    ax.set_title(f'Flight {status} per Quarter')
    ax.set_xlabel('Quarter')
    ax.set_ylabel('Nº of Flight')

    # To set up X-axis labels to display the quarters.
    ax.set_xticks(range(1, 5))  # Set label positions
    ax.set_xticklabels(labels, rotation=45, ha='right')  # Set labels and rotation

    # Add value labels to the bars: centred horizontally, 3 points above the top.
    for bar in bars:
        height = bar.get_height()
        ax.annotate(f'{height}', xy=(bar.get_x() + bar.get_width() / 2, height),
                    xytext=(0, 3), textcoords='offset points', ha='center', va='bottom')

plt.show()

In [None]:
# Passenger count per value of the "Gender" column, drawn as a bar chart.
gender = df['Gender'].value_counts()

fig, ax = plt.subplots(figsize=(8, 6))
gender.plot(kind='bar', color=['turquoise', 'violet'], ax=ax)

ax.set_title('Travelers by Genders')
ax.set_xlabel('Gender')
ax.set_ylabel('Passenger Count')

plt.show()

In [None]:
#observations
#The frequencies of the flight status categories are almost equal. In addition, the numbers of men and women within each flight status are almost the same, so gender does not appear to affect flight status.
#By country, the US had the highest number of on-time, delayed and canceled flights. By arrival airport, the value 0 had the highest number of on-time, delayed and canceled flights, but 0 here means the airport is unknown. By continent, North America had the highest number of on-time, delayed and canceled flights; this is because the US is the country with the highest number of on-time, delayed and canceled flights.
#In terms of nationality, Chinese passengers are the most numerous and even account for almost 20 percent of all observations.
#In terms of departure month, on-time, delayed and canceled flights were most common in months 10, 11 and 12. This shows that most flights took place in these months and that the month has no effect on the flight status. However, it is observed that the departure days are different in three flight cases and it is determined that this situation, that is, the day of departure, has an effect on the flight status.

In [None]:
import seaborn as sns

In [None]:
# Line plot of passenger age against departure date, one line per flight status.
# The date column is (re)parsed first so the x-axis is a true time axis.
df['Departure Date'] = pd.to_datetime(df['Departure Date'])

fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(data=df, x='Departure Date', y='Age', hue='Flight Status', ax=ax)
ax.set_title('Passenger Age over Time by Flight Status')
ax.set_xlabel('Departure Date')
ax.set_ylabel('Age')
plt.show()

In [None]:
'''Interpretation:

The plot visualizes how the age of passengers varies over time based on their flight status.
Each line on the plot represents a different flight status (e.g., 'On Time' or 'Delayed').
The x-axis shows the departure dates, allowing you to observe any patterns or trends over time.
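The y-axis represents the age of passengers; because many passengers share a departure date, each line shows the average age for that date (with a shaded band for its uncertainty), helping you compare ages across flight statuses.'''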

In [None]:
import plotly.express as px

In [None]:
# Make sure 'Departure Date' holds datetimes before plotting.
df['Departure Date'] = pd.to_datetime(df['Departure Date'])

# Interactive line plot of age against departure date, one colour per flight status.
axis_titles = {'Age': 'Passenger Age', 'Departure Date': 'Departure Date'}
fig = px.line(
    df,
    x='Departure Date',
    y='Age',
    color='Flight Status',
    labels=axis_titles,
    title='Passenger Age over Time by Flight Status',
)

# Render the figure.
fig.show()

In [None]:

# Interactive box plot: distribution of passenger age within each flight status.
axis_titles = {'Age': 'Passenger Age', 'Flight Status': 'Flight Status'}
fig = px.box(
    df,
    x='Flight Status',
    y='Age',
    color='Flight Status',
    labels=axis_titles,
    title='Box Plot of Passenger Age by Flight Status',
)

# Render the figure.
fig.show()

In [None]:
# Both aggregates are computed per continent, so group once and reuse it.
by_continent = df.groupby('Continents')

# Distinct airport names seen in each continent.
airports_count = (
    by_continent['Airport Name']
    .nunique()
    .rename('Number of Airports')
    .reset_index()
)

# Number of records (passengers) in each continent.
passengers_count = by_continent.size().rename('Number of Passengers').reset_index()

# Combine both aggregates into one row per continent.
merged_df = airports_count.merge(passengers_count, on='Continents')

# Scatter of airports against passengers; marker size follows the passenger count.
fig = px.scatter(
    merged_df,
    x='Number of Airports',
    y='Number of Passengers',
    size='Number of Passengers',
    color='Continents',
    hover_name='Continents',
    title='Bubble Map for Number of Airports and Passengers in Each Continent',
    size_max=50,
)

# Render the figure.
fig.show()

In [None]:
import plotly.express as px

# Animated scatter of passenger age against gender, with one animation frame
# per distinct 'Departure Date' value.
# The marker size comes from a new column holding, for every row, the number of
# non-null 'Passenger ID' values that share that row's 'Flight Status'.
df['Flight_Status_Count'] = df.groupby('Flight Status')['Passenger ID'].transform('count')

# Create the bubble chart
# NOTE(review): range_y is built from the min/max of the 'Gender' column;
# presumably these are strings, so this is an alphabetical min/max - confirm
# that Plotly accepts category names as the range of a categorical axis.
# NOTE(review): one frame is created per distinct departure date, which may
# render slowly on a large frame - confirm this is acceptable.
bubble_chart = px.scatter(df, x='Age', y='Gender', size='Flight_Status_Count',
                          animation_frame='Departure Date', animation_group='Passenger ID',
                          title='Age vs Gender with Flight Status Bubble Size',
                          labels={'Age': 'Age', 'Gender': 'Gender'},
                          size_max=30, range_x=[df['Age'].min(), df['Age'].max()],
                          range_y=[df['Gender'].min(), df['Gender'].max()])

# Set the figure-level transition duration to 500.
bubble_chart.update_layout(transition_duration=500)

# Show the plot
bubble_chart.show()

In [None]:
# Report: Executive Summary
'''This comprehensive analysis of the airline passengers and flights dataset has provided valuable insights into various aspects of passenger demographics, flight statuses, top destinations, and temporal trends. The findings presented in this report aim to enhance the understanding of the dataset and contribute to strategic decision-making for improving passenger experience and optimizing flight operations.
1. Distribution of Passengers with Age:
We categorized passengers into four groups: seniors, adults, youth, and children. The analysis revealed that seniors constitute the largest passenger group. This insight can guide efforts to tailor services and amenities to the needs of different age segments.
2. Distribution of Flight Statuses:
A pie chart illustrated the distribution of flight statuses, showing that the statuses "Cancelled," "Delayed," and "On Time" exhibit similar proportions. No significant outliers were observed, emphasizing the importance of consistently managing all flight statuses.
3. Top 5 Continents and Countries with the Most Visits:
The analysis identified North America as the most visited continent, with the USA being the top country. This information is crucial for resource allocation and marketing strategies, focusing on regions with high passenger traffic.
4. Top 10 Nationalities Traveling from Flights:
The dataset highlighted China as the top nationality among passengers traveling with the airline. This insight informs targeted marketing efforts and facilitates a better understanding of customer demographics.
5. Flight Status Throughout the Year (Months and Quarters):
Monthly and quarterly analyses of flight statuses revealed patterns in cancellations, delays, and on-time performance. This information aids in proactive planning and resource allocation during peak seasons or specific quarters with higher disruptions.
6. Passengers' Age Over Time by Flight Status:
The plot depicting passengers' age over time for different flight statuses provides a nuanced view of how age groups are affected by flight disruptions. This can guide personalized services or communication strategies.
7. Number of Passengers and Airports in Different Continents:
The examination of the number of passengers and airports in each continent informs infrastructure development decisions. Identifying continents with high passenger density can guide the establishment of new airports.
8. Dynamic Bubble Chart for Ages and Genders by Flight Status:
The dynamic bubble chart offers a visually engaging representation of the ages and genders of passengers based on flight status. This interactive visualization facilitates a deeper exploration of age and gender dynamics across different flight statuses.
Conclusion:
The insights gained from this analysis empower the airline to make informed decisions regarding customer experience, operational efficiency, and strategic planning. By understanding passenger demographics, destinations, and temporal patterns, the airline can tailor its services to enhance customer satisfaction, optimize resources, and improve overall operational performance.
This report serves as a foundation for ongoing analysis and strategic initiatives, providing a holistic view of the airline's performance and customer demographics.
The y-axis represents the age of passengers, helping you understand the distribution of ages for each flight status.
Next, we computed the number of passengers and airports in each continent. This shows which continents may need more airports relative to their passenger numbers, and gives a clearer picture for future reference.
Finally, we built a dynamic bubble chart depicting the ages and genders of passengers by flight status. It provides age-wise passenger data, together with gender, for the flight statuses passengers faced each day.

Managerial Implications and Recommendations:
Age-Based Service Customization:
Implication: Given that seniors constitute the largest passenger group, there is an opportunity to enhance services catering specifically to their needs.
Recommendation: Implement age-friendly services such as priority boarding, assistance during transit, and amenities suitable for senior passengers.
Flight Status Management:
Implication: The balanced distribution of flight statuses suggests a consistent operational performance. However, understanding the reasons behind cancellations or delays could lead to proactive measures.
Recommendation: Conduct a detailed analysis of factors contributing to flight disruptions, focusing on mitigating common causes and improving communication with passengers during delays or cancellations.

Strategic Marketing Focus:
Implication: The USA and China emerge as top destinations and nationalities, respectively. Strategic marketing efforts can leverage this information for customer acquisition and retention.
Recommendation: Develop targeted marketing campaigns for the identified key markets, emphasizing personalized promotions, loyalty programs, and route optimizations.
Seasonal and Quarterly Planning:
Implication: Identifying patterns in flight statuses across months and quarters provides insights for proactive planning.
Recommendation: Allocate resources strategically during peak seasons, enhance staff training for efficient handling of disruptions, and implement contingency plans for high-impact quarters.
Age Group-Specific Strategies:
Implication: The age-based analysis over time indicates varying impacts on different age groups during flight disruptions.
Recommendation: Tailor communication strategies and amenities based on age groups to enhance the overall passenger experience and address specific needs during disruptions.
Continental Expansion:
Implication: North America emerges as the most visited continent. This information is valuable for future expansion plans.
Recommendation: Explore opportunities for new routes, partnerships, or increased frequency of flights to and from North America. Consider expanding operations to other continents with high passenger density.
Infrastructure Development:
Implication: Understanding the number of passengers and airports in each continent is crucial for infrastructure planning.
Recommendation: Invest in new airport facilities in continents with high passenger density. Collaborate with aviation authorities to ensure seamless passenger experiences at existing and new airports.
Data-Driven Decision-Making:
Implication: The dynamic bubble chart provides a real-time view of age and gender dynamics by flight status.
Recommendation: Continuously monitor the dynamic bubble chart to identify emerging trends, enabling swift decision-making based on evolving passenger demographics and preferences.

Customer Engagement Strategies:
Implication: The comprehensive analysis offers insights into customer behavior and preferences.
Recommendation: Implement targeted customer engagement strategies, including surveys, feedback mechanisms, and loyalty programs, to foster a customer-centric approach and enhance brand loyalty.
Operational Efficiency:
Implication: Operational efficiency is crucial for delivering a positive passenger experience.
Recommendation: Invest in technology and training programs to streamline operations, minimize delays, and improve overall efficiency. Regularly review and update operational protocols based on industry best practices.
'''