In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [None]:
# df1=pd.read_csv('/train.csv')
# df2=pd.read_csv('/test.csv')
# df = pd.concat([df1, df2], axis=0)

In [None]:
# Read the training split into the notebook's working DataFrame.
train_csv_path = '/content/train.csv'
df = pd.read_csv(train_csv_path)

In [None]:
# Preview the first five rows to confirm the file parsed as expected.
df.head(n=5)

In [None]:
# Remove the 'Unnamed: 0' and 'id' columns (presumably a saved row index and
# a record identifier -- confirm against the CSV). Reassigning instead of
# dropping in place, together with errors='ignore', keeps this cell safe to
# re-run once the columns are already gone.
df = df.drop(columns=['Unnamed: 0', 'id'], errors='ignore')

# 1- General Analysis

In [None]:
# Dimensions of the frame as (rows, columns).
df.shape

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Alternative that was considered and left disabled: mean-impute the arrival delay.
#df['Arrival Delay in Minutes'].fillna(df['Arrival Delay in Minutes'].mean(), inplace=True)
# Instead, every row containing at least one missing value is removed in place.
df.dropna(inplace=True)

In [None]:
# Percentage of rows that exactly repeat an earlier row.
duplicate_rows = df.duplicated().sum()
duplicate_rows / len(df) * 100

In [None]:
# Summary statistics for the numeric columns.
df.describe()

# 2- Gender

In [None]:
# Passenger count per gender, most frequent first.
gender_column = df['Gender']
gender = gender_column.value_counts()

In [None]:
# Pie chart of the gender split, with a fixed colour per category.
gender_colors = {'Female': 'cyan', 'Male': 'darkblue'}
fig = px.pie(
    df,
    names=gender.index,
    values=gender.values,
    color=gender.index,
    color_discrete_map=gender_colors,
)
fig.show()

# 3- Customer Type

In [None]:
# Distinct customer-type labels, in order of first appearance.
customer_types = df['Customer Type']
customer_types.unique()

In [None]:
# Count each customer type once and reuse it for both charts, so the bar
# heights, the pie wedges and their labels all come from the same ordering.
# (Taking labels from .unique() follows order of first appearance, which need
# not match the frequency ordering of value_counts() and can mislabel wedges.)
customer_type_counts = df['Customer Type'].value_counts()

# Two side-by-side panels: 'A' (bar chart) on the left, 'B' (pie chart) on the right.
fig, axs = plt.subplot_mosaic('AABB\nAABB')

axs['A'].bar(customer_type_counts.index, customer_type_counts.values, color=sns.color_palette('Set1'))
axs['A'].set_title('Customer Type')
axs['B'].pie(customer_type_counts.values, labels=customer_type_counts.index, startangle=110, autopct='%1.1f%%', colors=sns.color_palette('pastel'))
axs['B'].set_title('Customer Type')

plt.show()

# 4- Age

In [None]:
# Spread between the oldest and the youngest passenger.
age = df['Age']
age.max() - age.min()

In [None]:
# Age distribution as a 30-bin histogram with a KDE overlay.
# (A plain KDE, an 8-bin histogram and a value-count scatter were tried
# earlier and dropped in favour of this view.)
ax = sns.histplot(df['Age'], bins=30, kde=True, color='green')
ax.set_title("Age Distribution")
ax.set_xlabel("Age")
ax.set_ylabel("Count")
plt.show()

# 5- Relationship between Age and Customer Type

In [None]:
# Age density per customer type, drawn as filled curves on shared axes.
sns.kdeplot(
    data=df,
    x='Age',
    hue='Customer Type',
    fill=True,
)

# 6- Splitting Age into Groups

In [None]:
# Age brackets. With right=True every bin is closed on its right edge, and
# include_lowest=True additionally closes the first bin on the left, so the
# intervals are [0, 18], (18, 40], (40, 60], (60, 100] -- matching the
# "0-18" label, which would otherwise leave an age of exactly 0 unassigned.
bins = [0, 18, 40, 60, 100]
labels = ['Children 0-18', 'Youth 19-40', 'Adults 41-60', 'Elders 61-100']

# Create the age group column (ages outside 0-100 become NaN).
df['Age_Group'] = pd.cut(df['Age'], bins=bins, labels=labels, right=True, include_lowest=True)

# Calculate the counts of each age group
age_groups = df['Age_Group'].value_counts()


In [None]:
from wordcloud import WordCloud

# 7- Age Group Distribution

In [None]:
# Word cloud in which each age-group label is sized by its passenger count.
cloud_builder = WordCloud(width=800, height=400, background_color='white')
wordcloud = cloud_builder.generate_from_frequencies(age_groups)

# Render the cloud as an image without axis ticks.
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
# Pie chart of how passengers split across the age groups.
age_group_pie_args = dict(
    names=age_groups.index,
    values=age_groups.values,
    title='Age Group Distribution',
)
fig = px.pie(**age_group_pie_args)
fig.show()

# 8- Type of Travel

In [None]:
# Distinct travel-type labels, in order of first appearance.
travel_types = df['Type of Travel']
travel_types.unique()

In [None]:
# Number of passengers per travel type.
sns.countplot(
    data=df,
    x='Type of Travel',
    color='darkblue',
)

In [None]:
# Long-format table with one row per (age group, travel type) and its count.
# Age_Group is categorical, so `observed` is passed explicitly: this keeps the
# current behaviour (all categories retained) and avoids depending on a
# default that pandas has deprecated and plans to change.
travel_type_counts = (
    df.groupby('Age_Group', observed=False)['Type of Travel']
    .value_counts()
    .reset_index(name='Count')
)
travel_type_counts

# 9- Relationship between Age Groups and Type of Travel

In [None]:
# Grouped bars: one cluster per age group, one bar per travel type.
plt.figure(figsize=(8, 6))
ax = sns.barplot(data=travel_type_counts, x='Age_Group', y='Count', hue='Type of Travel')

# Axis labels and title
ax.set_xlabel('Age Groups')
ax.set_ylabel('Frequency')
ax.set_title('Travel Type Frequency by Age Group')

# Titled legend, then render
ax.legend(title="Type of Travel")
plt.show()

In [None]:
# Passenger count per cabin class, most frequent first.
cabin_class = df['Class']
cabin_class.value_counts()

In [None]:
# Long-format table with one row per (age group, class) and its count.
# Age_Group is categorical, so `observed` is passed explicitly: this keeps the
# current behaviour (all categories retained) and avoids depending on a
# default that pandas has deprecated and plans to change.
class_counts = (
    df.groupby('Age_Group', observed=False)['Class']
    .value_counts()
    .reset_index(name='Count')
)
class_counts

# 10- Relationship between Age Groups and Class

In [None]:
# Grouped bars: one cluster per age group, one bar per cabin class.
plt.figure(figsize=(8, 6))
ax = sns.barplot(data=class_counts, x='Age_Group', y='Count', hue='Class')

# Axis labels and title
ax.set_xlabel('Age Groups')
ax.set_ylabel('Frequency')
ax.set_title('Class Frequency by Age Group')

# Titled legend, then render
ax.legend(title="Class")
plt.show()

In [None]:
# Reshape the long (Age_Group, Class, Count) table into an Age_Group x Class grid.
df_pivot = class_counts.pivot(index="Age_Group", columns="Class", values="Count")

# Creating the heatmap
fig = px.imshow(
    df_pivot,
    labels=dict(x="Class", y="Age Group", color="Count"),
    x=df_pivot.columns,
    y=df_pivot.index,
    color_continuous_scale="Blues",
)

# The cells hold passenger counts per class; satisfaction is not part of this
# table, so the title describes counts rather than satisfaction.
fig.update_layout(title="Passenger Count per Age Group & Class")
fig.show()

# 11- Flight Distance

In [None]:
# Share of ROWS whose flight distance is below 100. The mean of the boolean
# mask is the fraction of matching rows; DataFrame.size would count cells
# (rows x columns) and inflate the figure by the number of columns.
short_flight_share = (df['Flight Distance'] < 100).mean() * 100
print('Percentage of Flight Distance lower than 100 km: ' + str(short_flight_share))
print("In reality, there are no flight distances lower than 100.\nThese rows are kept for now; whether to remove them should be decided from the percentage above.")

In [None]:
# Filled density curve of the flight distances.
sns.kdeplot(
    data=df,
    x='Flight Distance',
    fill=True,
)

# 12- Satisfaction

In [None]:
# Passenger count per satisfaction label, most frequent first.
satisfaction_labels = df['satisfaction']
satisfaction_labels.value_counts()

In [None]:
# Number of passengers per satisfaction label.
sns.countplot(
    data=df,
    x='satisfaction',
    color='darkblue',
)

In [None]:
# Long-format table with one row per (class, satisfaction label) and its count.
satisfaction_by_class = df.groupby('Class')['satisfaction']
class_satisfaction = (
    satisfaction_by_class
    .value_counts()
    .reset_index(name='Count')
)
class_satisfaction

# 13- Relationship between satisfaction and Class

In [None]:
# Grouped bars: one cluster per cabin class, one bar per satisfaction label.
plt.figure(figsize=(8, 6))
sns.barplot(data=class_satisfaction , x='Class', y='Count', hue='satisfaction')

# Labels and Title
plt.xlabel('Class')
# The bar height is the 'Count' column, so the y axis is labelled as a count;
# the satisfaction label is encoded by colour and named in the legend.
plt.ylabel('Count')
plt.title('Class satisfaction')

# Show Plot
plt.legend(title="satisfaction")
plt.show()

In [None]:
# Peek at the per-class satisfaction counts before re-encoding them.
class_satisfaction.head(5)

In [None]:
# Encode the satisfaction label as 0/1, then keep only the "satisfied" rows.
satisfaction_codes = {'neutral or dissatisfied': 0, 'satisfied': 1}
class_satisfaction['satisfaction'] = class_satisfaction['satisfaction'].map(satisfaction_codes)
is_satisfied = class_satisfaction['satisfaction'] == 1
class_satisfaction = class_satisfaction[is_satisfied]
class_satisfaction.head()

In [None]:
# Drop the satisfaction flag, which is constant after the filter on == 1.
# Reassigning (rather than dropping in place on a boolean-filtered frame)
# avoids pandas' SettingWithCopyWarning, and errors='ignore' keeps the cell
# safe to re-run once the column is already gone.
class_satisfaction = class_satisfaction.drop(columns='satisfaction', errors='ignore')
class_satisfaction.head()

In [None]:
# Single-row heatmap: one cell per class, coloured by its satisfied count.
satisfied_counts = [class_satisfaction["Count"].values]
axis_labels = {"x": "Class", "y": "Satisfied"}
fig = px.imshow(
    satisfied_counts,
    labels=axis_labels,
    x=class_satisfaction["Class"],
    color_continuous_scale="Blues",
)

fig.update_layout(title="Satisfaction Count per Class")
fig.show()

# 14- Relationship between satisfaction and Flight Distance

In [None]:
# Flight-distance distribution per satisfaction label
# (seaborn is used here; a plotly px.box call would be the interactive equivalent).
sns.boxplot(
    data=df,
    x="satisfaction",
    y="Flight Distance",
)

# 15- Services Evaluation

In [None]:
# An earlier seaborn version of this chart was replaced by the plotly one below.

# Columns at positions 6..19 -- presumably the service-rating columns; the
# slice is positional, so verify it if the column order ever changes.
selected_columns = df.iloc[:, 6:20]

# Wide -> long: one row per (feature name, rating value).
df_melted = selected_columns.melt(var_name="Feature", value_name="Value")

# A single figure with every feature on the y axis.
box_kwargs = dict(
    x="Value",
    y="Feature",
    color="Feature",
    title="Box Plot of Features (Stars 0-5)",
    width=1000,
    height=800,
)
fig = px.box(df_melted, **box_kwargs)

fig.show()

# 16- Relationship between satisfaction and Services Evaluation

In [None]:
import plotly.graph_objects as go
import plotly.subplots as sp

In [None]:
# One box-plot panel per column at positions 6..19, each split by satisfaction.
columns_to_plot = df.columns[6:20]
panel_count = len(columns_to_plot)

fig = sp.make_subplots(rows=1, cols=panel_count, subplot_titles=columns_to_plot)

# Subplot columns are 1-based, hence start=1.
for position, col in enumerate(columns_to_plot, start=1):
    box_trace = go.Box(x=df["satisfaction"], y=df[col], name=col)
    fig.add_trace(box_trace, row=1, col=position)

# Size the figure so each panel gets 250 px of width.
fig.update_layout(
    title="Multiple Box Plots of Satisfaction vs Features",
    showlegend=False,
    height=500,
    width=250 * panel_count,
)

fig.show()

# 17- Departure Delay in Minutes

In [None]:
# Filled density curve of the departure delays.
sns.kdeplot(
    data=df,
    x='Departure Delay in Minutes',
    fill=True,
)

# 18- Relationship between Departure Delay and Arrival Delay

In [None]:
# Departure delay against arrival delay, one point per flight record.
px.scatter(
    data_frame=df,
    x='Departure Delay in Minutes',
    y='Arrival Delay in Minutes',
)

In [None]:
# Combined delay per row: departure delay plus arrival delay.
departure_delay = df['Departure Delay in Minutes']
arrival_delay = df['Arrival Delay in Minutes']
df['Delay'] = departure_delay.add(arrival_delay)

# 19- Relationship between satisfaction and Delay

In [None]:
# Combined delay against flight distance, coloured by satisfaction label.
ax = sns.scatterplot(data=df, x="Flight Distance", y="Delay", hue="satisfaction", alpha=0.65)
ax.set_title("Flight Distance & Delay vs Satisfaction")
ax.set_xlabel("Flight Distance")
ax.set_ylabel("Delay")
ax.legend(title="Satisfaction")
plt.show()

# 20- Correlations

In [None]:
# Pairwise correlations between the numeric columns, shown as an annotated heatmap.
numerical_cols = df.select_dtypes(include=['number'])
corr_matrix = numerical_cols.corr()

plt.figure(figsize=(13, 7))
ax = sns.heatmap(
    corr_matrix,
    annot=True,
    cmap="coolwarm",
    linewidths=0.5,
)
ax.set_title("Correlation Matrix Heatmap")
plt.show()

In [None]:
# We can drop these two columns because they are highly correlated with the
# Delay column, which is their sum. Reassigning with errors='ignore' keeps
# this cell safe to re-run once the columns are already gone.
df = df.drop(columns=['Departure Delay in Minutes', 'Arrival Delay in Minutes'], errors='ignore')

In [None]:
# Categorical with Numerical
# ( Bar Plot or Box Plot)

# Numerical with a Group of Categorical and Numerical
# ( Scatter Plot, Violin Plot or line plot with hue)

# Categorical with a Group of Categorical and Numerical
# (Heatmap, Clustered Bar Plot or bar chart )