In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the booking dataset from a CSV file located in the current working directory.
data = pd.read_csv("Booking_data.csv")
# Preview the first rows as a quick sanity check of the load.
data.head()

In [None]:
# List the distinct values present in the trip_type column.
data['trip_type'].unique()


<font size="6">Data Preprocessing </font><br>
Data Preprocessing consists of: <br>
 - Checking the shape of the dataset to understand its dimensions,
 - Checking for null values and removing them if any are found,
 - Checking for duplicate values and removing them to ensure data integrity,
 - Gaining insight into the relationships between data features by visualizing their correlations as a heatmap.

In [None]:
# Show the column labels of the dataset.
data.columns

In [None]:
# Report the dataset dimensions (rows x columns)
n_rows, n_cols = data.shape
print (f" The dataset contains {n_rows} rows and {n_cols} columns")

In [None]:
# Print a concise summary of the frame: column dtypes and non-null counts.
data.info()

In [None]:
# Get some descriptive statistics about the dataset (describe() with its default settings).
data.describe()

In [None]:
# Percentage of missing values in each column
data.isna().sum().div(len(data)).mul(100)

In [None]:
# Count the rows that are exact duplicates of an earlier row
is_duplicate_row = data.duplicated()
duplicated_values = is_duplicate_row.sum()
print(f'The dataset contains {duplicated_values} duplicate values')

In [None]:
# Remove duplicated rows (drop_duplicates() with its defaults keeps the first occurrence),
# then display the resulting (rows, columns) to confirm the reduction.
data = data.drop_duplicates()
data.shape

In [None]:
# Insights about the relationship between different features.
# Correlation is only defined for numeric columns. At this point the frame still
# contains non-numeric columns (they are label-encoded only later in the notebook),
# and DataFrame.corr() raises on those in pandas >= 2.0, so select numeric columns first.
numeric_features = data.select_dtypes(include='number')
plt.figure(figsize=(10, 8))
sns.heatmap(numeric_features.corr(), annot=True, cmap="RdYlBu", fmt='.2f', linewidths=1)
plt.title("The correlation between data features")
plt.show()

<font size="6">Data Analysis Exploration  </font><br>

In [None]:
# The distribution of purchase lead in the data.
# sns.distplot is deprecated; histplot with stat='density' and kde=True draws the
# equivalent normalised histogram with a density curve on top.
plt.figure(figsize=(10, 8))
sns.histplot(data['purchase_lead'], bins=50, stat='density', kde=True)
plt.title("The distribution of purchase lead")
plt.show()

In [None]:
# Bar plot of the 10 most frequent values of booking_origin.
# value_counts() already returns counts in descending order, so taking the first
# 10 rows yields the top 10 without any further sorting.
top_origins = data['booking_origin'].value_counts().head(10)
top_origins.plot(kind='bar',figsize=(12,6),color=['#FE2E9A','#F6CEF5','#F79F81','#B4045F','#BEF781'])
plt.title("Top 10 Departure Contries")
plt.xlabel("Countries")
plt.xticks(rotation=90)
plt.ylabel("Count of values")
# Render the figure explicitly so the cell does not echo a Text(...) repr.
plt.show()

In [None]:
# Pie chart of the ratio of completed vs pending bookings.
# value_counts() orders slices by frequency, so labels are looked up from the
# actual values instead of being assigned by position.
# assumes booking_complete is coded 0 = pending, 1 = completed — TODO confirm
status_names = {0: 'Booking Pending', 1: "Booking Completed"}
status_counts = data['booking_complete'].value_counts()
status_counts.plot(kind='pie',
    explode=[0,0.2],
    # fall back to the raw value if an unexpected code appears
    labels=[status_names.get(value, str(value)) for value in status_counts.index],
    colors=['#01DFD7','#81F7BE'],
    autopct='%1.2f%%',
    shadow=True)
plt.title("Booking Ratio")
plt.show()

In [None]:
# Which trip type is booked most often: share of each trip_type value.
trip_counts = data['trip_type'].value_counts()
plt.figure(figsize=(10,6))
trip_counts.plot(kind='pie',explode=[0,0.5,0.3],
    # take the labels from the counted values so each slice is always named
    # after the category it actually represents
    labels=list(trip_counts.index),
    colors=['#FE2E9A','#F79F81','#BEF781'],
    autopct='%1.2f%%',
    shadow=True)
plt.title("Trip Type Ratio")
plt.show()

In [None]:
# Average flight duration for the 20 booking origins with the highest averages.
# nlargest() already returns the values in descending order, so no prior sort is needed.
avg_duration = data.groupby('booking_origin')['flight_duration'].mean().nlargest(20)
avg_duration.plot(kind='barh',figsize=(6,6))
plt.title("Average flight duration of the top 20 countries")
# A horizontal bar chart puts the measured value on the x-axis and the
# categories on the y-axis, so the axis labels follow that orientation.
plt.xlabel("Average filght duration")
plt.ylabel("Country")
plt.show()

In [None]:
# Count of rows per flight day, split by the booking_complete flag
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(
    x='flight_day',
    hue='booking_complete',
    data=data,
    palette=['grey', 'crimson'],
    ax=ax,
)
ax.set_title("How many tickets booked during the each day")
plt.show()

In [None]:
# Share of rows that requested each optional extra (baggage, preferred seat, in-flight meals).
extra_service_cols = ['wants_extra_baggage', 'wants_preferred_seat',
       'wants_in_flight_meals']
# assumes each wants_* flag is coded 0 = not requested, 1 = requested — TODO confirm
choice_names = {0: 'Not requested', 1: 'Requested'}
plt.figure(figsize=(12,5))
for i,col in enumerate(extra_service_cols):
    plt.subplot(1,3,i+1)
    # sort_index() fixes the slice order by value, so a given colour means the
    # same choice in every panel regardless of which choice is more frequent.
    choice_counts = data[col].value_counts().sort_index()
    choice_counts.plot(kind='pie',explode=[0,0.2],
    labels=[choice_names.get(value, str(value)) for value in choice_counts.index],
    colors=['#FE2E9A','#BEF781'],
    autopct='%1.2f%%',
    shadow=True)
    # name the panel after the column it summarises
    plt.title(col)
plt.show()

In [None]:
# Share of completed bookings per sales channel (sum of booking_complete within each channel)
completed_by_channel = data.groupby('sales_channel')['booking_complete'].sum()
completed_by_channel.plot.pie(
    explode=[0, 0.2],
    labels=['Booking completed through the internet', "Booking completed through the Mobile"],
    colors=['#8258FA', '#FE2E9A'],
    autopct='%1.2f%%',
    shadow=True,
)
plt.title("Find the how much percentage of booking completed through the channel")

In [None]:
# Share of completed bookings per trip type (sum of booking_complete within each trip_type).
completed_by_trip = data.groupby('trip_type')['booking_complete'].sum()
completed_by_trip.plot(kind='pie',
    explode=[0,0.2, 0.4],
    # groupby returns its keys in sorted order, so labels are read from the
    # result's index to guarantee each slice carries its own category name
    labels=list(completed_by_trip.index),
    colors=['#8258FA','#2EFE64','#FE2E9A'],
    autopct='%1.2f%%',
    shadow=True)
plt.title("Find the most type of trip that was booked")
# Render the figure explicitly so the cell does not echo a Text(...) repr.
plt.show()

In [None]:
# Draw one count plot per integer-typed column
for column in data.select_dtypes(include='int').columns:
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.countplot(data=data, x=column, ax=ax)
    ax.set_xlabel(f'{column}')
    ax.set_ylabel('Count of values')
    plt.xticks(rotation=90)
    plt.show()

In [None]:
# Answering some questions
# Most and least frequent flight_day values
flight_day_counts = data['flight_day'].value_counts()
print('The most busy day for booking is:', flight_day_counts.idxmax())
print('\nThe least busy day for booking is:', flight_day_counts.idxmin())
# Same question for booking_origin
origin_counts = data['booking_origin'].value_counts()
print('\nThe most booking are coming from: ', origin_counts.idxmax())
print('\nThe least booking are coming from:', origin_counts.idxmin())

<font size="6">Machine learning Modeling  </font><br>

In [None]:
# Import the all required libraries for machine learning modeling
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [None]:
# Convert the categorical (object-dtype) columns into integer codes using LabelEncoder.
# Each fitted encoder is kept, keyed by column name, so the integer codes can be
# mapped back to the original category names later with
# label_encoders[col].inverse_transform(...).
label_encoders = {}
for col in data.select_dtypes(include='object').columns:
    label_encoder = LabelEncoder()
    # fit_transform learns the classes and encodes the column in one step
    data[col] = label_encoder.fit_transform(data[col])
    label_encoders[col] = label_encoder

In [None]:
# Preview the first rows of the frame at this point in the notebook (after the label-encoding cell).
data.head()

In [None]:
# Distinct values of trip_type at this point in the notebook (after the label-encoding cell).
data['trip_type'].unique()

In [None]:
# Divide the data into independent and dependent variables
X=data.drop(['booking_complete','purchase_lead','route'],axis=1)  # independent variables
y=data['booking_complete']  # dependent variable
# Split BEFORE scaling so the test rows never influence any fitted preprocessing step.
# stratify=y keeps the proportion of each booking_complete class equal in both subsets.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20,random_state=120,stratify=y)
# Fit the scaler on the training rows only, then apply that same transform to the test rows.
scaler=StandardScaler()
X_train=scaler.fit_transform(X_train)
X_test=scaler.transform(X_test)

In [None]:
# Create a function for machine learning model
def model_bulding(model,X_train,X_test,y_train,y_test):
    """Fit ``model`` on the training split, print its test metrics and return the accuracy.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    X_train, y_train :
        Features and labels used to fit the model.
    X_test, y_test :
        Features and labels used to evaluate the fitted model.

    Returns
    -------
    float
        Accuracy on the test split as a fraction (the printed value is scaled
        to a percentage), so callers can collect and compare models.
    """
    print(f'Name of the {model}')
    model.fit(X_train,y_train)
    y_pred=model.predict(X_test)
    score=accuracy_score(y_test,y_pred)
    print(f'\nThe Accuracy_score is {score*100:.2f}')
    # zero_division=0 reports 0 (without a warning) for a class the model never predicts
    print(f'\n{classification_report(y_test,y_pred,zero_division=0)}')
    print(f'\n{confusion_matrix(y_test,y_pred)}')
    print('__'*20)
    return score

In [None]:
# Candidate classifiers, keyed by a short name, to be trained on the same split and compared.
# Estimators with randomised training get a fixed seed so that a Restart & Run All
# reproduces the same scores.
SEED=120
models={
    'logistic':LogisticRegression(),
    'decisiontree':DecisionTreeClassifier(random_state=SEED),
    'radnom':RandomForestClassifier(random_state=SEED),
    'Kneighbors':KNeighborsClassifier(),
    'xGB':XGBClassifier(random_state=SEED),
    # A single boosting iteration builds just one tree, which is not a fair
    # comparison with the other ensembles; use a realistic iteration budget and
    # silence the per-iteration training log instead.
    'Cat':CatBoostClassifier(iterations=200,random_seed=SEED,verbose=0)

}

In [None]:
# Iterate through each model and print its classification report and confusion matrix.
for name,model in models.items():
    # Print the dictionary key first so each report can be attributed to a model
    # even when the estimator's own repr is not descriptive.
    print(f'Model: {name}')
    model_bulding(model,X_train,X_test,y_train,y_test)

<font size="6">Final Thoughts about the project  </font><br>
The dataset reveals intriguing findings. A noteworthy observation is that a substantial 91% of individuals chose not to reserve their tickets, whereas a mere 9% exhibited an inclination towards booking. This underscores the urgency of augmenting the quality of supplementary services such as baggage handling, personalized seat selection, and meal choices, given their substantial influence on customers' choices. Additionally, an avenue to explore involves incorporating online advertising to bolster booking rates.

Moreover, a striking trend emerges in favor of round trips as the predominant choice. Leveraging this prevailing preference, it is advisable to channel efforts towards optimizing and enhancing the round trip experience. Simultaneously, the prospect of enticing incentives and advertising campaigns can be explored for both one-way and circular trips. By grasping these trends and preferences, our marketing strategies can be tailored to target distinct trip preferences, thus attracting a higher volume of customers to make bookings.