# Import the Packages

In [1]:
import numpy as np

import pandas as pd

import matplotlib.pyplot as plt

import seaborn as sns

# Read the Data

In [2]:
# Location of the raw flights CSV.
# NOTE(review): this is an absolute, machine-specific Windows path, so the notebook
# cannot be re-run on another machine without editing it — consider a path relative
# to the project root.
data_path = r"D:\Flight-Price-Prediction\data\flight_price.csv"

# load the raw data; flights_df is the frame every later cell reads from
flights_df = pd.read_csv(data_path)

In [3]:
# preview the first five rows of the raw data
flights_df.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302


**Observations**:
1. The `Airline` column has all the airline name categories.
2. The `Date_of_Journey` column holds the journey dates as strings in day-first (`DD/MM/YYYY`) format.
3. The `Source` and `Destination` columns contain city names and are categorical in nature.
4. The `Route` column marks the route of all the flights with information of the stops included along with it.
5. The `Dep_Time` and `Arrival_Time` columns need some cleaning, as they appear to mix time-only values with values that carry both time and date information.
6. The `Duration` column has the flight duration in hours and minutes and has to be converted to a single unit for better interpretation.
7. The `Total_Stops` column is also categorical in nature; it stores the number of stops as text and should be converted to an integer giving the number of stops the flight makes along its complete route.
8. The `Additional_Info` column has to be investigated further.
9. The `Price` column is the target column and has price as integers.

In [6]:
# report the number of rows (shape[0]) and columns (shape[1]) in the raw data
print(f'The shape of the flights data has {flights_df.shape[0]} rows and {flights_df.shape[1]} columns')

The shape of the flights data has 10683 rows and 11 columns


In [7]:
# list the column names exactly as loaded from the CSV
flights_df.columns

Index(['Airline', 'Date_of_Journey', 'Source', 'Destination', 'Route',
       'Dep_Time', 'Arrival_Time', 'Duration', 'Total_Stops',
       'Additional_Info', 'Price'],
      dtype='object')

In [5]:
# inspect a random sample of rows; random_state pins the draw so the same rows
# are shown every time the notebook is re-run from a fresh kernel
flights_df.sample(20, random_state=42)

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
5847,Jet Airways,27/05/2019,Delhi,Cochin,DEL → BOM → COK,22:50,19:00 28 May,20h 10m,1 stop,No info,16079
9853,IndiGo,6/04/2019,Banglore,Delhi,BLR → DEL,10:30,13:20,2h 50m,non-stop,No info,4423
5273,Air Asia,12/04/2019,Banglore,Delhi,BLR → DEL,23:55,02:45 13 Apr,2h 50m,non-stop,No info,7080
5701,Air India,01/03/2019,Banglore,New Delhi,BLR → BOM → BHO → DEL,08:50,23:25 02 Mar,38h 35m,2 stops,No info,25430
9511,Vistara,12/03/2019,Mumbai,Hyderabad,BOM → DEL → HYD,07:30,19:55,12h 25m,1 stop,No info,12080
284,IndiGo,6/05/2019,Kolkata,Banglore,CCU → BLR,21:25,00:05 07 May,2h 40m,non-stop,No info,4804
9745,Jet Airways,9/05/2019,Delhi,Cochin,DEL → BOM → COK,08:00,04:25 10 May,20h 25m,1 stop,In-flight meal not included,12373
4419,Jet Airways,18/04/2019,Banglore,Delhi,BLR → DEL,17:45,20:45,3h,non-stop,No info,7229
3054,Jet Airways,18/06/2019,Banglore,Delhi,BLR → DEL,08:20,11:20,3h,non-stop,No info,8016
9941,SpiceJet,24/03/2019,Mumbai,Hyderabad,BOM → HYD,13:10,14:30,1h 20m,non-stop,No info,2753


In [9]:
# data types of columns in the data — every column except Price is read as object

flights_df.dtypes

Airline            object
Date_of_Journey    object
Source             object
Destination        object
Route              object
Dep_Time           object
Arrival_Time       object
Duration           object
Total_Stops        object
Additional_Info    object
Price               int64
dtype: object

In [10]:
# info about the dataframe: per-column non-null counts, dtypes and memory usage

flights_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10683 entries, 0 to 10682
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Airline          10683 non-null  object
 1   Date_of_Journey  10683 non-null  object
 2   Source           10683 non-null  object
 3   Destination      10683 non-null  object
 4   Route            10682 non-null  object
 5   Dep_Time         10683 non-null  object
 6   Arrival_Time     10683 non-null  object
 7   Duration         10683 non-null  object
 8   Total_Stops      10682 non-null  object
 9   Additional_Info  10683 non-null  object
 10  Price            10683 non-null  int64 
dtypes: int64(1), object(10)
memory usage: 918.2+ KB


In [11]:
# count the missing values in each column
# (only Route and Total_Stops have any — one each)

flights_df.isna().sum()

Airline            0
Date_of_Journey    0
Source             0
Destination        0
Route              1
Dep_Time           0
Arrival_Time       0
Duration           0
Total_Stops        1
Additional_Info    0
Price              0
dtype: int64

In [27]:
# show only the rows that contain at least one missing value

flights_df[flights_df.isna().any(axis=1)]

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
9039,Air India,6/05/2019,Delhi,Cochin,,09:45,09:25 07 May,23h 40m,,No info,7480


Index value `9039` has missing values in columns `Route` and `Total_Stops`.

In [39]:
# count the rows that are exact repeats of an earlier row

flights_df.duplicated().sum()

220

**220 rows in the data are duplicated**

In [47]:
# list every copy of each duplicated row (keep=False marks all occurrences),
# ordered by journey date and airline; Date_of_Journey is still a string here,
# so the date ordering is lexicographic rather than chronological
flights_df[flights_df.duplicated(keep=False)].sort_values(by=['Date_of_Journey', 'Airline'])

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
6321,Air India,01/03/2019,Banglore,New Delhi,BLR → BOM → AMD → DEL,08:50,23:55 02 Mar,39h 5m,2 stops,No info,17135
9848,Air India,01/03/2019,Banglore,New Delhi,BLR → BOM → AMD → DEL,08:50,23:55 02 Mar,39h 5m,2 stops,No info,17135
572,Air India,03/03/2019,Banglore,New Delhi,BLR → DEL,21:10,23:55,2h 45m,non-stop,No info,7591
8168,Air India,03/03/2019,Banglore,New Delhi,BLR → DEL,21:10,23:55,2h 45m,non-stop,No info,7591
1552,IndiGo,03/03/2019,Banglore,New Delhi,BLR → DEL,16:55,19:45,2h 50m,non-stop,No info,8855
...,...,...,...,...,...,...,...,...,...,...,...
8446,Jet Airways,9/06/2019,Delhi,Cochin,DEL → NAG → BOM → COK,06:45,12:35 10 Jun,29h 50m,2 stops,No info,13376
8496,Jet Airways,9/06/2019,Delhi,Cochin,DEL → JAI → BOM → COK,09:40,12:35 10 Jun,26h 55m,2 stops,No info,13014
8967,Jet Airways,9/06/2019,Delhi,Cochin,DEL → JAI → BOM → COK,09:40,04:25 10 Jun,18h 45m,2 stops,In-flight meal not included,10368
9191,Jet Airways,9/06/2019,Delhi,Cochin,DEL → JAI → BOM → COK,09:40,12:35 10 Jun,26h 55m,2 stops,In-flight meal not included,10368


In [61]:
# preview the column names after lower-casing them (flights_df itself is unchanged)
flights_df.rename(columns=str.lower).columns

Index(['airline', 'date_of_journey', 'source', 'destination', 'route',
       'dep_time', 'arrival_time', 'duration', 'total_stops',
       'additional_info', 'price'],
      dtype='object')

## Airline

In [76]:
# Preview the airline categories after normalisation; flights_df is not modified.
# The order of the steps matters: fare-class variants are folded into their base
# carrier first, then whitespace is stripped, spaces become underscores, and the
# result is title-cased.
(
    flights_df['Airline']
    .str.replace('Vistara Premium economy','Vistara')  # fold fare-class variant into base carrier
    .str.replace('Multiple carriers Premium economy','Multiple carriers')
    .str.replace('Jet Airways Business','Jet Airways')
    .str.strip()  # remove leading/trailing whitespace
    .str.replace(' ','_')  # join multi-word names with an underscore
    .str.title()  # e.g. 'IndiGo' -> 'Indigo'
    .unique()  # distinct categories that remain
)

array(['Indigo', 'Air_India', 'Jet_Airways', 'Spicejet',
       'Multiple_Carriers', 'Goair', 'Vistara', 'Air_Asia', 'Trujet'],
      dtype=object)

**Steps**
1. Replace the duplicate flight carriers and make them one.
2. Strip extra whitespaces from categories.
3. Replace the space between two words with "_".
4. Convert the words into Title case.

## Date_of_Journey

In [87]:
# parse the journey dates, reading the first number as the day (DD/MM/YYYY)
pd.to_datetime(flights_df['Date_of_Journey'], dayfirst=True)

0       2019-03-24
1       2019-05-01
2       2019-06-09
3       2019-05-12
4       2019-03-01
           ...    
10678   2019-04-09
10679   2019-04-27
10680   2019-04-27
10681   2019-03-01
10682   2019-05-09
Name: Date_of_Journey, Length: 10683, dtype: datetime64[ns]

**Steps:**
1. Convert the column to datetime.
2. Extract the month and day information out of the datetime column.
3. Drop the original column after the extraction step.

## Source

In [102]:
# distinct source cities
flights_df['Source'].unique()

array(['Banglore', 'Kolkata', 'Delhi', 'Chennai', 'Mumbai'], dtype=object)

In [97]:
# number of flights departing from each source city
flights_df['Source'].value_counts()

Source
Delhi       4537
Kolkata     2871
Banglore    2197
Mumbai       697
Chennai      381
Name: count, dtype: int64

**No data cleaning steps to be performed in `Source` column**

## Destination

In [99]:
# distinct destination cities
flights_df['Destination'].unique()

array(['New Delhi', 'Banglore', 'Cochin', 'Kolkata', 'Delhi', 'Hyderabad'],
      dtype=object)

**Steps**
1. Merge the `New Delhi` and `Delhi` categories together.

In [100]:
# number of flights arriving at each destination city
flights_df['Destination'].value_counts()

Destination
Cochin       4537
Banglore     2871
Delhi        1265
New Delhi     932
Hyderabad     697
Kolkata       381
Name: count, dtype: int64

In [105]:
# destination counts after folding 'New Delhi' into 'Delhi' (flights_df is unchanged)
flights_df['Destination'].str.replace('New Delhi', 'Delhi').value_counts()

Destination
Cochin       4537
Banglore     2871
Delhi        2197
Hyderabad     697
Kolkata       381
Name: count, dtype: int64

In [101]:
# sanity check: the 'New Delhi' (932) and 'Delhi' (1265) counts should add up
# to the merged 'Delhi' count of 2197
932 + 1265

2197

## Dep_Time

In [119]:
# NOTE(review): repeats the column listing already shown earlier in the notebook
flights_df.columns

Index(['Airline', 'Date_of_Journey', 'Source', 'Destination', 'Route',
       'Dep_Time', 'Arrival_Time', 'Duration', 'Total_Stops',
       'Additional_Info', 'Price'],
      dtype='object')

In [136]:
# parse the departure strings as datetimes and keep only the time-of-day part
pd.to_datetime(flights_df['Dep_Time'], format='mixed').dt.time

0        22:20:00
1        05:50:00
2        09:25:00
3        18:05:00
4        16:50:00
           ...   
10678    19:55:00
10679    20:45:00
10680    08:20:00
10681    11:30:00
10682    10:55:00
Name: Dep_Time, Length: 10683, dtype: object

**Convert the column to datetime and extract the time information out of it**

# Data Cleaning

In [137]:
def data_cleaning(df: pd.DataFrame):
    """Return a cleaned copy of the raw flights frame.

    The steps are applied in this order:

    1. drop exact duplicate rows and lower-case every column name;
    2. normalise ``airline``: fold the fare-class variants into their base
       carrier, strip whitespace, join words with ``_`` and title-case;
    3. parse ``date_of_journey`` as a day-first date, derive
       ``day_of_journey`` and ``month_of_journey`` from it, then drop it;
    4. replace ``'New Delhi'`` with ``'Delhi'`` in ``destination``;
    5. drop the ``route`` column;
    6. rename ``dep_time`` to ``departure_time`` and convert it to
       time-of-day values.

    The input frame is left untouched and the row index is not reset, so
    index labels of dropped duplicates stay missing in the result.
    """
    # (verbose name, base carrier) pairs, applied in this order
    fare_class_aliases = (
        ('Vistara Premium economy', 'Vistara'),
        ('Multiple carriers Premium economy', 'Multiple carriers'),
        ('Jet Airways Business', 'Jet Airways'),
    )

    def _tidy_airline(frame):
        # fold fare-class variants first, then normalise the spelling
        names = frame['airline']
        for verbose_name, base_name in fare_class_aliases:
            names = names.str.replace(verbose_name, base_name)
        return names.str.strip().str.replace(' ', '_').str.title()

    def _journey_date(frame):
        # dates are written day first, e.g. 24/03/2019
        return pd.to_datetime(frame['date_of_journey'], dayfirst=True)

    def _departure_clock(frame):
        # keep only the time-of-day part of the parsed departure
        return pd.to_datetime(frame['departure_time'], format='mixed').dt.time

    deduped = df.drop_duplicates().rename(columns=str.lower)

    # existing columns are overwritten in place; the two new date parts are
    # appended at the end, which fixes the final column order
    enriched = deduped.assign(
        airline=_tidy_airline,
        date_of_journey=_journey_date,
        day_of_journey=lambda frame: frame['date_of_journey'].dt.day,
        month_of_journey=lambda frame: frame['date_of_journey'].dt.month,
        destination=lambda frame: frame['destination'].str.replace('New Delhi', 'Delhi'),
    )

    trimmed = (
        enriched
        .drop(columns=['date_of_journey', 'route'])
        .rename(columns={'dep_time': 'departure_time'})
    )
    return trimmed.assign(departure_time=_departure_clock)

In [138]:
# run the cleaning pipeline on the raw frame; the result is only displayed here, not stored
data_cleaning(flights_df)

Unnamed: 0,airline,source,destination,departure_time,arrival_time,duration,total_stops,additional_info,price,day_of_journey,month_of_journey
0,Indigo,Banglore,Delhi,22:20:00,01:10 22 Mar,2h 50m,non-stop,No info,3897,24,3
1,Air_India,Kolkata,Banglore,05:50:00,13:15,7h 25m,2 stops,No info,7662,1,5
2,Jet_Airways,Delhi,Cochin,09:25:00,04:25 10 Jun,19h,2 stops,No info,13882,9,6
3,Indigo,Kolkata,Banglore,18:05:00,23:30,5h 25m,1 stop,No info,6218,12,5
4,Indigo,Banglore,Delhi,16:50:00,21:35,4h 45m,1 stop,No info,13302,1,3
...,...,...,...,...,...,...,...,...,...,...,...
10678,Air_Asia,Kolkata,Banglore,19:55:00,22:25,2h 30m,non-stop,No info,4107,9,4
10679,Air_India,Kolkata,Banglore,20:45:00,23:20,2h 35m,non-stop,No info,4145,27,4
10680,Jet_Airways,Banglore,Delhi,08:20:00,11:20,3h,non-stop,No info,7229,27,4
10681,Vistara,Banglore,Delhi,11:30:00,14:10,2h 40m,non-stop,No info,12648,1,3
