# Import The Packages

In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from pathlib import Path

# Load the Data

In [2]:
data_path = r'D:\Flight-Price-Prediction\data\flight_price.csv'

df = pd.read_csv(data_path)

df.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302


## Missing Values

In [3]:
# check for missing values in the data

df.isna().sum()

Airline            0
Date_of_Journey    0
Source             0
Destination        0
Route              1
Dep_Time           0
Arrival_Time       0
Duration           0
Total_Stops        1
Additional_Info    0
Price              0
dtype: int64

In [4]:
# print the rows of data that have missing values

(
    df.
    loc[
        df
        .isna()
        .any(axis=1)
    ]
   
)

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
9039,Air India,6/05/2019,Delhi,Cochin,,09:45,09:25 07 May,23h 40m,,No info,7480


In [5]:
# fill the route and total stops in the row

(
    df
    .loc[
        df
        .isna()
        .any(axis=1)
    ]
    .fillna({
        "Route":"No Route Info",
        "Total_Stops":"0"
    })
   
)

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
9039,Air India,6/05/2019,Delhi,Cochin,No Route Info,09:45,09:25 07 May,23h 40m,0,No info,7480


## Duplicate rows

In [6]:
# check for duplicates in the data

(
    df
    .duplicated()
    .sum()
)

220

**There are 220 rows in the data that are duplicate rows**

In [7]:
# check what rows are duplicated

(
    df
    .loc[
        df
        .duplicated(keep=False)
    ]
    .sort_values(by=df.columns[:5].to_list())
    .head(30)
)

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
6321,Air India,01/03/2019,Banglore,New Delhi,BLR → BOM → AMD → DEL,08:50,23:55 02 Mar,39h 5m,2 stops,No info,17135
9848,Air India,01/03/2019,Banglore,New Delhi,BLR → BOM → AMD → DEL,08:50,23:55 02 Mar,39h 5m,2 stops,No info,17135
572,Air India,03/03/2019,Banglore,New Delhi,BLR → DEL,21:10,23:55,2h 45m,non-stop,No info,7591
8168,Air India,03/03/2019,Banglore,New Delhi,BLR → DEL,21:10,23:55,2h 45m,non-stop,No info,7591
1495,Air India,1/04/2019,Kolkata,Banglore,CCU → DEL → COK → BLR,10:00,01:20 02 Apr,15h 20m,2 stops,No info,10408
9913,Air India,1/04/2019,Kolkata,Banglore,CCU → DEL → COK → BLR,10:00,01:20 02 Apr,15h 20m,2 stops,No info,10408
4603,Air India,1/05/2019,Kolkata,Banglore,CCU → DEL → COK → BLR,10:00,13:45 02 May,27h 45m,2 stops,No info,15164
5042,Air India,1/05/2019,Kolkata,Banglore,CCU → DEL → COK → BLR,10:00,13:45 02 May,27h 45m,2 stops,No info,15164
6377,Air India,1/05/2019,Kolkata,Banglore,CCU → DEL → COK → BLR,10:00,13:45 02 May,27h 45m,2 stops,No info,15164
3598,Air India,1/05/2019,Kolkata,Banglore,CCU → GAU → DEL → BLR,09:50,08:55 02 May,23h 5m,2 stops,No info,13227


**Here we an look at the duplicate entries**

## Data Types of columns


In [8]:
(
    df
    .dtypes
)

Airline            object
Date_of_Journey    object
Source             object
Destination        object
Route              object
Dep_Time           object
Arrival_Time       object
Duration           object
Total_Stops        object
Additional_Info    object
Price               int64
dtype: object

**All the columns are object and only the target `price` column is integer**

1. `Date_of_Journey` should be a datetime column.
2. `Dep_Time` and `Arrival_Time` should also carry the time information.
3. `Duration` should be a number column.
4. `Total_Stops` also seems to be an integer type column.

# Data Cleaning column by column

### Airline

In [9]:
(
    df
    .loc[:,'Airline']
    .str.replace(" Premium economy","")
    .str.replace(" Business","")
    .str.title()
    .str.replace(" ","_")
    .unique()
)

array(['Indigo', 'Air_India', 'Jet_Airways', 'Spicejet',
       'Multiple_Carriers', 'Goair', 'Vistara', 'Air_Asia', 'Trujet'],
      dtype=object)

In [10]:
(
    df
    .loc[:,'Airline']
    .str.replace(" Premium economy","")
    .str.replace(" Business","")
    .str.title()
    .str.replace(" ","_")
)

0             Indigo
1          Air_India
2        Jet_Airways
3             Indigo
4             Indigo
            ...     
10678       Air_Asia
10679      Air_India
10680    Jet_Airways
10681        Vistara
10682      Air_India
Name: Airline, Length: 10683, dtype: object

### Date of Journey

In [11]:
(
    pd
    .to_datetime(
        df.loc[:,'Date_of_Journey'], dayfirst=True
    )
)

0       2019-03-24
1       2019-05-01
2       2019-06-09
3       2019-05-12
4       2019-03-01
           ...    
10678   2019-04-09
10679   2019-04-27
10680   2019-04-27
10681   2019-03-01
10682   2019-05-09
Name: Date_of_Journey, Length: 10683, dtype: datetime64[ns]

### Source

In [12]:
(
    df
    .loc[:,'Source']
    .unique()
)

array(['Banglore', 'Kolkata', 'Delhi', 'Chennai', 'Mumbai'], dtype=object)

### Destination

In [13]:
(
    df
    .loc[:,'Destination']
    .str.replace("New ","")
    .unique()
)

array(['Delhi', 'Banglore', 'Cochin', 'Kolkata', 'Hyderabad'],
      dtype=object)

### Dep_time

In [14]:
df.columns

Index(['Airline', 'Date_of_Journey', 'Source', 'Destination', 'Route',
       'Dep_Time', 'Arrival_Time', 'Duration', 'Total_Stops',
       'Additional_Info', 'Price'],
      dtype='object')

In [15]:
(
    df
    .loc[:,'Dep_Time']
    .str.contains("[^0-9:]",regex=True)
    .sum()
)

0

In [16]:
(
    df
    .loc[
    df
    .Dep_Time
    .str.contains("[^0-9:]"),
    'Dep_Time'
    ]
)

Series([], Name: Dep_Time, dtype: object)

In [17]:
(
    pd
    .to_datetime(df.loc[:,'Dep_Time'],format='mixed')
    .dt.time
)

0        22:20:00
1        05:50:00
2        09:25:00
3        18:05:00
4        16:50:00
           ...   
10678    19:55:00
10679    20:45:00
10680    08:20:00
10681    11:30:00
10682    10:55:00
Name: Dep_Time, Length: 10683, dtype: object

### Arrival_time

In [18]:
(
    df
    .loc[
    df
    .Arrival_Time
    .str.contains("[^0-9:]",regex=True)
    ,'Arrival_Time']
    .str.split(" ",n=1)
    .str.get(-1)
)

0        22 Mar
2        10 Jun
6        13 Mar
7        02 Mar
8        13 Mar
          ...  
10666    13 Jun
10667    13 Mar
10672    28 Jun
10673    28 May
10674    13 Mar
Name: Arrival_Time, Length: 4335, dtype: object

In [19]:
(
    df
    .loc[
    df
    .Arrival_Time
    .str.contains("[^0-9:]",regex=True)
    ,'Arrival_Time']
    .str.split(" ",n=1)
    .str.get(-1)
    .shape
)

(4335,)

**Only 4335 rows contains the date information. Just because we have the date of journey, this infomation can be removed from the column**

In [20]:
(
    pd
    .to_datetime(df.Arrival_Time,format='mixed').dt.time
)

0        01:10:00
1        13:15:00
2        04:25:00
3        23:30:00
4        21:35:00
           ...   
10678    22:25:00
10679    23:20:00
10680    11:20:00
10681    14:10:00
10682    19:15:00
Name: Arrival_Time, Length: 10683, dtype: object

### Duration

In [21]:
(
    df
    .Duration
)

0        2h 50m
1        7h 25m
2           19h
3        5h 25m
4        4h 45m
          ...  
10678    2h 30m
10679    2h 35m
10680        3h
10681    2h 40m
10682    8h 20m
Name: Duration, Length: 10683, dtype: object

In [22]:
(
    df
    .Duration
    .loc[
    lambda ser : ~(
        ser
        .str.contains('h')
    )
    ]
)

6474    5m
Name: Duration, dtype: object

**A flight duration of 5m does not seems logical and lets investigate further**

In [23]:
(
    ~df
    .Duration
    .str.contains('h')
)


0        False
1        False
2        False
3        False
4        False
         ...  
10678    False
10679    False
10680    False
10681    False
10682    False
Name: Duration, Length: 10683, dtype: bool

In [24]:
(
    df
    .loc[
        ~df
        .Duration
        .str.contains('h')
    ]
    .index
)

Index([6474], dtype='int64')

In [25]:
(
    df
    .Duration
    .loc[
    lambda ser : ~(
        ser
        .str.contains('m')
    )
    ]
    .unique()
)

array(['19h', '23h', '22h', '12h', '3h', '5h', '10h', '18h', '24h', '15h',
       '16h', '8h', '14h', '20h', '13h', '11h', '9h', '27h', '26h', '4h',
       '7h', '30h', '21h', '28h', '47h', '6h', '25h', '38h', '34h'],
      dtype=object)

**The individual hour values seems to be fine and no cleaning required**

In [26]:
(
    df
    .Duration
    .drop(index=[6474])
    .str.split(" ",expand=True)
    .set_axis(['hours','minutes'],axis=1)
    .assign(**{
        'hours': lambda df_ : (
            df_
            .hours
            .str.replace("h","")
            .astype(int)
            .mul(60)
        ),
        'minutes': lambda df_ : (
            df_
            .minutes
            .str.replace('m',"")
            .fillna("0")
            .astype('int')
        )
        }
    )
    .sum(axis=1)
)

0         170
1         445
2        1140
3         325
4         285
         ... 
10678     150
10679     155
10680     180
10681     160
10682     500
Length: 10682, dtype: int64

In [27]:
def duration_in_minutes(ser):
    return (
        ser
        .str.split(" ",expand=True)
        .set_axis(['hours','minutes'],axis=1)
        .assign(**{
            'hours': lambda df_ : (
                df_
                .hours
                .str.replace("h","")
                .astype(int)
                .mul(60)
            ),
            'minutes': lambda df_ : (
                df_
                .minutes
                .str.replace('m',"")
                .fillna("0")
                .astype('int')
            )
            }
        )
        .sum(axis=1)
    )

### Total_stops

In [28]:
(
    df
    .Total_Stops
    .str.replace("non-stop","0")
    .str.replace(" stops?","",regex=True)
    .fillna("0")
    .astype(int)
)

0        0
1        2
2        2
3        1
4        1
        ..
10678    0
10679    0
10680    0
10681    0
10682    2
Name: Total_Stops, Length: 10683, dtype: int32

### Additional_Info

In [29]:
(
    df
    .Additional_Info
    .str.replace('No info',"No Info")
    .nunique()
)

9

# Data Cleaning process

In [30]:
def clean_data(df):
    return (
        df
        .drop_duplicates()
        .rename(mapper=str.lower,axis=1)
        .assign(**{
            'airline': lambda df_: (
                df_
                .loc[:,'airline']
                .str.replace(" Premium economy","")
                .str.replace(" Business","")
                .str.title()
                .str.replace(" ","_")
            ),
            'date_of_journey': lambda df_: (
                pd.to_datetime(df_.loc[:,'date_of_journey'],dayfirst=True)
            ),
            'destination': lambda df_: (
                df_
                .loc[:,'destination']
                .str.replace("New ","")
            )
        })
        .drop(columns='route')
        .rename({'dep_time':'departure_time'},axis=1)
        .assign(**{
            'departure_time': lambda df_: (
                pd
                .to_datetime(df_.loc[:,'departure_time'],format='mixed')
                .dt.time
            ),
            'arrival_time': lambda df_: (
                pd
                .to_datetime(df_.loc[:,'arrival_time'],format='mixed')
                .dt.time
            )
        })
        .drop(index=[6474])
        .assign(**{
             'duration': lambda df_ : (
                 df_
                 .duration
                 .pipe(duration_in_minutes)
             ),
            'total_stops': lambda df_: (
                df_
                .total_stops
                .str.replace("non-stop","0")
                .str.replace(" stops?","",regex=True)
                .fillna("0")
                .astype(int)
            ),
            'additional_info': lambda df_:(
                df_
                .additional_info
                .str.replace('No info',"No Info")
            )
        }
        )
    )

In [31]:
cleaned_data = clean_data(df)

cleaned_data

Unnamed: 0,airline,date_of_journey,source,destination,departure_time,arrival_time,duration,total_stops,additional_info,price
0,Indigo,2019-03-24,Banglore,Delhi,22:20:00,01:10:00,170,0,No Info,3897
1,Air_India,2019-05-01,Kolkata,Banglore,05:50:00,13:15:00,445,2,No Info,7662
2,Jet_Airways,2019-06-09,Delhi,Cochin,09:25:00,04:25:00,1140,2,No Info,13882
3,Indigo,2019-05-12,Kolkata,Banglore,18:05:00,23:30:00,325,1,No Info,6218
4,Indigo,2019-03-01,Banglore,Delhi,16:50:00,21:35:00,285,1,No Info,13302
...,...,...,...,...,...,...,...,...,...,...
10678,Air_Asia,2019-04-09,Kolkata,Banglore,19:55:00,22:25:00,150,0,No Info,4107
10679,Air_India,2019-04-27,Kolkata,Banglore,20:45:00,23:20:00,155,0,No Info,4145
10680,Jet_Airways,2019-04-27,Banglore,Delhi,08:20:00,11:20:00,180,0,No Info,7229
10681,Vistara,2019-03-01,Banglore,Delhi,11:30:00,14:10:00,160,0,No Info,12648


In [32]:
(
    cleaned_data
    .loc[:,'airline']
    .unique()
)

cleaned_data.dtypes

airline                    object
date_of_journey    datetime64[ns]
source                     object
destination                object
departure_time             object
arrival_time               object
duration                    int64
total_stops                 int32
additional_info            object
price                       int64
dtype: object

In [33]:
(
    cleaned_data
    .loc[:,'destination']
    .unique()
)

array(['Delhi', 'Banglore', 'Cochin', 'Kolkata', 'Hyderabad'],
      dtype=object)

In [34]:
6474 in cleaned_data.index

False

# Split the Data

In [36]:
train_data, test_data = train_test_split(df,test_size=0.2,random_state=42)

print('The shape of training data is ',train_data.shape)
print('The shape of test data is ',test_data.shape)

The shape of training data is  (8546, 11)
The shape of test data is  (2137, 11)


# Save the Data

In [38]:
def save_dataframe(filename,dataframe):
    save_path = Path(r"D:\Flight-Price-Prediction\new_notebooks\new_data") 
    full_path = save_path / filename
    dataframe.to_csv(full_path,index=False)

In [39]:
# save the train data

save_dataframe(filename='train.csv',
              dataframe=train_data)

# save the test data
save_dataframe(filename='test.csv',
              dataframe=test_data)