In [188]:
# import libraries
import pandas as pd

In [189]:
# Load the training data from the Excel workbook and preview the first two rows
df = pd.read_excel("Data_Train.xlsx")
df.head(2)

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662


# Data Profiling and inspection

### checking the data types

In [190]:
# List the dtype pandas inferred for each column
df.dtypes

Airline            object
Date_of_Journey    object
Source             object
Destination        object
Route              object
Dep_Time           object
Arrival_Time       object
Duration           object
Total_Stops        object
Additional_Info    object
Price               int64
dtype: object

* Every column has dtype `object` (strings) except `Price`, which is `int64`.

### Descriptive statistics

In [191]:
df.describe() # summary statistics (count, mean, std, min, quartiles, max) for the numeric columns

Unnamed: 0,Price
count,10683.0
mean,9087.064121
std,4611.359167
min,1759.0
25%,5277.0
50%,8372.0
75%,12373.0
max,79512.0


* The `Price` column has 10683 values.
* The mean price is 9087.064121.
* The median price (50th percentile) is 8372.
* The standard deviation of price is 4611.359167.
* The minimum price is 1759.
* 25% of the prices fall at or below 5277 (first quartile).
* 75% of the prices fall at or below 12373 (third quartile).
* The maximum price is 79512.

In [192]:
df.describe(include="object") # count / unique / top / freq for the object-dtype (string) columns

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info
count,10683,10683,10683,10683,10682,10683,10683,10683,10682,10683
unique,12,44,5,6,128,222,1343,368,5,10
top,Jet Airways,18/05/2019,Delhi,Cochin,DEL → BOM → COK,18:55,19:00,2h 50m,1 stop,No info
freq,3849,504,4537,4537,2376,233,423,550,5625,8345


* There are 12 unique airlines; Jet Airways is the most frequent, appearing 3849 times.
* 18/05/2019 is the most frequent date of journey, appearing 504 times.
* Delhi is the most frequent source, appearing 4537 times.
* Cochin is the most frequent destination, appearing 4537 times.
* DEL → BOM → COK is the most frequent route, appearing 2376 times.
* The most frequent departure and arrival times are 18:55 (233 times) and 19:00 (423 times), respectively.


### Shape

In [193]:
# (number of rows, number of columns) of the loaded frame
df.shape

(10683, 11)

* total number of rows is 10683.
* total number of columns is 11.

### Overview overall information

In [194]:
# Print a per-column summary: non-null counts, dtypes and overall memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10683 entries, 0 to 10682
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Airline          10683 non-null  object
 1   Date_of_Journey  10683 non-null  object
 2   Source           10683 non-null  object
 3   Destination      10683 non-null  object
 4   Route            10682 non-null  object
 5   Dep_Time         10683 non-null  object
 6   Arrival_Time     10683 non-null  object
 7   Duration         10683 non-null  object
 8   Total_Stops      10682 non-null  object
 9   Additional_Info  10683 non-null  object
 10  Price            10683 non-null  int64 
dtypes: int64(1), object(10)
memory usage: 918.2+ KB


* 'Airline', 'Date_of_Journey', 'Source', 'Destination', 'Route',
       'Dep_Time', 'Arrival_Time', 'Duration', 'Total_Stops',
       'Additional_Info' are string and 'Price' is int.
* Route has 10682 non-null values 
* Total_Stops has 10682 non-null values 

### Checking null value

In [195]:
# Number of missing values in each column
df.isnull().sum()

Airline            0
Date_of_Journey    0
Source             0
Destination        0
Route              1
Dep_Time           0
Arrival_Time       0
Duration           0
Total_Stops        1
Additional_Info    0
Price              0
dtype: int64

* `Route` and `Total_Stops` each have 1 missing value.

### Removing null rows

In [196]:
# Count the rows that contain at least one missing value
df.isnull().any(axis=1).sum()

1

In [197]:
# Show the rows that contain at least one missing value
df[df.isnull().any(axis=1)]

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
9039,Air India,6/05/2019,Delhi,Cochin,,09:45,09:25 07 May,23h 40m,,No info,7480


In [198]:
# Drop, in place, every row that has a missing value in any column
df.dropna(axis="index", how="any", inplace=True)

In [199]:
# Preview the frame after the rows with missing values were removed
df.head(2)

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662


### Converting Date_of_Journey to datetime

In [200]:
# Convert the journey dates from strings to datetime64.
# The raw values are day/month/year (e.g. "24/03/2019", "1/05/2019"). The
# format is given explicitly so pandas does not have to guess the day/month
# order, which is what produced the parsing warning when no format was given.
data_of_journey = df["Date_of_Journey"]
data_time = pd.to_datetime(data_of_journey, format="%d/%m/%Y")
df["Date_of_Journey"] = data_time

  data_time  = pd.to_datetime(data_of_journey)


### Extracting month and day and updating in original data

In [201]:
# Build the datetime index once and read both calendar parts from it
journey_index = pd.DatetimeIndex(df['Date_of_Journey'])
df['month'] = journey_index.month  # month number of the journey date
df['day'] = journey_index.day      # day of month of the journey date

In [202]:
# Preview the frame with the new month and day columns
df.head(2)

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,month,day
0,IndiGo,2019-03-24,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897,3,24
1,Air India,2019-05-01,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662,5,1


In [203]:
# Inspect the dtype of the derived month column
df['month'].dtype

dtype('int32')

### Creating Date_of_journey DataFrame

In [204]:
# Keep the journey date together with its month/day parts in a separate frame
journey_index = pd.DatetimeIndex(df['Date_of_Journey'])
Date_of_Journey = df[["Date_of_Journey"]].assign(
    month=journey_index.month,
    day=journey_index.day,
)
Date_of_Journey.head(2)

Unnamed: 0,Date_of_Journey,month,day
0,2019-03-24,3,24
1,2019-05-01,5,1


### Dropping Date_of_Journey from the original dataframe

In [205]:
# Remove the parsed date column from the main frame (month and day were extracted above)
del df["Date_of_Journey"]
df.head(2)

Unnamed: 0,Airline,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,month,day
0,IndiGo,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897,3,24
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662,5,1


### Extracting the  Dep_Time

In [206]:
# Parse the departure clock strings once, then read both components from the result
departure_clock = pd.DatetimeIndex(df['Dep_Time'])
dep_hours = departure_clock.hour      # hour part of Dep_Time
dep_minutes = departure_clock.minute  # minute part of Dep_Time

### Adding departure hour and minute columns to the original data

In [207]:
# Attach the extracted departure components to the main frame as new columns
for column_name, column_values in (
    ("Depature_hour", dep_hours),
    ("Depature_minute", dep_minutes),
):
    df[column_name] = column_values

In [208]:
# Preview the frame with the departure hour/minute columns added
df.head(2)

Unnamed: 0,Airline,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,month,day,Depature_hour,Depature_minute
0,IndiGo,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897,3,24,22,20
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662,5,1,5,50


### Creating Dep_Time DataFrame

In [209]:
# Keep the raw departure time together with its hour/minute parts in a separate frame
dep_clock = pd.DatetimeIndex(df['Dep_Time'])
Dep_Time = df[["Dep_Time"]].assign(
    Dep_hour=dep_clock.hour,
    Dep_minute=dep_clock.minute,
)
Dep_Time.head(2)

Unnamed: 0,Dep_Time,Dep_hour,Dep_minute
0,22:20,22,20
1,05:50,5,50


### Dropping the Dep_Time column from the original data

In [210]:
# Remove the raw departure-time column from the main frame, in place
df.drop("Dep_Time", axis=1, inplace=True)
df.head(2)

Unnamed: 0,Airline,Source,Destination,Route,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,month,day,Depature_hour,Depature_minute
0,IndiGo,Banglore,New Delhi,BLR → DEL,01:10 22 Mar,2h 50m,non-stop,No info,3897,3,24,22,20
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,13:15,7h 25m,2 stops,No info,7662,5,1,5,50


### Extracting arrival time

In [211]:
# Arrival_Time mixes plain clock times ("13:15") with values that carry a
# trailing day and month ("01:10 22 Mar"). Only the leading HH:MM token is
# needed here, so isolate it and parse it once with an explicit format
# instead of letting pandas guess the layout of every string (twice).
arrival_clock = pd.DatetimeIndex(
    pd.to_datetime(df['Arrival_Time'].str.split().str[0], format="%H:%M")
)
arr_hour = arrival_clock.hour  # Extracting hour from Arrival_Time
arr_minute = arrival_clock.minute  # Extracting minute from Arrival_Time

### adding arrival hours and minutes columns in original data

In [212]:
# Attach the extracted arrival components to the main frame as new columns
for column_name, column_values in (
    ("Arrival_hour", arr_hour),
    ("Arrival_minute", arr_minute),
):
    df[column_name] = column_values

In [213]:
# Preview the frame with the arrival hour/minute columns added
df.head(2)

Unnamed: 0,Airline,Source,Destination,Route,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,month,day,Depature_hour,Depature_minute,Arrival_hour,Arrival_minute
0,IndiGo,Banglore,New Delhi,BLR → DEL,01:10 22 Mar,2h 50m,non-stop,No info,3897,3,24,22,20,1,10
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,13:15,7h 25m,2 stops,No info,7662,5,1,5,50,13,15


### Creating Arrival_Time DataFrame

In [214]:
# Keep the raw arrival time together with its hour/minute parts in a separate frame
Arrival_time = df[["Arrival_Time"]].assign(
    Arrival_hour=arr_hour,
    Arrival_minute=arr_minute,
)
Arrival_time.head(2)

Unnamed: 0,Arrival_Time,Arrival_hour,Arrival_minute
0,01:10 22 Mar,1,10
1,13:15,13,15


### Dropping Arrival_Time from the original data

In [215]:
# Remove the raw arrival-time column from the main frame, in place
del df["Arrival_Time"]
df.head(2)

Unnamed: 0,Airline,Source,Destination,Route,Duration,Total_Stops,Additional_Info,Price,month,day,Depature_hour,Depature_minute,Arrival_hour,Arrival_minute
0,IndiGo,Banglore,New Delhi,BLR → DEL,2h 50m,non-stop,No info,3897,3,24,22,20,1,10
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,7h 25m,2 stops,No info,7662,5,1,5,50,13,15


### Converting duration according to timedelta

In [216]:
# Replace the duration strings (e.g. "2h 50m") with timedelta values
df["Duration"] = df["Duration"].pipe(pd.to_timedelta)

In [217]:
# Preview one row to confirm Duration is now shown as a timedelta
df.head(1)

Unnamed: 0,Airline,Source,Destination,Route,Duration,Total_Stops,Additional_Info,Price,month,day,Depature_hour,Depature_minute,Arrival_hour,Arrival_minute
0,IndiGo,Banglore,New Delhi,BLR → DEL,0 days 02:50:00,non-stop,No info,3897,3,24,22,20,1,10


### Extracting hour and minute from Duration

In [218]:
# Split each flight duration into whole hours and leftover minutes.
# `.dt.components.hours` only holds the 0-23 remainder; for a duration of 24h
# or more the overflow is stored in `.days`. Fold the day part back in so the
# hour count is the true total rather than the value modulo 24.
duration_parts = df["Duration"].dt.components
duration_hour = duration_parts.days * 24 + duration_parts.hours  # total whole hours of the duration
duration_minute = duration_parts.minutes  # remaining minutes after the whole hours

### adding new column in original data

In [219]:
# Attach the extracted duration components to the main frame as new columns
for column_name, column_values in (
    ("Duration_hour", duration_hour),
    ("Duration_minute", duration_minute),
):
    df[column_name] = column_values

In [220]:
# Preview the frame with the duration hour/minute columns added
df.head(2)

Unnamed: 0,Airline,Source,Destination,Route,Duration,Total_Stops,Additional_Info,Price,month,day,Depature_hour,Depature_minute,Arrival_hour,Arrival_minute,Duration_hour,Duration_minute
0,IndiGo,Banglore,New Delhi,BLR → DEL,0 days 02:50:00,non-stop,No info,3897,3,24,22,20,1,10,2,50
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,0 days 07:25:00,2 stops,No info,7662,5,1,5,50,13,15,7,25


### Creating Duration DataFrame

In [221]:
# Keep the timedelta duration together with its hour/minute parts in a separate frame
Duration = df[["Duration"]].assign(
    Duration_hour=duration_hour,
    Duration_minute=duration_minute,
)
Duration.head(2)

Unnamed: 0,Duration,Duration_hour,Duration_minute
0,0 days 02:50:00,2,50
1,0 days 07:25:00,7,25


### Dropping the "Duration" column from the original data

In [222]:
# Remove the timedelta Duration column from the main frame, in place
df.drop(labels=["Duration"], axis="columns", inplace=True)
df.head(2)

Unnamed: 0,Airline,Source,Destination,Route,Total_Stops,Additional_Info,Price,month,day,Depature_hour,Depature_minute,Arrival_hour,Arrival_minute,Duration_hour,Duration_minute
0,IndiGo,Banglore,New Delhi,BLR → DEL,non-stop,No info,3897,3,24,22,20,1,10,2,50
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,2 stops,No info,7662,5,1,5,50,13,15,7,25
