## Importing Libraries

In [2]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

## Read_Data

In [3]:
project_dir = "/Users/mukulagarwal/Desktop/Python_Code/flights_sagemaker_project"
data_dir = "Data"

def get_data(name):
    file_name = f"{name}.csv"
    file_path = os.path.join(project_dir,data_dir,file_name)
    return pd.read_csv(file_path)

In [4]:
flights = get_data("flight_price")
flights

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302
...,...,...,...,...,...,...,...,...,...,...,...
10678,Air Asia,9/04/2019,Kolkata,Banglore,CCU → BLR,19:55,22:25,2h 30m,non-stop,No info,4107
10679,Air India,27/04/2019,Kolkata,Banglore,CCU → BLR,20:45,23:20,2h 35m,non-stop,No info,4145
10680,Jet Airways,27/04/2019,Banglore,Delhi,BLR → DEL,08:20,11:20,3h,non-stop,No info,7229
10681,Vistara,01/03/2019,Banglore,New Delhi,BLR → DEL,11:30,14:10,2h 40m,non-stop,No info,12648


## Preliminary Analysis

In [5]:
flights.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10683 entries, 0 to 10682
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Airline          10683 non-null  object
 1   Date_of_Journey  10683 non-null  object
 2   Source           10683 non-null  object
 3   Destination      10683 non-null  object
 4   Route            10682 non-null  object
 5   Dep_Time         10683 non-null  object
 6   Arrival_Time     10683 non-null  object
 7   Duration         10683 non-null  object
 8   Total_Stops      10682 non-null  object
 9   Additional_Info  10683 non-null  object
 10  Price            10683 non-null  int64 
dtypes: int64(1), object(10)
memory usage: 918.2+ KB


In [6]:
flights.isnull().sum()

Airline            0
Date_of_Journey    0
Source             0
Destination        0
Route              1
Dep_Time           0
Arrival_Time       0
Duration           0
Total_Stops        1
Additional_Info    0
Price              0
dtype: int64

Observations
- The dataset has 10,683 rows and 11 features
- Columns "route" and "Total_Stops" has 1 missing value
- Date_of_Journey, Dep_Time and Arrival_Time have object data type should have date_time data type

### Check for data types

In [7]:
flights.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302


In [8]:
flights.dtypes

Airline            object
Date_of_Journey    object
Source             object
Destination        object
Route              object
Dep_Time           object
Arrival_Time       object
Duration           object
Total_Stops        object
Additional_Info    object
Price               int64
dtype: object

In [9]:
value = flights.loc[8,'Date_of_Journey']
print(type(value))

<class 'str'>


In [10]:
value = flights.loc[8,'Dep_Time']
print(type(value))

<class 'str'>


Observation
- `Date of Journery` : is object data type, should have been date_time
- `Dep_time` and `Arrival_time` should be changed to Date_Time
- The type of `duration` feature is mixed. It should be integer.
- Also same is the cases with `total stops` where we have mixed data types. Integer + string formatted as string. We can extract data from this column and convert this to integers 

### Check for Duplicates

In [11]:
flights.duplicated().sum()

220

In [12]:
(
    flights.loc[flights.duplicated(keep=False)].
    sort_values(["Airline","Date_of_Journey","Source","Destination"])
)

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
6321,Air India,01/03/2019,Banglore,New Delhi,BLR → BOM → AMD → DEL,08:50,23:55 02 Mar,39h 5m,2 stops,No info,17135
9848,Air India,01/03/2019,Banglore,New Delhi,BLR → BOM → AMD → DEL,08:50,23:55 02 Mar,39h 5m,2 stops,No info,17135
572,Air India,03/03/2019,Banglore,New Delhi,BLR → DEL,21:10,23:55,2h 45m,non-stop,No info,7591
8168,Air India,03/03/2019,Banglore,New Delhi,BLR → DEL,21:10,23:55,2h 45m,non-stop,No info,7591
1495,Air India,1/04/2019,Kolkata,Banglore,CCU → DEL → COK → BLR,10:00,01:20 02 Apr,15h 20m,2 stops,No info,10408
...,...,...,...,...,...,...,...,...,...,...,...
2692,SpiceJet,24/03/2019,Banglore,New Delhi,BLR → DEL,05:45,08:35,2h 50m,non-stop,No check-in baggage included,4273
2870,SpiceJet,24/03/2019,Banglore,New Delhi,BLR → DEL,05:45,08:35,2h 50m,non-stop,No check-in baggage included,4273
3711,SpiceJet,24/03/2019,Banglore,New Delhi,BLR → DEL,20:30,23:20,2h 50m,non-stop,No check-in baggage included,3873
2634,Vistara,24/03/2019,Banglore,New Delhi,BLR → DEL,11:30,14:10,2h 40m,non-stop,No info,5403


Observations
- There are 220 duplicates. These must be removed

## Detailed Analysis

In [13]:
flights.columns

Index(['Airline', 'Date_of_Journey', 'Source', 'Destination', 'Route',
       'Dep_Time', 'Arrival_Time', 'Duration', 'Total_Stops',
       'Additional_Info', 'Price'],
      dtype='object')

In [73]:
[len(i) for i in "2,9".split(',')]

[1, 1]

In [51]:
f(9)

[1, 1]

In [81]:
import re
flights['Duration'].apply(lambda x:re.findall((r'\d+'),x))

0        [2, 50]
1        [7, 25]
2           [19]
3        [5, 25]
4        [4, 45]
          ...   
10678    [2, 30]
10679    [2, 35]
10680        [3]
10681    [2, 40]
10682    [8, 20]
Name: Duration, Length: 10683, dtype: object

In [72]:
import re
flights.assign(duration = [(x[0],x[1]) if len(x)>1 else x[0] for x in flights['Duration'].str.findall(r'\d+')], 
               arrival_time_month = lambda df_ : df_['Arrival_Time'].str.findall(r'\d{2}:\d{2}').str.get(0).astype('datetime64[ns]').dt.time 
               )
 

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,duration,arrival_time_month
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897,"(2, 50)",01:10:00
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662,"(7, 25)",13:15:00
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882,19,04:25:00
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218,"(5, 25)",23:30:00
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302,"(4, 45)",21:35:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10678,Air Asia,9/04/2019,Kolkata,Banglore,CCU → BLR,19:55,22:25,2h 30m,non-stop,No info,4107,"(2, 30)",22:25:00
10679,Air India,27/04/2019,Kolkata,Banglore,CCU → BLR,20:45,23:20,2h 35m,non-stop,No info,4145,"(2, 35)",23:20:00
10680,Jet Airways,27/04/2019,Banglore,Delhi,BLR → DEL,08:20,11:20,3h,non-stop,No info,7229,3,11:20:00
10681,Vistara,01/03/2019,Banglore,New Delhi,BLR → DEL,11:30,14:10,2h 40m,non-stop,No info,12648,"(2, 40)",14:10:00


In [310]:
import re
flights.assign(duration = lambda df_ :[int(x[0])*60 + int(x[1]) if len(x)>1 else int(x[0])*60 for x in df_['Duration'].str.findall(r'\d+')], 
               arrival_time_month = lambda df_ : df_['Arrival_Time'].str.findall(r'\d{2}:\d{2}').str.get(0).astype('datetime64[ns]').dt.time 
               )
 

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,duration,arrival_time_month
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897,170,01:10:00
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662,445,13:15:00
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882,1140,04:25:00
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218,325,23:30:00
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302,285,21:35:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10678,Air Asia,9/04/2019,Kolkata,Banglore,CCU → BLR,19:55,22:25,2h 30m,non-stop,No info,4107,150,22:25:00
10679,Air India,27/04/2019,Kolkata,Banglore,CCU → BLR,20:45,23:20,2h 35m,non-stop,No info,4145,155,23:20:00
10680,Jet Airways,27/04/2019,Banglore,Delhi,BLR → DEL,08:20,11:20,3h,non-stop,No info,7229,180,11:20:00
10681,Vistara,01/03/2019,Banglore,New Delhi,BLR → DEL,11:30,14:10,2h 40m,non-stop,No info,12648,160,14:10:00


In [96]:
list(flights.columns[flights.dtypes=='object'])

['Airline',
 'Date_of_Journey',
 'Source',
 'Destination',
 'Route',
 'Dep_Time',
 'Arrival_Time',
 'Duration',
 'Total_Stops',
 'Additional_Info']

In [97]:
flights.filter(items=list(flights.columns[flights.dtypes=='object']),axis=1)

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info
...,...,...,...,...,...,...,...,...,...,...
10678,Air Asia,9/04/2019,Kolkata,Banglore,CCU → BLR,19:55,22:25,2h 30m,non-stop,No info
10679,Air India,27/04/2019,Kolkata,Banglore,CCU → BLR,20:45,23:20,2h 35m,non-stop,No info
10680,Jet Airways,27/04/2019,Banglore,Delhi,BLR → DEL,08:20,11:20,3h,non-stop,No info
10681,Vistara,01/03/2019,Banglore,New Delhi,BLR → DEL,11:30,14:10,2h 40m,non-stop,No info


In [98]:
{
    col : flights[col].str.strip()
    for col in flights.select_dtypes(include='O').columns
}       

{'Airline': 0             IndiGo
 1          Air India
 2        Jet Airways
 3             IndiGo
 4             IndiGo
             ...     
 10678       Air Asia
 10679      Air India
 10680    Jet Airways
 10681        Vistara
 10682      Air India
 Name: Airline, Length: 10683, dtype: object,
 'Date_of_Journey': 0        24/03/2019
 1         1/05/2019
 2         9/06/2019
 3        12/05/2019
 4        01/03/2019
             ...    
 10678     9/04/2019
 10679    27/04/2019
 10680    27/04/2019
 10681    01/03/2019
 10682     9/05/2019
 Name: Date_of_Journey, Length: 10683, dtype: object,
 'Source': 0        Banglore
 1         Kolkata
 2           Delhi
 3         Kolkata
 4        Banglore
            ...   
 10678     Kolkata
 10679     Kolkata
 10680    Banglore
 10681    Banglore
 10682       Delhi
 Name: Source, Length: 10683, dtype: object,
 'Destination': 0        New Delhi
 1         Banglore
 2           Cochin
 3         Banglore
 4        New Delhi
            ...   

In [84]:
flights['Airline'].unique()

array(['IndiGo', 'Air India', 'Jet Airways', 'SpiceJet',
       'Multiple carriers', 'GoAir', 'Vistara', 'Air Asia',
       'Vistara Premium economy', 'Jet Airways Business',
       'Multiple carriers Premium economy', 'Trujet'], dtype=object)

In [89]:
(
    flights['Airline']
    .str.replace(" Premium economy", " ")
    .str.replace(" Business", " ")
    .str.title()
    .unique()
)

array(['Indigo', 'Air India', 'Jet Airways', 'Spicejet',
       'Multiple Carriers', 'Goair', 'Vistara', 'Air Asia', 'Vistara ',
       'Jet Airways ', 'Multiple Carriers ', 'Trujet'], dtype=object)

In [95]:
flights['Date_of_Journey'].astype('datetime64[ns]')

0       2019-03-24
1       2019-01-05
2       2019-09-06
3       2019-12-05
4       2019-01-03
           ...    
10678   2019-09-04
10679   2019-04-27
10680   2019-04-27
10681   2019-01-03
10682   2019-09-05
Name: Date_of_Journey, Length: 10683, dtype: datetime64[ns]

In [105]:
flights['Source'].unique()

array(['Banglore', 'Kolkata', 'Delhi', 'Chennai', 'Mumbai'], dtype=object)

In [106]:
flights['Destination'].unique()

array(['New Delhi', 'Banglore', 'Cochin', 'Kolkata', 'Delhi', 'Hyderabad'],
      dtype=object)

In [109]:
flights['Dep_Time']

0        22:20
1        05:50
2        09:25
3        18:05
4        16:50
         ...  
10678    19:55
10679    20:45
10680    08:20
10681    11:30
10682    10:55
Name: Dep_Time, Length: 10683, dtype: object

In [195]:
(
    flights['Dep_Time']
    .loc[lambda ser: ser.str.contains("[^0-9:]")]
)

Series([], Name: Dep_Time, dtype: object)

In [134]:
flights['Arrival_Time'].loc[lambda ser: ser.str.contains("[^0-9:]")].str.split(" ",n=1).str.get(1).unique()

array(['22 Mar', '10 Jun', '13 Mar', '02 Mar', '10 May', '04 Mar',
       '13 Jun', '28 May', '19 Mar', '07 May', '02 Jun', '16 Jun',
       '19 May', '16 May', '28 Jun', '02 May', '28 Mar', '19 Jun',
       '04 Apr', '25 Mar', '07 Mar', '25 Jun', '07 Jun', '25 May',
       '13 May', '16 Mar', '22 May', '10 Apr', '04 Jun', '20 May',
       '28 Apr', '25 Apr', '10 Mar', '19 Apr', '13 Apr', '02 Apr',
       '23 Mar', '22 Apr', '11 May', '07 Apr', '03 May', '08 Mar',
       '03 Mar', '05 Mar', '22 Jun', '04 May', '26 May', '16 Apr',
       '26 Jun', '29 May', '29 Jun', '29 Mar', '23 May', '17 Jun'],
      dtype=object)

In [143]:
(
    flights['Duration']
    .loc[lambda ser: ~ser.str.contains("m")]
)

2        19h
18       23h
33       22h
44       12h
53        3h
        ... 
10591    23h
10638    14h
10639    38h
10673    15h
10680     3h
Name: Duration, Length: 1031, dtype: object

In [144]:
(
    flights['Duration']
    .loc[lambda ser: ~ser.str.contains("h")]
)

6474    5m
Name: Duration, dtype: object

In [146]:
flights.iloc[[6474]]

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
6474,Air India,6/03/2019,Mumbai,Hyderabad,BOM → GOI → PNQ → HYD,16:50,16:55,5m,2 stops,No info,17327


Observation
- The data record for index 6474, has duration for 5 minutes. This is clearly wrong, so we will drop it

In [162]:
(
    flights['Duration']
    .drop(index = [6474])
    .str.split(" ",expand = True)
    .set_axis(["hour","minutes"],axis=1)
    .assign(
        hour = lambda df_:df_['hour'].str.replace('h', "").astype(int).mul(60)
    )
    .assign(
        minutes = lambda df_:df_['minutes'].str.replace('m',"").fillna(0).astype(int)
    )
    .sum(axis=1)
)

0         170
1         445
2        1140
3         325
4         285
         ... 
10678     150
10679     155
10680     180
10681     160
10682     500
Length: 10682, dtype: int64

In [164]:
(
    flights['Duration']
    .drop(index = [6474])
    .str.split(" ",expand = True)
    .set_axis(["hour","minutes"],axis=1)
    .assign(
        hour = lambda df_:df_['hour'].str.replace('h', "").astype(int).mul(60)
    )
    .assign(
        minutes = lambda df_:df_['minutes'].str.replace('m',"").fillna(0).astype(int)
    )
    .sum(axis=1)
    .rename("Duration Minures")
    .to_frame()
    .join(flights['Duration'].drop(index = [6474]))
)

Unnamed: 0,Duration Minures,Duration
0,170,2h 50m
1,445,7h 25m
2,1140,19h
3,325,5h 25m
4,285,4h 45m
...,...,...
10678,150,2h 30m
10679,155,2h 35m
10680,180,3h
10681,160,2h 40m


In [176]:
flights['Additional_Info'].unique()

array(['No info', 'In-flight meal not included',
       'No check-in baggage included', '1 Short layover', 'No Info',
       '1 Long layover', 'Change airports', 'Business class',
       'Red-eye flight', '2 Long layover'], dtype=object)

## Cleaning Operations

In [167]:
def convert_to_minutes(ser):
    return (
        ser
        .str.split(" ",expand = True)
        .set_axis(["hour","minutes"],axis=1)
        .assign(
            hour = lambda df_:df_['hour'].str.replace('h', "").astype(int).mul(60)
        )
        .assign(
            minutes = lambda df_:df_['minutes'].str.replace('m',"").fillna(0).astype(int)
        )
        .sum(axis=1)
    )

In [181]:
def clean_data(df):
    return (
        df
        .drop(index = [6474])
        .drop_duplicates()
        .assign(
            **{
                col : df[col].str.strip()
                for col in df.select_dtypes(include='O').columns
            }  
        )
        .assign(
            Airline = lambda df_ : df_['Airline']
                .str.replace(" Premium economy", " ")
                .str.replace(" Business", " ")
                .str.title() ,
            Date_of_Journey = lambda df_ : df_['Date_of_Journey'].astype('datetime64[ns]'),
            Dep_Time = lambda df_ : df_['Dep_Time'].astype('datetime64[ns]').dt.time,
            Arrival_Time = lambda df_ : df_['Arrival_Time'].astype('datetime64[ns]').dt.time,
            Duration = lambda df_:df_['Duration'].pipe(convert_to_minutes),
            Total_Stops = lambda df_ :df_['Total_Stops'].replace("non-stop",0).replace(" stops?","",regex=True).astype(float),
            Additional_Info = lambda df_ :df_['Additional_Info'].replace("No info","No Info")
        )
        .drop(columns = ['Route'])
        .rename(columns = str.lower)
    )

In [183]:
flights_cleaned = clean_data(flights)

### Split the data

In [1]:
flights_final = flights_cleaned.sample(1000)
X = flights_final.drop(columns=['price'])
y = flights_final['price']

X_,X_test,y_,y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train,X_val,y_train,y_val = train_test_split(X_,y_,test_size=0.2,random_state=42)

print(X_train.shape)
print(X_val.shape)
print(X_test.shape)

NameError: name 'flights_cleaned' is not defined

### Export data.csv

In [190]:
def export_path(X,y,name):
    filename = f"{name}.csv"
    file_path = os.path.join(project_dir,data_dir,filename)
    X.join(y).to_csv(file_path,index = False)
    return pd.read_csv(file_path).head()

In [191]:
export_path(X_train,y_train,"train")

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Jet Airways,2019-12-03,Banglore,New Delhi,08:55:00,21:20:00,745,1.0,No Info,13817
1,Indigo,2019-03-06,Mumbai,Hyderabad,01:40:00,03:10:00,90,0.0,No Info,2754
2,Jet Airways,2019-09-05,Kolkata,Banglore,20:00:00,04:40:00,520,1.0,In-flight meal not included,8586
3,Multiple Carriers,2019-05-27,Delhi,Cochin,11:30:00,19:15:00,465,1.0,No Info,15898
4,Indigo,2019-03-18,Chennai,Kolkata,07:55:00,10:15:00,140,0.0,No Info,3850


In [192]:
export_path(X_val,y_val,"val")

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Spicejet,2019-03-04,Banglore,Delhi,09:30:00,12:20:00,170,0.0,No Info,3971
1,Indigo,2019-03-15,Kolkata,Banglore,20:25:00,23:05:00,160,0.0,No Info,4462
2,Indigo,2019-03-03,Kolkata,Banglore,20:25:00,23:05:00,160,0.0,No Info,6038
3,Jet Airways,2019-01-05,Mumbai,Hyderabad,19:35:00,21:05:00,90,0.0,In-flight meal not included,2228
4,Jet Airways,2019-06-06,Banglore,Delhi,07:10:00,10:10:00,180,0.0,In-flight meal not included,6478


In [193]:
export_path(X_test,y_test,"test")

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Jet Airways,2019-09-05,Kolkata,Banglore,20:25:00,10:55:00,870,1.0,In-flight meal not included,8066
1,Jet Airways,2019-03-03,Delhi,Cochin,05:40:00,18:50:00,2230,2.0,No Info,21314
2,Jet Airways,2019-01-03,Banglore,New Delhi,08:00:00,18:25:00,625,1.0,No Info,26890
3,Jet Airways,2019-06-06,Delhi,Cochin,21:25:00,19:00:00,1295,2.0,In-flight meal not included,10588
4,Multiple Carriers,2019-03-21,Delhi,Cochin,07:00:00,13:20:00,380,1.0,No Info,9042


### Ignore this Piece (Practice/Experiment)

In [130]:
hr = flights['Duration'].loc[lambda ser: ser.str.contains('h')].str.split('h').str.get(0).astype(int).fillna(0)

In [131]:
df = pd.DataFrame(index = range(len(flights)))
df['h'] = hr

In [136]:
min = flights['Duration'].loc[lambda ser: ser.str.contains('m')].str.findall((r'(\d+)\s*m')).str.get(0).astype(int)
df['min'] = min

In [142]:
df['total_time'] = df['h'].mul(60).fillna(0) + df['min'].fillna(0)
df['total_time'].isna().sum()

0

In [145]:
df.fillna(0,inplace=True)
df

Unnamed: 0,h,min,total_time
0,2.0,50.0,170.0
1,7.0,25.0,445.0
2,19.0,0.0,1140.0
3,5.0,25.0,325.0
4,4.0,45.0,285.0
...,...,...,...
10678,2.0,30.0,150.0
10679,2.0,35.0,155.0
10680,3.0,0.0,180.0
10681,2.0,40.0,160.0


In [140]:
pd.__version__

'2.2.2'