In [4]:
import pandas as pd
import numpy as np

In [5]:
# Load the raw flight-fare table from a CSV expected in the working directory.
df = pd.read_csv("Flight_Price.csv")
# Preview the first 15 rows to check the columns and their raw string formats.
df.head(15)

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR ? DEL,22:20,3/22/2025 1:10,2h 50m,non-stop,No info,3897
1,Air India,1/5/2019,Kolkata,Banglore,CCU ? IXR ? BBI ? BLR,5:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/6/2019,Delhi,Cochin,DEL ? LKO ? BOM ? COK,9:25,6/10/2025 4:25,19h,2 stops,No info,13882
3,IndiGo,12/5/2019,Kolkata,Banglore,CCU ? NAG ? BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,1/3/2019,Banglore,New Delhi,BLR ? NAG ? DEL,16:50,21:35,4h 45m,1 stop,No info,13302
5,SpiceJet,24/06/2019,Kolkata,Banglore,CCU ? BLR,9:00,11:25,2h 25m,non-stop,No info,3873
6,Jet Airways,12/3/2019,Banglore,New Delhi,BLR ? BOM ? DEL,18:55,3/13/2025 10:25,15h 30m,1 stop,In-flight meal not included,11087
7,Jet Airways,1/3/2019,Banglore,New Delhi,BLR ? BOM ? DEL,8:00,3/2/2025 5:05,21h 5m,1 stop,No info,22270
8,Jet Airways,12/3/2019,Banglore,New Delhi,BLR ? BOM ? DEL,8:55,3/13/2025 10:25,25h 30m,1 stop,In-flight meal not included,11087
9,Multiple carriers,27/05/2019,Delhi,Cochin,DEL ? BOM ? COK,11:25,19:15,7h 50m,1 stop,No info,8625


## Data Preprocessing

In [6]:
# Parse the journey dates into datetimes. dayfirst=True makes day-first the
# preferred reading of ambiguous values such as "1/5/2019" (1 May, not 5 Jan).
df["Date_of_Journey"] = pd.to_datetime(df["Date_of_Journey"], dayfirst=True)

In [7]:
# Render the parsed dates back to zero-padded 'dd-mm-YYYY' text; after this
# line the column holds strings again, not datetimes.
df['Date_of_Journey'] = df['Date_of_Journey'].dt.strftime('%d-%m-%Y')

In [8]:
# Remove rows that repeat an earlier row in every column, keeping the first
# occurrence. Mutates df in place; the surviving rows keep their original index labels.
df.drop_duplicates(inplace=True)

In [9]:
# Discard every row that has at least one missing value (in place).
df.dropna(inplace=True)

In [10]:
# Sanity check: total number of missing cells across all columns
# (should be 0 once dropna has run).
df.isnull().sum().sum()

0

In [11]:
# Replace every literal '?' in the route string with 'to'. regex=False keeps
# '?' from being read as a regex metacharacter.
# NOTE(review): the '?' is presumably a mis-encoded arrow from the source CSV - confirm.
df["Route"] = df["Route"].str.replace('?','to',regex=False)

In [12]:
import pandas as pd

def _dated_dep_time(row):
    """Return the row's Dep_Time, prefixed with a departure date when one can be derived.

    A row whose Arrival_Time contains '/' carries an explicit arrival date.
    For such rows the departure date is taken to be the calendar day before
    that arrival date and is prepended to Dep_Time as 'mm/dd/YYYY HH:MM'.
    Rows without a dated arrival are returned unchanged.

    A Dep_Time that already contains '/' is also returned unchanged, so
    running the cell a second time does not prepend the date twice.

    NOTE(review): the fixed one-day offset assumes no itinerary spans more
    than one midnight - confirm against the Duration column.
    """
    departure = row['Dep_Time']
    arrival_has_date = '/' in str(row['Arrival_Time'])
    already_dated = '/' in str(departure)
    if not arrival_has_date or already_dated:
        return departure

    # normalize() zeroes the time of day so only the date is shifted back.
    departure_day = pd.to_datetime(row['Arrival_Time']).normalize() - pd.Timedelta(days=1)
    return departure_day.strftime('%m/%d/%Y') + ' ' + departure


df['Dep_Time'] = df.apply(_dated_dep_time, axis=1)


In [13]:
# Number of rows per airline, most frequent first.
df["Airline"].value_counts()

Airline
Jet Airways                          3700
IndiGo                               2043
Air India                            1694
Multiple carriers                    1196
SpiceJet                              815
Vistara                               478
Air Asia                              319
GoAir                                 194
Multiple carriers Premium economy      13
Jet Airways Business                    6
Vistara Premium economy                 3
Trujet                                  1
Name: count, dtype: int64

In [14]:
# Number of rows per departure city, most frequent first.
df["Source"].value_counts()

Source
Delhi       4345
Kolkata     2860
Banglore    2179
Mumbai       697
Chennai      381
Name: count, dtype: int64

In [15]:
# Number of rows per destination city, most frequent first.
# NOTE(review): "Delhi" and "New Delhi" are counted (and later encoded) as
# separate destinations - confirm that is intended.
df["Destination"].value_counts()

Destination
Cochin       4345
Banglore     2860
Delhi        1265
New Delhi     914
Hyderabad     697
Kolkata       381
Name: count, dtype: int64

In [16]:
# Number of rows per stop-count label; these labels are what the next cell encodes as integers.
df["Total_Stops"].value_counts()

Total_Stops
1 stop      5625
non-stop    3475
2 stops     1318
3 stops       43
4 stops        1
Name: count, dtype: int64

In [17]:
# Encode the stop-count labels as integers (0 for "non-stop", N for "N stop(s)").
# A dict lookup through .map replaces .replace, whose silent object -> int
# downcast is deprecated in pandas 2.2. Values that are not keys of the mapping
# (e.g. integers left by an earlier run of this cell) pass through unchanged,
# and the explicit astype raises if any label could not be turned into an integer.
stop_label_to_count = {"non-stop": 0, "1 stop": 1, "2 stops": 2, "3 stops": 3, "4 stops": 4}
df["Total_Stops"] = (
    df["Total_Stops"]
    .map(lambda label: stop_label_to_count.get(label, label))
    .astype("int64")
)

In [18]:
# Discard the date/time and free-text columns; only Airline, Source,
# Destination, Duration, Total_Stops and Price of the loaded columns remain.
df = df.drop(
    columns=[
        "Date_of_Journey",
        "Dep_Time",
        "Arrival_Time",
        "Route",
        "Additional_Info",
    ]
)

In [19]:
# One-hot encode the three categorical columns in one call. get_dummies
# removes the listed source columns, prefixes each indicator with its column
# name ("Airline_...", "Source_...", "Destination_...") and appends the
# indicators after the remaining columns in the order listed here.
# drop_first=True omits the first category of each column as the baseline.
df = pd.get_dummies(
    df,
    columns=["Airline", "Source", "Destination"],
    drop_first=True,
    dtype=int,
)

In [20]:
def convert_duration(duration):
    """Convert a duration label such as ``"2h 50m"`` to decimal hours.

    Parameters
    ----------
    duration : str or number
        Text with an optional ``<int>h`` part and/or an optional ``<int>m``
        part, e.g. ``"19h"``, ``"5m"`` or ``"7h 25m"``. A value that is not a
        string is treated as an already-converted number of hours, so
        applying the conversion to an already-converted column is harmless.

    Returns
    -------
    float
        Hours plus minutes / 60, rounded to 2 decimal places.

    Raises
    ------
    ValueError
        If the hour or minute part of a string is not an integer.
    """
    # Already numeric (e.g. the cell was re-run): just normalise the rounding.
    if not isinstance(duration, str):
        return round(float(duration), 2)

    # Text before the first "h" is the hour count; text after the last "h"
    # (the whole string when there is no "h") holds the minutes.
    parts = duration.split("h")
    hours = int(parts[0].strip()) if "h" in duration else 0
    minutes = int(parts[-1].replace("m", "").strip()) if "m" in duration else 0

    return round(hours + minutes / 60, 2)

# Overwrite 'Duration' with its decimal-hour equivalent by running
# convert_duration on every value of the column.
df['Duration'] = df['Duration'].apply(convert_duration)


In [21]:
# Preview the first 15 rows of the encoded table (numeric duration and stops, one-hot indicator columns).
df.head(15)

Unnamed: 0,Duration,Total_Stops,Price,Airline_Air India,Airline_GoAir,Airline_IndiGo,Airline_Jet Airways,Airline_Jet Airways Business,Airline_Multiple carriers,Airline_Multiple carriers Premium economy,...,Airline_Vistara Premium economy,Source_Chennai,Source_Delhi,Source_Kolkata,Source_Mumbai,Destination_Cochin,Destination_Delhi,Destination_Hyderabad,Destination_Kolkata,Destination_New Delhi
0,2.83,0,3897,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,7.42,2,7662,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,19.0,2,13882,0,0,0,1,0,0,0,...,0,0,1,0,0,1,0,0,0,0
3,5.42,1,6218,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,4.75,1,13302,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
5,2.42,0,3873,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
6,15.5,1,11087,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
7,21.08,1,22270,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
8,25.5,1,11087,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
9,7.83,1,8625,0,0,0,0,0,1,0,...,0,0,1,0,0,1,0,0,0,0


In [22]:
# Print each column's dtype and non-null count to confirm that every column is numeric and complete.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10462 entries, 0 to 10682
Data columns (total 23 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   Duration                                   10462 non-null  float64
 1   Total_Stops                                10462 non-null  int64  
 2   Price                                      10462 non-null  int64  
 3   Airline_Air India                          10462 non-null  int32  
 4   Airline_GoAir                              10462 non-null  int32  
 5   Airline_IndiGo                             10462 non-null  int32  
 6   Airline_Jet Airways                        10462 non-null  int32  
 7   Airline_Jet Airways Business               10462 non-null  int32  
 8   Airline_Multiple carriers                  10462 non-null  int32  
 9   Airline_Multiple carriers Premium economy  10462 non-null  int32  
 10  Airline_SpiceJet           

## Training Regression Model