In [14]:
# -----------------------------
# 1. Import libraries
# -----------------------------
import pandas as pd

In [15]:
# -----------------------------
# 2. Load dataset
# -----------------------------
df = pd.read_csv('Clean_Dataset_Final.csv')
print(df.head())

    airline   flight source_city departure_time stops   arrival_time  \
0  SpiceJet  SG-8709       Delhi        Evening  zero          Night   
1  SpiceJet  SG-8157       Delhi  Early_Morning  zero        Morning   
2   AirAsia   I5-764       Delhi  Early_Morning  zero  Early_Morning   
3   Vistara   UK-995       Delhi        Morning  zero      Afternoon   
4   Vistara   UK-963       Delhi        Morning  zero        Morning   

  destination_city    class  duration  days_left  price  
0           Mumbai  Economy      2.17          1   5953  
1           Mumbai  Economy      2.33          1   5953  
2           Mumbai  Economy      2.17          1   5956  
3           Mumbai  Economy      2.25          1   5955  
4           Mumbai  Economy      2.33          1   5955  


In [16]:
# -----------------------------
# 3. Drop unnecessary column
# -----------------------------
df = df.drop(['flight'], axis=1) 

In [17]:
# -----------------------------
# 4. Explore categorical values
# -----------------------------
print(df.airline.value_counts())
print(df.source_city.value_counts())
print(df.destination_city.value_counts())
print(df.departure_time.value_counts())
print(df.arrival_time.value_counts())
print(df.stops.value_counts())
print(df['class'].value_counts())

print("Min duration:", df['duration'].min())
print("Max duration:", df['duration'].max())
print("Median duration:", df['duration'].median())

airline
Vistara      127859
Air_India     80892
Indigo        43120
GO_FIRST      23173
AirAsia       16098
SpiceJet       9011
Name: count, dtype: int64
source_city
Delhi        61343
Mumbai       60896
Bangalore    52061
Kolkata      46347
Hyderabad    40806
Chennai      38700
Name: count, dtype: int64
destination_city
Mumbai       59097
Delhi        57360
Bangalore    51068
Kolkata      49534
Hyderabad    42726
Chennai      40368
Name: count, dtype: int64
departure_time
Morning          71146
Early_Morning    66790
Evening          65102
Night            48015
Afternoon        47794
Late_Night        1306
Name: count, dtype: int64
arrival_time
Night            91538
Evening          78323
Morning          62735
Afternoon        38139
Early_Morning    15417
Late_Night       14001
Name: count, dtype: int64
stops
one            250863
zero            36004
two_or_more     13286
Name: count, dtype: int64
class
Economy     206666
Business     93487
Name: count, dtype: int64
Min duration:

In [18]:
# -----------------------------
# 5. Encode categorical features
# -----------------------------
# Encode class (Business = 1, Economy = 0)
df['class'] = df['class'].apply(lambda x: 1 if x == 'Business' else 0)

# Label-encode stops
df['stops'] = pd.factorize(df['stops'])[0]

# One-hot encoding for other categories
df = df.join(pd.get_dummies(df['airline'], prefix='airline')).drop('airline', axis=1)
df = df.join(pd.get_dummies(df['source_city'], prefix='source')).drop('source_city', axis=1)
df = df.join(pd.get_dummies(df['destination_city'], prefix='dest')).drop('destination_city', axis=1)
df = df.join(pd.get_dummies(df['arrival_time'], prefix='arrival')).drop('arrival_time', axis=1)
df = df.join(pd.get_dummies(df['departure_time'], prefix='departure')).drop('departure_time', axis=1)

In [19]:
# -----------------------------
# 6. Fix datatype issues
# -----------------------------
cols = [
    'airline_AirAsia','airline_Air_India','airline_GO_FIRST','airline_Indigo',
    'airline_SpiceJet','airline_Vistara','arrival_Evening','arrival_Late_Night',
    'arrival_Morning','arrival_Night','departure_Afternoon','departure_Early_Morning',
    'departure_Evening','departure_Late_Night','departure_Morning','departure_Night'
]

df[cols] = df[cols].astype(int)


In [20]:
# -----------------------------
# 7. Final processed dataset
# -----------------------------
print(df.info())
df.to_csv("Processed_Dataset.csv", index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300153 entries, 0 to 300152
Data columns (total 35 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   stops                    300153 non-null  int64  
 1   class                    300153 non-null  int64  
 2   duration                 300153 non-null  float64
 3   days_left                300153 non-null  int64  
 4   price                    300153 non-null  int64  
 5   airline_AirAsia          300153 non-null  int32  
 6   airline_Air_India        300153 non-null  int32  
 7   airline_GO_FIRST         300153 non-null  int32  
 8   airline_Indigo           300153 non-null  int32  
 9   airline_SpiceJet         300153 non-null  int32  
 10  airline_Vistara          300153 non-null  int32  
 11  source_Bangalore         300153 non-null  bool   
 12  source_Chennai           300153 non-null  bool   
 13  source_Delhi             300153 non-null  bool   
 14  sour