In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
import warnings
warnings.filterwarnings('ignore')

In [4]:
# Load the flight-fare dataset and show a quick overview of its contents.
df = pd.read_csv('Clean_Dataset.csv')
print("Dataset loaded successfully from CSV")

print(f"Dataset shape: {df.shape}")
print("\nFirst 5 rows:")
display(df.head())

print("\n Dataset Info:")
# DataFrame.info() writes its report itself and returns None; wrapping it in
# print() would append a stray "None" line to the output.
df.info()

print("\n Price Statistics:")
print(df['price'].describe())

Dataset loaded successfully from CSV
Dataset shape: (300153, 12)

First 5 rows:


Unnamed: 0.1,Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,0,SpiceJet,SG-8709,Delhi,Evening,zero,Night,Mumbai,Economy,2.17,1,5953
1,1,SpiceJet,SG-8157,Delhi,Early_Morning,zero,Morning,Mumbai,Economy,2.33,1,5953
2,2,AirAsia,I5-764,Delhi,Early_Morning,zero,Early_Morning,Mumbai,Economy,2.17,1,5956
3,3,Vistara,UK-995,Delhi,Morning,zero,Afternoon,Mumbai,Economy,2.25,1,5955
4,4,Vistara,UK-963,Delhi,Morning,zero,Morning,Mumbai,Economy,2.33,1,5955



 Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300153 entries, 0 to 300152
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Unnamed: 0        300153 non-null  int64  
 1   airline           300153 non-null  object 
 2   flight            300153 non-null  object 
 3   source_city       300153 non-null  object 
 4   departure_time    300153 non-null  object 
 5   stops             300153 non-null  object 
 6   arrival_time      300153 non-null  object 
 7   destination_city  300153 non-null  object 
 8   class             300153 non-null  object 
 9   duration          300153 non-null  float64
 10  days_left         300153 non-null  int64  
 11  price             300153 non-null  int64  
dtypes: float64(1), int64(3), object(8)
memory usage: 27.5+ MB
None

 Price Statistics:
count    300153.000000
mean      20889.660523
std       22697.767366
min        1105.000000
25%        4783.00000

In [None]:
# Work on a copy so the frame loaded from the CSV stays untouched.
df_clean = df.copy()

print("Missing values per column:")
print(df_clean.isnull().sum())

# Impute only the columns that actually contain nulls: object (text) columns
# get the placeholder 'Unknown', every other dtype gets the column median.
for col in df_clean.columns[df_clean.isnull().any()]:
    # Assign the filled column back explicitly. Calling
    # fillna(..., inplace=True) on df_clean[col] acts on an intermediate
    # object and does not update df_clean under pandas Copy-on-Write.
    if df_clean[col].dtype == 'object':
        df_clean[col] = df_clean[col].fillna('Unknown')
    else:
        df_clean[col] = df_clean[col].fillna(df_clean[col].median())

print(" Missing values handled")

def duration_to_minutes(duration):
    """Convert a flight duration to whole minutes.

    Parameters
    ----------
    duration : str or number
        Either a string such as "2h 50m", "5h" or "45m", or a numeric value
        expressed in hours (the dataset's `duration` column is float64,
        e.g. 2.17 for a flight of roughly 2 h 10 min).

    Returns
    -------
    int
        Total minutes. Falls back to 120 when the value is missing,
        non-finite, or cannot be parsed.
    """
    default_minutes = 120  # fallback kept from the original implementation

    # bool is a subclass of int; treat it as "not a duration".
    if isinstance(duration, bool):
        return default_minutes

    # Numeric durations are fractional hours. Previously every non-string
    # value silently became 120, which made the derived feature constant.
    if isinstance(duration, (int, float, np.integer, np.floating)):
        if not np.isfinite(duration):
            return default_minutes
        return int(round(float(duration) * 60))

    if not isinstance(duration, str):
        return default_minutes

    try:
        if 'h' in duration and 'm' in duration:
            hours_part, rest = duration.split('h')[0], duration.split('h')[1]
            return int(hours_part) * 60 + int(rest.split('m')[0])
        if 'h' in duration:
            return int(duration.replace('h', '')) * 60
        return int(duration.replace('m', ''))
    except ValueError:
        # int() rejected the text (e.g. "abc" or ""): use the fallback
        # instead of hiding every possible error behind a bare except.
        return default_minutes

# Flight duration in whole minutes, one value per row.
df_clean['duration_minutes'] = df_clean['duration'].apply(duration_to_minutes)

# The departure/arrival columns of this dataset hold time-of-day labels
# ('Early_Morning', 'Evening', ...) rather than timestamps, so they cannot be
# parsed with pd.to_datetime. Each label is mapped to a representative hour.
# NOTE(review): the hours below assume 4-hour buckets, and 'Late_Night' is
# presumed to be a valid label — confirm both against the dataset description.
TIME_BUCKET_HOURS = {
    'Early_Morning': 6,
    'Morning': 10,
    'Afternoon': 14,
    'Evening': 18,
    'Night': 22,
    'Late_Night': 2,
}


def _hour_of_day(values):
    """Return the hour of day (float, NaN when unknown) for a time column.

    Known time-of-day labels are looked up in TIME_BUCKET_HOURS; anything else
    is parsed as a timestamp, and values that still fail become NaN.
    """
    hours = values.map(TIME_BUCKET_HOURS).astype('float64')
    needs_parse = hours.isna()
    if needs_parse.any():
        parsed = pd.to_datetime(values[needs_parse], errors='coerce')
        hours[needs_parse] = parsed.dt.hour
    return hours


# The dataset names the column 'departure_time'; 'dep_time' is kept as a
# fallback for frames that use the older name.
dep_col = 'departure_time' if 'departure_time' in df_clean.columns else 'dep_time'

df_clean['dep_hour'] = _hour_of_day(df_clean[dep_col])
df_clean['arrival_hour'] = _hour_of_day(df_clean['arrival_time'])

# Peak windows: 07:00-09:59 and 17:00-19:59. NaN hours compare as False.
# With label-level resolution only buckets whose representative hour falls
# inside a window are flagged.
morning_peak = df_clean['dep_hour'].between(7, 9)
evening_peak = df_clean['dep_hour'].between(17, 19)
df_clean['is_peak_hour'] = morning_peak | evening_peak

print(" Time features extracted")

Missing values per column:
Unnamed: 0          0
airline             0
flight              0
source_city         0
departure_time      0
stops               0
arrival_time        0
destination_city    0
class               0
duration            0
days_left           0
price               0
dtype: int64
 Missing values handled
 Time features extracted


In [None]:
# Route identifier built from the two city columns, e.g. 'Delhi-Mumbai'.
df_clean['route'] = df_clean['source_city'] + '-' + df_clean['destination_city']

# Synthetic signals: drawn from a seeded RNG so the notebook is reproducible.
np.random.seed(42)
df_clean['demand_factor'] = np.random.uniform(0.3, 2.5, len(df_clean))

# NOTE(review): competitor_price is simulated as price times a random factor.
# It and the two ratio features derived from it below are functions of
# `price`; if `price` is the prediction target they leak it and should be
# excluded from the model inputs.
df_clean['competitor_price'] = df_clean['price'] * np.random.uniform(0.8, 1.3, len(df_clean))

# departure_time holds time-of-day labels in this dataset, not dates, so a
# strict pd.to_datetime raises. Coerce unparseable values to NaT; they yield
# is_weekend = 0. With label-only data this feature is therefore all zeros
# and carries no information until a real departure date is available.
departure_ts = pd.to_datetime(df_clean['departure_time'], errors='coerce')
df_clean['is_weekend'] = departure_ts.dt.dayofweek.isin([5, 6]).astype(int)

# Grows towards 1 as the departure date approaches; +1 avoids dividing by 0.
df_clean['booking_urgency'] = 1 / (df_clean['days_left'] + 1)

# Number of rows in the dataset that share each route.
route_popularity = df_clean['route'].value_counts().to_dict()
df_clean['route_popularity'] = df_clean['route'].map(route_popularity)

df_clean['price_ratio_vs_competitor'] = df_clean['price'] / df_clean['competitor_price']
df_clean['price_advantage'] = (df_clean['competitor_price'] - df_clean['price']) / df_clean['competitor_price']

print(" Engineered features created:")
new_features = ['duration_minutes', 'demand_factor', 'competitor_price', 'is_peak_hour',
                'is_weekend', 'booking_urgency', 'route_popularity', 'price_ratio_vs_competitor', 'price_advantage']
print(new_features)

 Engineered features created:
['duration_minutes', 'demand_factor', 'competitor_price', 'is_peak_hour', 'is_weekend', 'booking_urgency', 'route_popularity', 'price_ratio_vs_competitor', 'price_advantage']
