In [1]:
import os
import matplotlib
import sklearn
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_squared_error
from feature_engine.selection import SelectBySingleFeaturePerformance
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
    OneHotEncoder,
    StandardScaler,
    MinMaxScaler,
    PowerTransformer,
    FunctionTransformer
)
from feature_engine.encoding import (
	RareLabelEncoder,
    MeanEncoder,
    CountFrequencyEncoder
)
from feature_engine.datetime import DatetimeFeatures

In [2]:
# Location of the dataset on disk.
# Raw string: in a normal literal "\J" and "\F" are invalid escape sequences,
# which recent Python versions report with a warning. The value is unchanged.
# NOTE(review): this is an absolute, machine-specific path - consider making
# it relative or configurable so the notebook runs elsewhere.
PROJECT_DIR = r'R:\Jaydeep\Flight-Price-Prediction'
DATA_DIR = 'data'  # joined onto PROJECT_DIR by get_dataset
MAIN_DATASET_NAME = 'flight_price'  # file name without the ".csv" extension

In [3]:
def get_dataset(dataset_name):
    """Load ``<PROJECT_DIR>/<DATA_DIR>/<dataset_name>.csv`` as a DataFrame.

    Parameters
    ----------
    dataset_name : str
        File name without the ``.csv`` extension.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV, read with the default ``pd.read_csv`` options.
    """
    csv_path = os.path.join(PROJECT_DIR, DATA_DIR, f'{dataset_name}.csv')
    dataset = pd.read_csv(csv_path)
    return dataset

In [4]:
# Read the main dataset CSV into memory.
df = get_dataset(MAIN_DATASET_NAME)

In [5]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

220

In [6]:
# Total number of missing cells across all columns.
df.isnull().sum().sum()

2

In [7]:
def clean_column_names(df):
    """Return a new frame whose column labels are lower-cased.

    The input frame is left untouched.
    """
    return df.rename(str.lower, axis='columns')

def strip_string_columns(df):
    """Strip leading/trailing whitespace from every object-dtype column.

    The columns are overwritten on ``df`` itself (in-place mutation) and the
    same frame is returned so the call can be chained.
    """
    for name in df.select_dtypes(include='O').columns:
        stripped = df[name].str.strip()
        df[name] = stripped
    return df

def clean_airline_names(df):
    """Normalise the ``airline`` column in place and return ``df``.

    The " Premium economy" and " Business" suffixes are removed (in that
    order) and the remaining name is converted to title case.
    """
    airline = df['airline']
    for suffix in (" Premium economy", " Business"):
        airline = airline.str.replace(suffix, "")
    df['airline'] = airline.str.title()
    return df

def convert_dates(df):
    """Parse ``date_of_journey`` as day-first dates, in place, and return ``df``."""
    parsed = pd.to_datetime(df['date_of_journey'], dayfirst=True)
    df['date_of_journey'] = parsed
    return df

def convert_times(df):
    """Reduce ``dep_time`` and ``arrival_time`` to time-of-day values.

    Each column is parsed with ``format='mixed'`` (entries may use differing
    layouts) and only the ``.dt.time`` part is kept. The columns are
    overwritten on ``df`` and the same frame is returned.
    """
    for column in ('dep_time', 'arrival_time'):
        df[column] = pd.to_datetime(df[column], format='mixed').dt.time
    return df

def convert_duration(df):
    """Replace the textual ``duration`` column with ``duration_minute``.

    Accepted layouts are an optional hour part followed by an optional minute
    part, e.g. ``"2h 50m"``, ``"19h"`` or ``"45m"``. The previous
    split-on-space implementation crashed on minute-only values and on
    columns where no entry had a minute part; it also added the new column to
    the caller's frame as a side effect.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``duration`` column.

    Returns
    -------
    pandas.DataFrame
        New frame without ``duration`` and with an integer
        ``duration_minute`` column (total minutes). ``df`` is not modified.

    Raises
    ------
    ValueError
        If a value contains neither an hour nor a minute part (this includes
        missing values).
    """
    parts = df['duration'].str.extract(
        r'^\s*(?:(?P<hour>\d+)\s*h)?\s*(?:(?P<minute>\d+)\s*m)?\s*$'
    )

    # A row with neither group captured is either malformed or missing.
    unparsed = parts.isna().all(axis=1)
    if unparsed.any():
        bad_values = df.loc[unparsed, 'duration'].unique().tolist()
        raise ValueError(f"Unrecognised duration value(s): {bad_values[:5]}")

    # A missing part counts as zero, e.g. "19h" -> 19 hours, 0 minutes.
    parts = parts.fillna("0").astype(int)
    duration_minute = parts['hour'].mul(60).add(parts['minute'])

    return df.assign(duration_minute=duration_minute).drop(columns=['duration'])

def convert_total_stops(df):
    """Turn the textual ``total_stops`` column into numbers, in place.

    "non-stop" becomes 0; for other values the trailing " stop" / " stops"
    is removed before numeric conversion. Returns the same ``df``.
    """
    stops = df['total_stops'].replace("non-stop", "0")
    stops = stops.str.replace(" stops?", "", regex=True)
    df['total_stops'] = pd.to_numeric(stops)
    return df

def lower_additional_info(df):
    """Lower-case the ``additional_info`` column in place and return ``df``."""
    lowered = df['additional_info'].str.lower()
    df['additional_info'] = lowered
    return df

def preprocess_df(df):
    """Run the full cleaning sequence on the raw flight frame.

    Expects the raw column names (``Duration`` and ``Route`` are referenced
    before the labels are lower-cased).

    Steps, in order:
      1. drop rows whose ``Duration`` is exactly "5m" and drop ``Route``
         (presumably the "5m" rows are bad records - TODO confirm);
      2. drop duplicate rows and rows with missing values;
      3. apply the column-level cleaning helpers;
      4. drop duplicates / missing values again, since the cleaning can make
         previously distinct rows identical.

    Returns the cleaned frame.
    """
    five_minute_rows = df.loc[df['Duration'].isin(['5m'])].index

    cleaned = (
        df
        .drop(index=five_minute_rows, columns=['Route'])
        .drop_duplicates()
        .dropna()
    )

    cleaning_steps = (
        clean_column_names,
        strip_string_columns,
        clean_airline_names,
        convert_dates,
        convert_times,
        convert_duration,
        convert_total_stops,
        lower_additional_info,
    )
    for step in cleaning_steps:
        cleaned = step(cleaned)

    return cleaned.drop_duplicates().dropna()

In [8]:
# Replace the raw frame with its cleaned version.
# NOTE: this cell is not re-runnable on its own - preprocess_df looks up the
# raw 'Duration' and 'Route' columns, which no longer exist after cleaning.
# Re-run the loading cell first.
df = preprocess_df(df)

In [9]:
# Fix the column order used from here on: nine predictor columns followed by
# the 'price' column, which is split off as the target further down.
df = df[['airline',
 'date_of_journey',
 'source',
 'destination',
 'dep_time',
 'arrival_time',
 'total_stops',
 'additional_info',
 'duration_minute',
 'price']]

In [10]:
# Separate the target from the predictors.
y = df['price'].copy()
X = df.drop(columns='price')

# Hold out 20% as the test set, then 20% of the remainder for validation.
# Both splits use the same fixed seed.
X_, X_test, y_, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_, y_, test_size=0.2, random_state=42)

# Report the shape of each split: train, validation, test.
for features, target in ((X_train, y_train), (X_val, y_val), (X_test, y_test)):
    print(features.shape, target.shape)

(6693, 9) (6693,)
(1674, 9) (1674,)
(2092, 9) (2092,)


In [11]:
def get_metrics(model, X, y):
    """Print R^2, adjusted R^2 and RMSE of ``model`` on ``(X, y)``.

    Parameters
    ----------
    model : object with a ``predict(X)`` method
    X : two-dimensional feature container exposing ``.shape``
    y : true target values, same length as ``X``

    Returns
    -------
    None
        The metrics are printed, not returned.

    Notes
    -----
    The adjusted R^2 uses ``p = X.shape[1]``, the number of columns of ``X``
    as passed in.
    NOTE(review): when ``model`` is a pipeline that expands the columns, this
    is the raw column count, not the number of features the estimator sees -
    confirm that is intended.
    """
    y_pred = model.predict(X)
    r2 = r2_score(y, y_pred)

    n = len(y)
    p = X.shape[1]
    residual_dof = n - p - 1
    if residual_dof > 0:
        adj_r2 = 1 - (1 - r2) * (n - 1) / residual_dof
    else:
        # Adjusted R^2 is undefined without spare degrees of freedom; report
        # NaN rather than dividing by zero or printing a meaningless number.
        adj_r2 = float('nan')

    rmse = np.sqrt(mean_squared_error(y, y_pred))

    print(f'R^2: {r2:.4f}')
    print(f'Adjusted R^2: {adj_r2:.4f}')
    print(f'RMSE: {rmse:.4f}')

In [12]:
# Airline: rare labels are grouped into 'other' (tol=0.1 is the frequency
# threshold passed to RareLabelEncoder), then one-hot encoded. Categories not
# seen during fit are ignored at transform time (handle_unknown='ignore').
airline_transformer = Pipeline(steps=[
    ('grouper', RareLabelEncoder(tol=0.1, replace_with='other', n_categories=2)),
    ('encoder', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
])

# Journey date: extract month, day of week and day of year, then min-max scale.
# (The variable name is spelled "tranformer"; column_transformer refers to it
# by this exact name.)
date_tranformer = Pipeline(steps=[
    ('date_to_features', DatetimeFeatures(features_to_extract=['month', 'day_of_week', 'day_of_year'], yearfirst=True, format='mixed')),
    ('min_max_scaler', MinMaxScaler())
])

# Source / destination: group rare labels, mean-encode the result, then apply
# a power transform. MeanEncoder is target-based, so this pipeline needs the
# target during fit.
location_transformer = Pipeline(steps=[
    ('grouper', RareLabelEncoder(tol=0.1, replace_with='other', n_categories=2)),
    ('mean_encoder', MeanEncoder()),
    ('power_transformer', PowerTransformer())
])

def is_north(X):
    """Flag, for every column of ``X``, whether the value is a "north" city.

    For each input column ``c`` the result has an integer column
    ``c_is_north`` holding 1 where the value is one of Delhi, Kolkata,
    Mumbai or New Delhi, and 0 otherwise. The original columns are not part
    of the result and ``X`` itself is not modified.
    """
    north_cities = {"Delhi", "Kolkata", "Mumbai", "New Delhi"}
    original_columns = X.columns.to_list()

    flags = {}
    for col in original_columns:
        flags[f"{col}_is_north"] = X.loc[:, col].isin(north_cities).astype(int)

    return X.assign(**flags).drop(columns=original_columns)


# Location features: the encoded source/destination columns from
# location_transformer side by side with the is_north flag columns.
location_union_transformer = FeatureUnion(transformer_list=[
	("location_transformer", location_transformer),
	("is_north_transformer", FunctionTransformer(func=is_north))
])

# Departure / arrival time: extract the hour and minute, then min-max scale.
time_transformer = Pipeline(
    steps = [
        ('dt', DatetimeFeatures(features_to_extract=['hour', 'minute'])),
        ('scaler', MinMaxScaler())
    ]
)

def part_of_day(X, start=0 , mid=8, end=16):
    """Label each time column of ``X`` with a coarse part of the day.

    For every input column ``c`` the hour is extracted and a column
    ``c_part_of_day`` is produced with the value

    * ``'start'`` when ``start <= hour < mid``,
    * ``'mid'``   when ``mid <= hour < end``,
    * ``'end'``   otherwise.

    Only the new label columns are returned; ``X`` is not modified.
    """
    original_columns = X.columns.to_list()

    labels = {}
    for col in original_columns:
        hours = pd.to_datetime(X.loc[:, col]).dt.hour
        in_start = hours.between(start, mid, inclusive='left')
        in_mid = hours.between(mid, end, inclusive='left')
        labels[f'{col}_part_of_day'] = np.select(
            [in_start, in_mid],
            choicelist=['start', 'mid'],
            default='end',
        )

    return X.assign(**labels).drop(columns=original_columns)

# Part-of-day labels ('start' / 'mid' / 'end') from part_of_day, passed through
# CountFrequencyEncoder and then min-max scaled.
part_of_day_transformer = Pipeline(
    steps = [
        ('part_of_day_func', FunctionTransformer(func=part_of_day)),
        ('count_fre_encoder', CountFrequencyEncoder()),
        ('min_max_scaler', MinMaxScaler())
    ]
)

# Time features: scaled hour/minute columns side by side with the encoded
# part-of-day columns.
time_union_transformer = FeatureUnion(
    transformer_list=[
        ('time_transformer', time_transformer),
        ('part_of_day_transformer', part_of_day_transformer)
    ]
)

# Natural-log transform applied to whatever columns it is given.
# NOTE(review): np.log gives -inf for 0 and NaN for negative input - assumes
# every duration is strictly positive; confirm against the data.
duration_log_transformer = FunctionTransformer(func=np.log)

def is_direct_flight(X):
    """Return a copy of ``X`` with an added ``is_direct_flight`` column.

    The new column is 1 where ``total_stops`` equals 0 and 0 otherwise; the
    existing columns are kept.
    """
    direct_flag = X.total_stops.eq(0).astype(int)
    with_flag = X.assign(is_direct_flight=direct_flag)
    return with_flag

# Keeps total_stops and appends the is_direct_flight indicator column.
total_stops_transformer = FunctionTransformer(func=is_direct_flight)

# Route each raw column to its feature pipeline.
# 'additional_info' is not listed in any transformer; because `remainder` is
# left at ColumnTransformer's default ('drop'), that column does not reach
# the model.
column_transformer = ColumnTransformer(transformers=[
    ('airline_transformer', airline_transformer, ['airline']),
    ('date_transformer', date_tranformer, ['date_of_journey']),
    ('location_union_transformer', location_union_transformer, ['source', 'destination']),
    ('time_union_transformer', time_union_transformer, ['dep_time', 'arrival_time']),
    ('duration_log_transformer', duration_log_transformer, ['duration_minute']),
    ('total_stops_trasformer', total_stops_transformer, ['total_stops'])
])

In [13]:
# The cleaning step leaves dep_time / arrival_time as time-of-day objects;
# turn them into datetime64 columns for the time feature pipelines.
# One loop replaces six copy-pasted statements. The dtype guard makes the cell
# safe to re-run: converting an already-converted column again would raise,
# because its string form no longer matches '%H:%M:%S'.
for _frame in (X_train, X_val, X_test):
    for _col in ('dep_time', 'arrival_time'):
        if pd.api.types.is_datetime64_any_dtype(_frame[_col]):
            continue
        _frame[_col] = pd.to_datetime(_frame[_col].astype(str), format='%H:%M:%S')

In [14]:
# Inspect dtypes and non-null counts of the validation split.
X_val.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1674 entries, 494 to 10080
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   airline          1674 non-null   object        
 1   date_of_journey  1674 non-null   datetime64[ns]
 2   source           1674 non-null   object        
 3   destination      1674 non-null   object        
 4   dep_time         1674 non-null   datetime64[ns]
 5   arrival_time     1674 non-null   datetime64[ns]
 6   total_stops      1674 non-null   int64         
 7   additional_info  1674 non-null   object        
 8   duration_minute  1674 non-null   int64         
dtypes: datetime64[ns](3), int64(2), object(4)
memory usage: 130.8+ KB


In [15]:
# Candidate regressors, compared with their default hyper-parameters.
# The tree-based estimators are randomised, so they get a fixed seed; without
# it the reported scores change on every run.
models = [
    LinearRegression(),
    SVR(),
    KNeighborsRegressor(),
    DecisionTreeRegressor(random_state=42),
    RandomForestRegressor(random_state=42),
    GradientBoostingRegressor(random_state=42),
]

In [16]:
# Fit every candidate behind the same preprocessing and report its metrics on
# the validation split.
# NOTE(review): the same column_transformer object is passed to every
# Pipeline, so it is presumably re-fitted in place on each iteration - confirm
# that sharing it is intended.
for model in models:
    pipeline = Pipeline(steps=[
    ('preprocessor', column_transformer),  # shared feature-engineering step
    ('model', model)  # the candidate estimator for this iteration
    ])
    pipeline.fit(X_train, y_train)
    print(f'Model Name: {model}')
    get_metrics(pipeline, X_val, y_val)
    print('=============================')

Model Name: LinearRegression()
R^2: 0.5728
Adjusted R^2: 0.5705
RMSE: 3044.2407
Model Name: SVR()
R^2: 0.0520
Adjusted R^2: 0.0469
RMSE: 4534.6868
Model Name: KNeighborsRegressor()
R^2: 0.6126
Adjusted R^2: 0.6106
RMSE: 2898.7150
Model Name: DecisionTreeRegressor()
R^2: 0.6634
Adjusted R^2: 0.6616
RMSE: 2701.9897
Model Name: RandomForestRegressor()
R^2: 0.7961
Adjusted R^2: 0.7950
RMSE: 2102.9392
Model Name: GradientBoostingRegressor()
R^2: 0.7757
Adjusted R^2: 0.7745
RMSE: 2205.8532


In [17]:
# Final model: the shared preprocessing followed by a random forest.
# NOTE(review): the hyper-parameters presumably come from a search that is
# not part of this notebook - confirm their origin.
# random_state is fixed so the reported metrics are reproducible.
pipeline = Pipeline(steps=[
    ('preprocessor', column_transformer),
    ('RandomForestRegressor', RandomForestRegressor(
        max_depth=20,
        max_features=0.5,
        n_estimators=150,
        random_state=42,
    ))
])


In [18]:
# Fit on the training split only, then report the same metrics on all three
# splits.
pipeline.fit(X_train, y_train)

evaluation_sets = (
    ('==========Train DATASET==========', X_train, y_train),
    ('==========VALIDATION DATASET==========', X_val, y_val),
    ('==========TEST DATASET==========', X_test, y_test),
)
for banner, features, target in evaluation_sets:
    print()
    print(banner)
    get_metrics(pipeline, features, target)


R^2: 0.9522
Adjusted R^2: 0.9522
RMSE: 1014.7369

R^2: 0.8060
Adjusted R^2: 0.8049
RMSE: 2051.5594

R^2: 0.8008
Adjusted R^2: 0.8000
RMSE: 2025.2181
