# Load Libraries 


In [1]:
import pandas as pd  # For data handling
import numpy as np  # For numerical computations
import matplotlib.pyplot as plt  # For visualization
import seaborn as sns  # For advanced visualizations

from sklearn.model_selection import train_test_split, GridSearchCV  # For model training & hyperparameter tuning
from sklearn.preprocessing import LabelEncoder, StandardScaler  # For data preprocessing
from sklearn.ensemble import RandomForestRegressor  # Random Forest Model
from xgboost import XGBRegressor  # XGBoost Model
from sklearn.linear_model import LinearRegression  # Linear Regression Model
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score  # Model evaluation metrics

# Data collection

In [3]:
airfare_dataset = pd.read_csv("cleaned_data.csv")

In [5]:
airfare_dataset.head(5)

Unnamed: 0.1,Unnamed: 0,airline,flight_code,source_city,time_taken,stop,destinate_city,price,Class,Days_Left,dep_time_category,arr_time_category
0,0,SpiceJet,SG-8709,Delhi,130,non-stop,Mumbai,5953,Economy,1,Evening,Night
1,1,SpiceJet,SG-8157,Delhi,140,non-stop,Mumbai,5953,Economy,1,Early Morning,Morning
2,2,AirAsia,I5-764,Delhi,130,non-stop,Mumbai,5956,Economy,1,Early Morning,Early Morning
3,3,Vistara,UK-995,Delhi,135,non-stop,Mumbai,5955,Economy,1,Morning,Afternoon
4,4,Vistara,UK-963,Delhi,140,non-stop,Mumbai,5955,Economy,1,Morning,Morning


In [6]:
airfare_dataset.shape

(298926, 12)

# Data Preprocessing & Feature Engineering

In [7]:
# Drop unnecessary columns (index column and flight code)
airfare_dataset.drop(columns=['Unnamed: 0', 'flight_code'], inplace=True)

In [9]:
airfare_dataset.isnull().sum()

airline              0
source_city          0
time_taken           0
stop                 0
destinate_city       0
price                0
Class                0
Days_Left            0
dep_time_category    0
arr_time_category    0
dtype: int64

In [12]:
airfare_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 298926 entries, 0 to 298925
Data columns (total 10 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   airline            298926 non-null  object
 1   source_city        298926 non-null  object
 2   time_taken         298926 non-null  int64 
 3   stop               298926 non-null  object
 4   destinate_city     298926 non-null  object
 5   price              298926 non-null  int64 
 6   Class              298926 non-null  object
 7   Days_Left          298926 non-null  int64 
 8   dep_time_category  298926 non-null  object
 9   arr_time_category  298926 non-null  object
dtypes: int64(3), object(7)
memory usage: 22.8+ MB


In [15]:
airfare_dataset.describe()

Unnamed: 0,time_taken,price,Days_Left
count,298926.0,298926.0,298926.0
mean,724.486993,20884.235132,26.036661
std,420.109394,22675.125606,13.550043
min,50.0,1105.0,1.0
25%,405.0,4757.0,15.0
50%,670.0,7424.0,26.0
75%,960.0,42521.0,38.0
max,1815.0,98972.0,49.0


In [17]:
list[airfare_dataset.columns]

list[Index(['airline', 'source_city', 'time_taken', 'stop', 'destinate_city',
       'price', 'Class', 'Days_Left', 'dep_time_category',
       'arr_time_category'],
      dtype='object')]

In [13]:
# splitting 
categorical_columns = airfare_dataset.select_dtypes(include="object")
numerical_columns = airfare_dataset.select_dtypes(include="int64")

In [11]:
categorical_columns

Unnamed: 0,airline,source_city,stop,destinate_city,Class,dep_time_category,arr_time_category
0,SpiceJet,Delhi,non-stop,Mumbai,Economy,Evening,Night
1,SpiceJet,Delhi,non-stop,Mumbai,Economy,Early Morning,Morning
2,AirAsia,Delhi,non-stop,Mumbai,Economy,Early Morning,Early Morning
3,Vistara,Delhi,non-stop,Mumbai,Economy,Morning,Afternoon
4,Vistara,Delhi,non-stop,Mumbai,Economy,Morning,Morning
...,...,...,...,...,...,...,...
298921,Vistara,Chennai,1-stop,Hyderabad,Business,Morning,Evening
298922,Vistara,Chennai,1-stop,Hyderabad,Business,Afternoon,Night
298923,Vistara,Chennai,1-stop,Hyderabad,Business,Early Morning,Night
298924,Vistara,Chennai,1-stop,Hyderabad,Business,Early Morning,Evening


In [14]:
numerical_columns

Unnamed: 0,time_taken,price,Days_Left
0,130,5953,1
1,140,5953,1
2,130,5956,1
3,135,5955,1
4,140,5955,1
...,...,...,...
298921,605,69265,49
298922,625,77105,49
298923,830,79099,49
298924,600,81585,49


# Label Encoding

In [18]:
# Apply Label Encoding for categorical features
le = LabelEncoder()
for col in categorical_columns:
    airfare_dataset[col] = le.fit_transform(airfare_dataset[col])

# Feature & Label splitting

In [19]:
# Define features and target variable
X = airfare_dataset.drop(columns=['price'])  # Independent variables
y = airfare_dataset['price']  # Target variable

# Feature Scaling

In [20]:
# Standardize numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-Test Split & Data Scaling

In [21]:
# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [24]:
print(X.shape, X_train.shape, X_test.shape)

(298926, 9) (239140, 9) (59786, 9)


#  Model Training & Evaluation

In [None]:
# Function to evaluate models
def evaluate_model(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"MAE: {mean_absolute_error(y_test, y_pred)}")
    print(f"MSE: {mean_squared_error(y_test, y_pred)}")
    print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred))}")
    print(f"R2 Score: {r2_score(y_test, y_pred)}")