In [3]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn import metrics
import pickle

# Suppress warnings for cleaner output
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load dataset
print("Loading dataset...\n\n")
train_data = pd.read_excel('Flight Dataset/Data_Train.xlsx')

# Display initial rows and structure
print("\nFirst few rows of the dataset:")
print(train_data.head())
print("\nDataset info:")
train_data.info()

In [6]:
# Check for null values
print("\nChecking for missing values:")
print(train_data.isnull().sum())


Checking for missing values:
Airline            0
Date_of_Journey    0
Source             0
Destination        0
Route              1
Dep_Time           0
Arrival_Time       0
Duration           0
Total_Stops        1
Additional_Info    0
Price              0
dtype: int64


In [None]:
# Drop missing values
train_data.dropna(inplace=True)
print("\nDataset after dropping missing values:")
train_data.info()
print("\nFirst few rows after dropping missing values:")
print(train_data.head())

In [None]:
# Preserve original 'Airline', 'Source', 'Destination', and 'Price' for visualization
airline_price_data = train_data[['Airline', 'Price']].copy()
source_price_data = train_data[['Source', 'Price']].copy()
destination_price_data = train_data[['Destination', 'Price']].copy()

In [None]:
# Normalize 'New Delhi' in Source and Destination
def normalize_city(city):
    return "Delhi" if city == "New Delhi" else city

train_data['Source'] = train_data['Source'].apply(normalize_city)
train_data['Destination'] = train_data['Destination'].apply(normalize_city)
print("\nDataset after normalizing 'New Delhi':")
print(train_data.head())

In [None]:
# Feature extraction: Journey day and month
train_data['Journey_day'] = pd.to_datetime(train_data['Date_of_Journey'], format='%d/%m/%Y').dt.day
train_data['Journey_month'] = pd.to_datetime(train_data['Date_of_Journey'], format='%d/%m/%Y').dt.month
train_data.drop('Date_of_Journey', axis=1, inplace=True)
print("\nDataset after extracting Journey_day and Journey_month:")
print(train_data.head())

In [None]:
# Departure time (hour and minute)
train_data['Dep_hour'] = pd.to_datetime(train_data['Dep_Time']).dt.hour
train_data['Dep_min'] = pd.to_datetime(train_data['Dep_Time']).dt.minute
train_data.drop('Dep_Time', axis=1, inplace=True)
print("\nDataset after extracting Dep_hour and Dep_min:")
print(train_data.head())

In [None]:
# Arrival time (hour and minute)
train_data['Arrival_hour'] = pd.to_datetime(train_data['Arrival_Time']).dt.hour
train_data['Arrival_min'] = pd.to_datetime(train_data['Arrival_Time']).dt.minute
train_data.drop('Arrival_Time', axis=1, inplace=True)
print("\nDataset after extracting Arrival_hour and Arrival_min:")
print(train_data.head())

In [None]:
# Duration (hour and minute)
print("\nProcessing duration feature...")
duration = list(train_data['Duration'])
for i in range(len(duration)):
    if "h" not in duration[i]:
        duration[i] = "0h " + duration[i]
    if "m" not in duration[i]:
        duration[i] += " 0m"

train_data['Duration_hours'] = [int(d.split()[0][:-1]) for d in duration]
train_data['Duration_mins'] = [int(d.split()[1][:-1]) for d in duration]
train_data.drop('Duration', axis=1, inplace=True)
print("\nDataset after processing Duration:")
print(train_data.head())

In [None]:
# Replace Total_Stops with numerical values
train_data['Total_Stops'] = train_data['Total_Stops'].replace({
    'non-stop': 0, '1 stop': 1, '2 stops': 2, '3 stops': 3, '4 stops': 4
})
print("\nDataset after encoding Total_Stops:")
print(train_data.head())

In [None]:
# Categorical encoding
print("\nEncoding categorical variables...")
airline = pd.get_dummies(train_data['Airline'], drop_first=True)
source = pd.get_dummies(train_data['Source'], drop_first=True)
destination = pd.get_dummies(train_data['Destination'], drop_first=True)

In [None]:
# Drop unnecessary columns
train_data.drop(['Route', 'Additional_Info', 'Airline', 'Source', 'Destination'], axis=1, inplace=True)
print("\nDataset after dropping unnecessary columns:")
print(train_data.head())

In [None]:
# Combine data with encoded features
final_data = pd.concat([train_data, airline, source, destination], axis=1)
print("\nFinal dataset after combining with one-hot encoded variables:")
print(final_data.head())

In [None]:
# Splitting data into features and target
X = final_data.drop('Price', axis=1)
y = final_data['Price']

In [None]:
# Visualizations and analysis
print("\nAnalyzing the dataset with graphs...")

In [None]:
# Airline price distribution using preserved data
plt.figure(figsize=(12, 6))
sns.boxplot(x='Airline', y='Price', data=airline_price_data)
plt.title('Airline vs Price')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Source vs Price using preserved data
plt.figure(figsize=(8, 6))
sns.boxplot(x='Source', y='Price', data=source_price_data)
plt.title('Source vs Price')
plt.show()

In [None]:
# Destination vs Price using preserved data
plt.figure(figsize=(10, 6))
sns.boxplot(x='Destination', y='Price', data=destination_price_data)
plt.title('Destination vs Price')
plt.show()

In [None]:
# Heatmap for correlations
plt.figure(figsize=(12, 10))
sns.heatmap(final_data.corr(), annot= False, cmap='coolwarm')
plt.title('Feature Correlation Heatmap')
plt.show()

In [None]:
# Feature importance using ExtraTreesRegressor
print("\nEvaluating feature importance...")
extra_tree = ExtraTreesRegressor()
extra_tree.fit(X, y)
feature_importances = pd.Series(extra_tree.feature_importances_, index=X.columns)
print("\nTop 20 Feature Importances:")
print(feature_importances.nlargest(20))

In [None]:
# Plot feature importance
plt.figure(figsize=(12, 8))
feature_importances.nlargest(20).plot(kind='barh')
plt.title('Top 20 Feature Importances')
plt.show()

In [None]:
# Splitting dataset into training and testing sets
print("\nSplitting dataset into training and testing sets...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
print(f"Training data shape: {X_train.shape}, Test data shape: {X_test.shape}")

In [None]:
# Hyperparameter tuning for Random Forest using RandomizedSearchCV
print("\nPerforming hyperparameter tuning for Random Forest...")
n_estimators = [int(x) for x in np.linspace(start=100, stop=1200, num=12)]
max_features = ['log2', 'sqrt']
max_depth = [int(x) for x in np.linspace(5, 30, num=6)]
min_samples_split = [2, 5, 10, 15, 100]
min_samples_leaf = [1, 2, 5, 10]

random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf
}

rf = RandomForestRegressor(random_state=42)
rf_random = RandomizedSearchCV(
    estimator=rf, 
    param_distributions=random_grid, 
    scoring='neg_mean_squared_error', 
    n_iter=10, 
    cv=5, 
    verbose=2, 
    random_state=42, 
    n_jobs=-1
)

In [None]:
# Train the model
print("\nTraining the Random Forest model with hyperparameter tuning...")
rf_random.fit(X_train, y_train)
print("Best parameters for Random Forest:", rf_random.best_params_)

In [None]:
# Model evaluation
print("\nEvaluating the model...")
predictions = rf_random.predict(X_test)
r2 = metrics.r2_score(y_test, predictions)
print(f"R2 Score: {r2}")

In [None]:
# Residual plot
plt.figure(figsize=(8, 6))
sns.histplot(y_test - predictions, kde=True, bins=30)
plt.title('Residual Distribution')
plt.show()

In [None]:
# Actual vs Predicted
plt.figure(figsize=(8, 6))
plt.scatter(y_test, predictions, alpha=0.5)
plt.xlabel('Actual Prices')
plt.ylabel('Predicted Prices')
plt.title('Actual vs Predicted Prices')
plt.show()

In [7]:
# Save the model
print("\nSaving the model...")
with open('Trained_Model.pkl', 'wb') as file:
    pickle.dump(rf_random, file)
print("Model saved successfully as 'flight_rf.pkl'.")


Saving the model...
Model saved successfully as 'flight_rf.pkl'.


In [8]:
import pandas as pd
import pickle

# Load the trained model
model = pickle.load(open('Trained_Model.pkl', 'rb'))

# Sample input data (you can modify these values for testing)
dep_time = '2024-11-20T15:00'
arrival_time = '2024-11-20T17:30'
Total_stops = 1
airline = 'IndiGo'
Source = 'Delhi'
Destination = 'Cochin'

# Convert Dep_Time and Arrival_Time to relevant time features
Journey_day = pd.to_datetime(dep_time, format="%Y-%m-%dT%H:%M").day
Journey_month = pd.to_datetime(dep_time, format="%Y-%m-%dT%H:%M").month
Departure_hour = pd.to_datetime(dep_time, format="%Y-%m-%dT%H:%M").hour
Departure_min = pd.to_datetime(dep_time, format="%Y-%m-%dT%H:%M").minute

Arrival_hour = pd.to_datetime(arrival_time, format="%Y-%m-%dT%H:%M").hour
Arrival_min = pd.to_datetime(arrival_time, format="%Y-%m-%dT%H:%M").minute

# Calculate duration
dur_hour = abs(Arrival_hour - Departure_hour)
dur_min = abs(Arrival_min - Departure_min)

# Create dictionaries for airline, source, and destination
airline_dict = {
    'Jet Airways': [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    'IndiGo': [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    'Air India': [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
    'Multiple carriers': [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
    'SpiceJet': [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
    'Vistara': [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
    'GoAir': [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
    'Multiple carriers Premium economy': [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
    'Jet Airways Business': [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
    'Vistara Premium economy': [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
    'Trujet': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
}

source_dict = {
    'Delhi': [1, 0, 0, 0],
    'Kolkata': [0, 1, 0, 0],
    'Mumbai': [0, 0, 1, 0],
    'Chennai': [0, 0, 0, 1]
}

destination_dict = {
    'Cochin': [1, 0, 0, 0],
    'Delhi': [0, 1, 0, 0],
    'Hyderabad': [0, 0, 1, 0],
    'Kolkata': [0, 0, 0, 1],
    'Banglore': [0, 0, 0, 0]  # Assuming Bangalore isn't mapped
}

# Get the airline, source, and destination values from the input
airline_data = airline_dict.get(airline, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
source_data = source_dict.get(Source, [0, 0, 0, 0])
destination_data = destination_dict.get(Destination, [0, 0, 0, 0])

# Prepare the feature vector for prediction (X_test)
X_test = [
    Total_stops, Journey_day, Journey_month, Departure_hour, Departure_min,
    Arrival_hour, Arrival_min, dur_hour, dur_min,
    *airline_data, *source_data, *destination_data
]

# Test the model prediction
output = model.predict([X_test])

# Print the predicted price 
print(f"Predicted Price: ₹{round(output[0], 2)}")


Predicted Price: ₹6441.79
