In [7]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None) #For displaying all the columns of dataset

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
import plotly.express as px

import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


from sklearn.model_selection import RandomizedSearchCV
import pickle
from dev.dev_run_v0 import load_data
import joblib
from dev.preprocessing import preprocess_data
import requests
import json



In [8]:
# Loopback address shared by the three service endpoints used below.
_SERVICE_HOST = "http://127.0.0.1"

# Each pipeline stage (preprocess, train, predict) is exposed on its own port.
PREPROCESS_URL = f"{_SERVICE_HOST}:5002/preprocess"
TRAIN_URL = f"{_SERVICE_HOST}:5003/train"
PREDICT_URL = f"{_SERVICE_HOST}:5004/predict"

Preprocess data

In [9]:
# Sample itinerary posted to the preprocessing endpoint.
input_data = {
    "Date_of_Journey": "12/03/2024",
    "Airline": "Indigo",
    "Source": "Delhi",
    "Destination": "Mumbai"
}

# requests has no default timeout: without one, an unreachable service would
# block this cell forever. (connect, read) limits are given in seconds.
response = requests.post(PREPROCESS_URL, json=input_data, timeout=(5, 30))

# The decoded JSON body is kept for the prediction step further down, which
# inspects its "status" field before using "processed_data".
preprocessed_data = response.json()
print("Preprocessed Data:", preprocessed_data)


Preprocessed Data: {'processed_data': [[2.42006373200812, 0.04030142261776561, 3.0932322705876767, 12, 3]], 'status': 'Success'}


Train a Model

In [10]:
# Trigger a training run on the training endpoint (no request body is sent).
# A timeout is set so a dead service cannot hang the kernel; the read limit is
# generous because the response only arrives once training has finished.
response = requests.post(TRAIN_URL, timeout=(5, 600))
train_status = response.json()
print("Training Status:", train_status)

Training Status: {'MAE': 1550.9828692286362, 'message': 'Model trained successfully!', 'status': 'Success'}


Predict Flight Price

In [11]:
# Only call the prediction endpoint when preprocessing reported success.
# `.get` is used so that a response missing the "status" key is treated as a
# failure instead of raising KeyError.
if preprocessed_data.get("status") == "Success":
    prediction_input = {"processed_data": preprocessed_data["processed_data"]}
    # Bounded wait (connect, read) in seconds so an unreachable service fails fast.
    response = requests.post(PREDICT_URL, json=prediction_input, timeout=(5, 30))
    prediction_result = response.json()
    print("Predicted Price:", prediction_result)
else:
    # Fall back to the whole payload when the service supplied no "message".
    print("Preprocessing Failed:", preprocessed_data.get("message", preprocessed_data))


Predicted Price: {'predicted_price': [7740.42831189399], 'status': 'Success'}


In [None]:
# Re-inspect the JSON payload returned by the preprocessing request above.
print(preprocessed_data)


In [None]:
# Alternative direct read, kept for reference:
# df = pd.read_excel('./data/Data_Train.xlsx')

# Load the training dataset through the project helper `load_data`
# (imported from dev.dev_run_v0); how the file name is resolved to a path is
# decided inside that helper and is not visible in this notebook.
df = load_data('Data_Train.xlsx') 

In [None]:
# Display the freshly loaded frame for a quick visual check.
df

In [None]:
# Summarise column dtypes and non-null counts before any transformation.
df.info()


In [None]:
def extract_date_hour(dataset, col):
    '''Replace a datetime column with separate hour and minute columns.

    Two columns named `<col>_hour` and `<col>_minute` are added from the
    `.dt.hour` and `.dt.minute` components of `dataset[col]`, after which
    `col` itself is dropped.

    The frame is modified in place; the same object is returned so the call
    can also be used in an assignment.
    '''
    hour_name = col + '_hour'
    minute_name = col + '_minute'

    timestamps = dataset[col].dt
    dataset[hour_name] = timestamps.hour
    dataset[minute_name] = timestamps.minute

    # The source column is no longer needed once its parts are extracted.
    dataset.drop(columns=col, inplace=True)
    return dataset

In [None]:
# Changing the datatype from object to datetime.
# NOTE(review): journey dates appear to be written day/month/year (the sample
# request earlier in this notebook sends "12/03/2024") - confirm against
# Data_Train.xlsx. Without `dayfirst=True`, pandas reads ambiguous values such
# as 12/03 month-first, which would swap the Day and Month features below.
df['Date_of_Journey'] = pd.to_datetime(df['Date_of_Journey'], dayfirst=True)
df['Dep_Time']        = pd.to_datetime(df['Dep_Time'])
df['Arrival_Time']    = pd.to_datetime(df['Arrival_Time'])


# Extracting Day and Month; the original date column is then dropped.
df['Day']   = df['Date_of_Journey'].dt.day
df['Month'] = df['Date_of_Journey'].dt.month
df.drop('Date_of_Journey', axis=1, inplace=True)


# Extracting Hour and Minute: each time column is replaced by
# `<col>_hour` / `<col>_minute` via extract_date_hour.
df = extract_date_hour(df, 'Arrival_Time')
df = extract_date_hour(df, 'Dep_Time')

In [None]:

# Integer-encode each categorical feature with its own LabelEncoder.
# A single shared encoder would be refit on every column and so would only
# remember the classes of the last one; keeping one fitted encoder per column
# in `encoders` preserves every mapping (e.g. for `inverse_transform` or for
# encoding new rows consistently).
categorical_cols = ['Airline', 'Source', 'Destination', 'Route', 'Total_Stops', 'Additional_Info']
encoders = {}
for col in categorical_cols:
    # `encoder` is still bound at module level; after the loop it refers to
    # the encoder fitted on the last column, as before.
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])
    encoders[col] = encoder

# Alternative one-hot encoding, kept for reference:
# df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)


In [None]:
# Display the frame again to check the date/time and categorical transformations.
df

In [None]:
def _pad_duration(text):
    """Return `text` with both an hour and a minute token, e.g. '19h' -> '19h 0m'."""
    if len(text.split(' ')) == 2:
        # Already two space-separated tokens; nothing to add.
        return text
    if 'h' in text:
        # Only hours present: append zero minutes.
        return text + ' 0m'
    # Only minutes present: prepend zero hours.
    return '0h ' + text


# Rewrite every duration so it always has two space-separated parts.
duration = [_pad_duration(entry) for entry in df['Duration']]
df['Duration'] = duration

In [None]:
# Split each duration on the single space into its two tokens, strip the
# trailing unit character from each token and store the numbers as int32.
duration_tokens = df['Duration'].str.split(' ')
df['Hour'] = duration_tokens.str[0].str[:-1].astype('int32')
df['Minute'] = duration_tokens.str[1].str[:-1].astype('int32')

# The text column is redundant once both numeric parts exist.
df.drop(columns='Duration', inplace=True)

In [None]:
price = df['Price']

# Boxplot of the target: shows spread and outliers of the fares.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x=price, ax=ax)
ax.set_title("Boxplot of Flight Prices")
plt.show()

# Histogram with a KDE overlay: shows the shape of the fare distribution.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(price, bins=50, kde=True, ax=ax)
ax.set_title("Distribution of Flight Prices")
plt.show()

In [None]:
# Numeric features derived from the date, time and duration columns.
numerical_cols = [
    'Arrival_Time_hour', 'Arrival_Time_minute',
    'Dep_Time_hour', 'Dep_Time_minute',
    'Hour', 'Minute',
    'Day', 'Month',
]

# One boxplot per feature, placed on a 3x3 grid (the last slot stays unused).
fig = plt.figure(figsize=(15, 10))
for position, col in enumerate(numerical_cols, start=1):
    ax = fig.add_subplot(3, 3, position)
    sns.boxplot(x=df[col], ax=ax)
    ax.set_title(f"Boxplot of {col}")

fig.tight_layout()
plt.show()


In [None]:
# Threshold (in whole hours) above which a flight is reported as "long".
LONG_FLIGHT_HOURS = 20

# Compare the total duration in minutes, not just the hour component:
# `Hour > 20` alone would leave out flights such as 20h 30m, which are longer
# than 20 hours and therefore belong in the count printed below.
total_minutes = df['Hour'] * 60 + df['Minute']
long_flights = df[total_minutes > LONG_FLIGHT_HOURS * 60]
print(f"Number of long flights (duration > {LONG_FLIGHT_HOURS} hours): {len(long_flights)}")

# Show a sample of the long flights
long_flights[['Airline', 'Route', 'Total_Stops', 'Hour', 'Minute', 'Price']].head()

## Splitting and Scaling Data

In [None]:
# Target is the fare; every remaining column is used as a feature.
y = df['Price']
X = df.drop(columns=['Price'])

# Hold out 20% of the rows for evaluation (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply it to both splits so
# no statistics from the test rows enter the scaling.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Scaling completed successfully!")

In [None]:
# Fit a seeded random-forest regressor on the scaled training features
# (`fit` returns the estimator itself, so construction and fitting chain).
regressor = RandomForestRegressor(random_state=42).fit(X_train_scaled, y_train)

# Predict fares for the held-out rows.
y_pred = regressor.predict(X_test_scaled)

In [None]:
# Score the held-out predictions with the three regression metrics.
mae, mse, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

print("Model Evaluation Metrics:")
for label, value in (
    ("Mean Absolute Error (MAE)", mae),
    ("Mean Squared Error (MSE)", mse),
    ("R² Score", r2),
):
    print(f"{label}: {value}")

## Airfare Prediction (Classification)


In [None]:
# Features and target for the airfare model (same columns as the regression above).
X_airfare = df.drop(['Price'], axis=1)
y_airfare = df['Price']

# Same 80/20 split and seed as the regression experiment.
X_train_airfare, X_test_airfare, y_train_airfare, y_test_airfare = train_test_split(X_airfare, y_airfare, test_size=0.2, random_state=42)

# Seed the forest so repeated runs give the same model, matching the
# random_state=42 used by the split and by the regressor.
# NOTE(review): `Price` is used as the class label as-is, so every distinct
# fare becomes its own class. If classification is intended, the target
# probably needs binning into fare bands first - confirm.
airfare_model = RandomForestClassifier(random_state=42)
airfare_model.fit(X_train_airfare, y_train_airfare)