In [None]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score , mean_absolute_error

In [72]:
# Read the raw training workbook into the working frame `df`.
df = pd.read_excel(io='Data_Train.xlsx')

In [None]:
# Plain-text preview of the first ten rows.
print(df.iloc[:10])

# Step 1: Data Preprocessing


In [None]:
# Summarise column dtypes and non-null counts of the raw frame.
df.info()

In [None]:
# Missing values per column before cleaning.
df.isna().sum()

In [None]:
# Rows where either the route or the stop count is missing.
has_gap = df[['Route', 'Total_Stops']].isna().any(axis=1)
missing_data = df.loc[has_gap]
print(missing_data)

In [77]:
# Remove every row that contains at least one missing value (in place).
df.dropna(axis=0, how='any', inplace=True)

In [None]:
# Missing values per column after dropping incomplete rows.
df.isna().sum()

In [None]:
# Number of rows that repeat an earlier row exactly.
df.duplicated(keep='first').sum()

In [80]:
# Keep the first occurrence of each duplicated row and drop the rest (in place).
df.drop_duplicates(keep='first', inplace=True)

In [None]:
# Duplicate count after de-duplication.
df.duplicated(keep='first').sum()

In [None]:
# Re-check dtypes and non-null counts after dropping missing and duplicate rows.
df.info()

# Exploratory Data Analysis

In [None]:
# Descriptive statistics of the frame.
df.describe()

In [84]:
# Parse 'Date_of_Journey' (day/month/year) once, then derive the day and month features
journey_date = pd.to_datetime(df['Date_of_Journey'], format='%d/%m/%Y')
df['Journey_Day'] = journey_date.dt.day
df['Journey_Month'] = journey_date.dt.month

In [85]:
# Parse 'Dep_Time' (HH:MM) once, then derive the departure hour and minute features
departure_clock = pd.to_datetime(df['Dep_Time'], format='%H:%M')
df['Dep_Hour'] = departure_clock.dt.hour
df['Dep_Minute'] = departure_clock.dt.minute

In [86]:
# 'Arrival_Time' may carry extra text besides the clock time, so only the first HH:MM
# token is extracted before parsing.
arrival_times = df['Arrival_Time'].str.extract(r'(\d{2}:\d{2})')
arrival_clock = pd.to_datetime(arrival_times[0], format='%H:%M')
df['Arrival_Hour'] = arrival_clock.dt.hour
df['Arrival_Minute'] = arrival_clock.dt.minute

In [87]:
# The raw date/time columns are now represented by the derived features; remove them
df.drop(columns=['Date_of_Journey', 'Dep_Time', 'Arrival_Time'], inplace=True)

In [88]:
# Convert 'Duration' to total minutes.
# The pattern captures an optional "<n>h" part and an optional "<n>m" part; a part that is
# absent is extracted as NaN and counted as 0.
duration_pattern = r'(?:(\d+)h)?\s*(?:(\d+)m)?'
duration = df['Duration'].str.extract(duration_pattern).fillna(0).astype(int)
hours, minutes = duration[0], duration[1]
df['Duration_Minutes'] = hours * 60 + minutes

In [89]:
# 'Duration_Minutes' replaces the textual 'Duration' column
df.drop(columns=['Duration'], inplace=True)

In [None]:
# Reduce 'Total_Stops' to its leading number; labels without any digit
# (presumably "non-stop" — verify against the data) become 0.
# A raw string is used so that `\d` reaches the regex engine untouched, and expand=False
# returns a Series instead of a one-column DataFrame.
stop_counts = df['Total_Stops'].str.extract(r'(\d+)', expand=False)
df['Total_Stops'] = stop_counts.fillna(0).astype(int)

In [None]:
# Display the frame after feature extraction.
df

# Visualization

In [92]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import plotly.express as px
from matplotlib.colors import LinearSegmentedColormap
import plotly.express as px


In [None]:
# Column names available for plotting.
df.columns

* Histogram of Price Distribution

In [None]:

# Histogram (50 bins) of ticket prices with a KDE overlay
price_fig, price_ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['Price'], bins=50, kde=True, color='royalblue', ax=price_ax)
price_ax.set_title('Price Distribution')
price_ax.set_xlabel('Price')
price_ax.set_ylabel('Frequency')
plt.show()


In [None]:
# Number of flights per departure city.
df["Source"].value_counts()

* Distribution of flights by Source

In [None]:
# Pie chart to show the distribution of flights by source.
# Sectors are defined by 'Source', so the colour is keyed on 'Source' as well; colouring by
# another column ('Airline') does not correspond to the sectors being drawn.
fig = px.pie(df, names='Source', title='Distribution of Flights by Source', color='Source',
             color_discrete_sequence=px.colors.sequential.Reds)

# Update the layout to increase the size of the pie chart
fig.update_layout(
    width=800,  # Adjust width as needed
    height=800,  # Adjust height as needed
)

fig.show()


* Histogram of Airline Distribution

In [None]:
# Flight count per airline.
px.histogram(df, x="Airline", color_discrete_sequence=['royalblue'])

* Distribution of Price by Airline

In [None]:
# Price aggregated per airline, one colour per airline
fig = px.histogram(
    df,
    x="Airline",
    y="Price",
    color="Airline",
    color_discrete_sequence=px.colors.sequential.Reds,
)
airline_price_layout = dict(
    title="Distribution of Price by Airline",
    xaxis_title="Airline",
    yaxis_title="Price",
)
fig.update_layout(**airline_price_layout)
fig.show()

In [None]:
# One filled density curve per airline; common_norm=False normalises each curve on its own
kde_fig, kde_ax = plt.subplots(figsize=(10, 6))
sns.kdeplot(data=df, x="Price", hue="Airline", fill=True, common_norm=False,
            palette="Set1", ax=kde_ax)
kde_ax.set_title("Price Distribution by Airline")
kde_ax.set_xlabel("Price")
kde_ax.set_ylabel("Density")
plt.show()

* Distribution of Journeys by Day

In [None]:
# Number of journeys per day of the month.
plt.figure(figsize=(10, 6))
# A single bar colour goes through `color`; `palette` is meant for hue-mapped colours and
# seaborn warns when it is passed without `hue`.
sns.countplot(x='Journey_Day', data=df, color='royalblue')
plt.title('Distribution of Journeys by Day')
plt.xlabel('Journey Day')
plt.ylabel('Count')
plt.show()


* Distribution of Price by Journey_Day

In [None]:
# Price spread for each journey day
day_fig, day_ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x="Journey_Day", y="Price", data=df, palette="Paired", ax=day_ax)
day_ax.set_title("Boxplot of Price by Journey Day")
day_ax.set_xlabel("Journey Day")
day_ax.set_ylabel("Price")
plt.show()

A boxplot helps identify the median, quartiles, and potential outliers of the price for each journey day.

* Distribution of Journeys by Month

In [None]:
# Number of journeys per month.
plt.figure(figsize=(10, 6))
# A single bar colour goes through `color`; `palette` is meant for hue-mapped colours and
# seaborn warns when it is passed without `hue`.
sns.countplot(x='Journey_Month', data=df, color='royalblue')
plt.title('Distribution of Journeys by Month')
plt.xlabel('Journey Month')
plt.ylabel('Count')
plt.show()

* Distribution of Price by Journey_Month

In [None]:
# Price spread for each journey month
month_fig, month_ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x="Journey_Month", y="Price", data=df, palette="Set2", ax=month_ax)
month_ax.set_title("Boxplot of Price by Journey Month")
month_ax.set_xlabel("Journey Month")
month_ax.set_ylabel("Price")
plt.show()

* Flight Price vs Duration by Airline

In [104]:
# Twelve visually distinct hex colours, listed as (hex code, informal name) pairs;
# only the hex codes end up in `diverse_colors`.
named_colors = (
    ('#FF5733', 'Red-Orange'),
    ('#33FF57', 'Green'),
    ('#3357FF', 'Blue'),
    ('#FF33A6', 'Pink'),
    ('#FFD633', 'Yellow'),
    ('#33FFF9', 'Aqua'),
    ('#8D33FF', 'Purple'),
    ('#FF8633', 'Tangerine'),
    ('#85FF33', 'Lime'),
    ('#FF3333', 'Bright Red'),
    ('#33FF85', 'Mint Green'),
    ('#000000', 'Black'),
)
diverse_colors = [hex_code for hex_code, _name in named_colors]


In [None]:
# Each flight as a point: duration on x, price on y, coloured per airline
fig = px.scatter(
    df,
    x='Duration_Minutes',
    y='Price',
    color='Airline',
    color_discrete_sequence=diverse_colors,
)
scatter_layout = dict(
    title="Price vs Duration by Airline",
    xaxis_title="Duration in Minutes",
    yaxis_title="Price",
)
fig.update_layout(**scatter_layout)
fig.show()


* Distribution of Price by Destination

In [None]:
# Bars of Price per destination (a one-colour sequence, so every destination is royalblue)
fig = px.bar(
    df,
    x='Destination',
    y='Price',
    color='Destination',
    color_discrete_sequence=['royalblue'],
)
destination_layout = dict(
    title="Price Distribution by Destination",
    xaxis_title="Destination",
    yaxis_title="Price",
)
fig.update_layout(**destination_layout)
fig.show()


* Frequency Distribution of Total Stops in Flights

In [None]:
# Number of flights for each stop count
stops_fig, stops_ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='Total_Stops', data=df, palette="Reds", ax=stops_ax)
stops_ax.set_title("Count of Total Stops")
stops_ax.set_xlabel("Total Stops")
stops_ax.set_ylabel("Count")
plt.show()


* Distribution of Price by Total Stops (Strip Plot)

In [None]:
# Individual prices per stop count; jitter spreads overlapping points horizontally
strip_fig, strip_ax = plt.subplots(figsize=(12, 6))
sns.stripplot(x='Total_Stops', y='Price', data=df, jitter=True, palette='Set3', ax=strip_ax)
strip_ax.set_title('Price Distribution by Total Stops')
strip_ax.set_xlabel('Total Stops')
strip_ax.set_ylabel('Price')
plt.show()


* Frequency Distribution of Additional Information Categories

In [None]:
# Flight count per 'Additional_Info' category.
px.histogram(df, x="Additional_Info", color_discrete_sequence=['royalblue'])

* Price Distribution Across Additional Information Categories

In [None]:
# Price aggregated per 'Additional_Info' category, one colour per category
fig = px.histogram(
    df,
    x="Additional_Info",
    y="Price",
    color="Additional_Info",
    color_discrete_sequence=px.colors.sequential.Reds,
)
info_layout = dict(
    title="Distribution of Price by Additional_Info",
    xaxis_title="Additional_Info",
    yaxis_title="Price",
)
fig.update_layout(**info_layout)
fig.show()

* Distribution of Flights by Route

In [None]:
# Flight count per route, one colour per route
fig = px.histogram(
    df,
    x="Route",
    color="Route",
    color_discrete_sequence=px.colors.sequential.Reds,
)
route_layout = dict(
    title="Distribution of Flights by Route",
    xaxis_title="Route",
    yaxis_title="Count",
)
fig.update_layout(**route_layout)
fig.show()

In [None]:

# Correlation heatmap over every numeric column.
# include='number' matches all integer and float widths. include=['int'] matches only the
# platform's default integer width, so columns stored with another width (e.g. int32 vs
# int64, which depends on platform and pandas version) would silently be left out.
numeric_df = df.select_dtypes(include='number')
corr_matrix = numeric_df.corr()
plt.figure(figsize=(10, 8))  # room for the annotated cells
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', linewidths=0.5)
plt.title('Correlation Heatmap')
plt.show()

In [None]:
# Dtypes before encoding the categorical columns.
df.info()

In [114]:
# Encode categorical features using LabelEncoder.
# One encoder is fitted per column and kept in `label_encoders`, so the label -> code
# mapping of every column stays available afterwards (label_encoders[col].classes_).
# Refitting a single shared encoder would retain only the last column's classes, leaving
# no way to encode new inputs consistently with the training data.
categorical_columns = ['Airline', 'Source', 'Destination', 'Route', 'Additional_Info']
label_encoders = {}
for column in categorical_columns:
    label_encoder = LabelEncoder()
    df[column] = label_encoder.fit_transform(df[column])
    label_encoders[column] = label_encoder

In [None]:
# Dtypes after encoding the categorical columns.
df.info()

In [116]:
# Persist the processed frame (without the index) for the modelling step.
df.to_excel(excel_writer='Data_Processed.xlsx', index=False)

# Step 2: Modeling




In [117]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVR
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_validate
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression

# from xgboost import XGBClassifier

In [118]:
# Reload the processed data written by the preprocessing step.
df_Processed = pd.read_excel(io='Data_Processed.xlsx')

In [119]:
# Target is the ticket price; every other column is a feature.
y = df_Processed['Price']
X = df_Processed.drop(columns=['Price'])

In [120]:
# Numeric subset of the feature frame (a DataFrame, not a list of column names).
Num_Columns = X.select_dtypes(include=["number"])

In [121]:
# Numeric preprocessing: KNN imputation followed by standard scaling.
# NOTE(review): this pipeline is not referenced by the modelling cells visible here.
Num_Steps = [("Num_Imputer", KNNImputer()), ("Scaler", StandardScaler())]
Num_Pipeline = Pipeline(steps=Num_Steps)


In [122]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [123]:
# Candidate regressors to compare, keyed by display name.
random_forest = RandomForestRegressor(
    n_estimators=200,
    max_depth=20,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
)
models = {
    "RandomForestRegressor": random_forest,
    "LinearRegression": LinearRegression(),
    "SVR": SVR(),
}

In [124]:
# Compare the candidate models with 5-fold cross-validation.
# Only the training split is used, so the rows held out in X_test / y_test stay unseen
# during model comparison and remain a fair final check.
results = {}

for model_name, model in models.items():
    cv_results = cross_validate(
        estimator=model,
        X=X_train,
        y=y_train,
        scoring=['neg_mean_squared_error', 'r2'],
        cv=5,  # Number of folds
        return_train_score=True
    )

    # Average each metric over the folds; MSE is reported by sklearn as a negated score,
    # so the sign is flipped back.
    avg_train_r2 = cv_results['train_r2'].mean()
    avg_test_r2 = cv_results['test_r2'].mean()
    avg_train_mse = -cv_results['train_neg_mean_squared_error'].mean()
    avg_test_mse = -cv_results['test_neg_mean_squared_error'].mean()

    results[model_name] = {
        "Mean Train MSE": avg_train_mse,
        "Mean Test MSE": avg_test_mse,
        "Mean Train R2": avg_train_r2,
        "Mean Test R2": avg_test_r2,
        # Heuristic flag: train R2 exceeds validation R2 by more than 0.1
        "Overfitting": avg_train_r2 - avg_test_r2 > 0.1
    }


In [None]:
# Print every stored metric per model, in a fixed order, followed by a separator line
report_fields = (
    "Mean Train MSE",
    "Mean Test MSE",
    "Mean Train R2",
    "Mean Test R2",
    "Overfitting",
)
for model_name, result in results.items():
    print(f"Model: {model_name}")
    for field in report_fields:
        print(f"{field}: {result[field]}")
    print("-" * 30)

In [None]:
# Baseline: a random forest with default hyper-parameters, evaluated on the hold-out split.
# The forest is seeded (same seed as the train/test split) so the reported scores are
# reproducible from run to run.
pipeline = Pipeline([
    ("model", RandomForestRegressor(random_state=42))
])

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# For a regressor, .score() returns R2
model = pipeline.named_steps["model"]
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)


print("Model:RandomForestRegressor")
print(f"Mean Squared Error: {mse}")
print(f"R2 Score: {r2}")
print(f"Training Score: {round(train_score*100,2)} %")
print(f"Testing Score: {round(test_score*100,2)} %")
# Heuristic flag: train R2 exceeds test R2 by more than 0.1
print(f"Overfitting: {train_score - test_score > 0.1}")
print("-" * 30)

# Hyperparameter Tuning

In [None]:
# Hyper-parameters of the fitted baseline forest.
pipeline.named_steps["model"].get_params()

In [128]:
# Estimator and search space for the grid search.
# The forest is seeded so that differences between grid points come from the
# hyper-parameters rather than from random variation between fits.
modeel = RandomForestRegressor(random_state=42)
# Keys use the '<step name>__<parameter>' form; the pipeline step is named 'regressor'.
param = {
    'regressor': [modeel],
    'regressor__n_estimators': [100, 200, 300, 400],
    'regressor__max_depth': [None, 3, 5],
}

In [129]:
# Single-step pipeline whose step name matches the 'regressor__' prefix of the grid keys.
pipeline_best = Pipeline(steps=[("regressor", modeel)])
# GridSearchCV accepts a list of grids; there is only one here.
params = [param]

In [130]:
# Exhaustive 3-fold search over the grid, keeping the training scores as well
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

grid_search = GridSearchCV(
    estimator=pipeline_best,
    param_grid=params,
    cv=3,
    return_train_score=True,
)
grid_search.fit(X_train, y_train)

In [None]:
# Parameter combination with the best mean cross-validated score.
grid_search.best_params_

In [None]:
# Mean validation score averaged over ALL grid points (not only the best one), in percent.
mean_cv_test_pct = round(grid_search.cv_results_["mean_test_score"].mean() * 100, 2)
print(mean_cv_test_pct, "%")

In [None]:
# Mean validation score of the best grid point, in percent.
best_cv_pct = round(grid_search.best_score_ * 100, 2)
print(best_cv_pct, '%')

In [None]:
# Validation score first, then training score, each averaged over all grid points (percent).
for score_key in ("mean_test_score", "mean_train_score"):
    print(round(grid_search.cv_results_[score_key].mean() * 100, 2), "%")

In [None]:
# Pipeline refitted with the best parameter combination.
grid_search.best_estimator_

In [None]:
# Final model: a random forest built from the hyper-parameters selected by the grid search.
# Refitting a default forest here would discard the tuning result. Grid keys look like
# 'regressor__<param>'; the prefix is stripped to get plain constructor arguments.
tuned_rf_params = {
    name.split("__", 1)[1]: value
    for name, value in grid_search.best_params_.items()
    if name.startswith("regressor__")
}
tuned_rf_params.setdefault("random_state", 42)  # reproducible fit

# The step keeps the name "model", which is how this cell looks it up below.
pipeline = Pipeline([
    ("model", RandomForestRegressor(**tuned_rf_params))
])

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# For a regressor, .score() returns R2
model = pipeline.named_steps["model"]
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)

# Calculate Root Mean Squared Error (RMSE)
rmse = np.sqrt(mse)
# Calculate Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)

print(f"Training Score: {round(train_score*100,2)} %")
print(f"Testing Score: {round(test_score*100,2)} %")

print("Model:RandomForestRegressor")
print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")
print(f"Mean Absolute Error: {mae}")
print(f"R2 Score: {r2}")

# Heuristic flag: train R2 exceeds test R2 by more than 0.1
print(f"Overfitting: {train_score - test_score > 0.1}")
print("-" * 30)

# Model Saving & Deployment

In [137]:
import joblib
import pandas as pd
import streamlit as st

In [None]:
# Show what is about to be saved: the feature column index, then the fitted pipeline.
for artifact in (X.columns, pipeline):
    print(artifact)

In [None]:

# Persist the fitted pipeline and the feature column index for the Streamlit app.
joblib.dump(value=pipeline, filename="Third_Group.pkl")
joblib.dump(value=X.columns, filename="Inputs.pkl")

In [180]:
%%writefile app.py
import joblib
import pandas as pd
import streamlit as st

# Load the fitted model, then the training feature columns.
# joblib.load unpickles its input, so only load artifacts produced by the training notebook.
model, inputs = (joblib.load(path) for path in ("Third_Group.pkl", "Inputs.pkl"))

# Define mappings for categorical features.
#
# The training notebook encodes these columns with scikit-learn's LabelEncoder, which
# numbers the classes in sorted label order. The codes here must match that numbering,
# otherwise the model is fed a different category than the one the user selected.
# NOTE(review): the codes assume the labels listed in each dict are the complete set seen
# in training; an unlisted training label would shift them. Confirm against the fitted
# encoders' classes_.
airline_mapping = {
    "Air Asia": 0,
    "Air India": 1,
    "GoAir": 2,
    "IndiGo": 3,
    "Jet Airways": 4,
    "Multiple carriers": 5,
    "SpiceJet": 6,
    "Vistara": 7
}

# Codes follow sorted label order (Banglore < Chennai < Delhi < Kolkata < Mumbai).
# The dict order is left as it was so the select-box options do not move.
source_mapping = {
    "Banglore": 0,
    "Kolkata": 3,
    "Delhi": 2,
    "Chennai": 1,
    "Mumbai": 4
}

# Codes follow sorted label order
# (Banglore < Cochin < Delhi < Hyderabad < Kolkata < New Delhi).
destination_mapping = {
    "New Delhi": 5,
    "Banglore": 0,
    "Cochin": 1,
    "Kolkata": 4,
    "Delhi": 2,
    "Hyderabad": 3
}

# Stop counts are plain numbers, not label codes: each label maps to its leading digit,
# and "non-stop" maps to 0.
stops_mapping = {
    "non-stop": 0,
    "1 stop": 1,
    "2 stops": 2,
    "3 stops": 3,
    "4 stops": 4
}

# NOTE(review): these codes are not in sorted label order, so they presumably do not match
# the LabelEncoder output used in training. Verify the labels and codes.
additional_info_mapping = {
    "No info": 0,
    "Business class": 1,
    "Economy class": 2
}

# NOTE(review): the training 'Route' column is label-encoded from its raw values; confirm
# that "Direct"/"Indirect" correspond to codes the model actually saw.
route_mapping = {
    "Direct": 0,
    "Indirect": 1
}

def prediction(Airline, Source, Destination, Route, Additional_Info,
               Journey_Day, Journey_Month, Dep_Hour, Dep_Minute,
               Arrival_Hour, Arrival_Minute, Duration_Minutes, Total_Stops):
    """Predict the fare for one flight.

    The six categorical arguments (Airline, Source, Destination, Route,
    Additional_Info, Total_Stops) are labels that are looked up in the
    module-level mapping dicts; a label missing from its mapping is encoded
    as -1. The remaining arguments are passed through unchanged.

    Returns:
        The first element of ``model.predict`` for the single-row input.

    Raises:
        ValueError: if ``inputs`` names a feature this function does not build.
    """
    # Key every value by its feature name so that it cannot end up in the wrong column.
    features = {
        "Airline": airline_mapping.get(Airline, -1),
        "Source": source_mapping.get(Source, -1),
        "Destination": destination_mapping.get(Destination, -1),
        "Route": route_mapping.get(Route, -1),
        "Additional_Info": additional_info_mapping.get(Additional_Info, -1),
        "Journey_Day": Journey_Day,
        "Journey_Month": Journey_Month,
        "Dep_Hour": Dep_Hour,
        "Dep_Minute": Dep_Minute,
        "Arrival_Hour": Arrival_Hour,
        "Arrival_Minute": Arrival_Minute,
        "Duration_Minutes": Duration_Minutes,
        "Total_Stops": Total_Stops if False else stops_mapping.get(Total_Stops, -1),
    }

    # `inputs` holds the training column order. Selecting by name reproduces that order
    # whatever it is, instead of assuming a hand-written list happens to match it.
    expected_columns = list(inputs)
    missing = [name for name in expected_columns if name not in features]
    if missing:
        raise ValueError(f"Model expects features this app does not provide: {missing}")

    df = pd.DataFrame([features], columns=expected_columns)

    result = model.predict(df)[0]
    return result

def main():
    """Render the Streamlit page: an "About" tab and a "Prediction" tab with the fare form."""
    st.title("Flight Fare Prediction App")

    # Static description first, interactive form second
    about_tab, predict_tab = st.tabs(["About","Prediction"])

    with about_tab:
        st.image("plane.jpg")
        st.header("About the Model")
        about_text = """
            This application uses a machine learning model to predict airline ticket prices based on various parameters such as airline, source, destination, and more.
            
            **Features:**
            - **Airline:** The airline operating the flight.
            - **Source:** The departure city.
            - **Destination:** The arrival city.
            - **Route:** Direct or indirect flight.
            - **Additional Info:** Class of service.
            - **Journey Date:** Day and month of the journey.
            - **Departure and Arrival Times:** Hours and minutes.
            - **Duration:** Flight duration in minutes.
            - **Total Stops:** Number of stops during the journey.
        """
        st.write(about_text)

    with predict_tab:
        st.header("Predict Flight Fare")

        # Categorical inputs offer exactly the labels known to the mapping dicts;
        # numeric inputs are sliders given as (min, max, default).
        airline = st.selectbox("Airline", list(airline_mapping))
        source = st.selectbox("Source", list(source_mapping))
        destination = st.selectbox("Destination", list(destination_mapping))
        route = st.selectbox("Route", list(route_mapping))
        additional_info = st.selectbox("Additional Info", list(additional_info_mapping))
        journey_day = st.slider("Journey Day", 1, 31, 15)
        journey_month = st.slider("Journey Month", 1, 12, 6)
        dep_hour = st.slider("Departure Hour", 0, 23, 12)
        dep_minute = st.slider("Departure Minute", 0, 59, 30)
        arrival_hour = st.slider("Arrival Hour", 0, 23, 15)
        arrival_minute = st.slider("Arrival Minute", 0, 59, 45)
        duration_minutes = st.slider("Duration Minutes", 0, 2860, 30)
        total_stops = st.selectbox("Total Stops", list(stops_mapping))

        # Nothing more to render until the button is pressed
        if not st.button("Predict Fare"):
            return

        fare = prediction(
            Airline=airline,
            Source=source,
            Destination=destination,
            Route=route,
            Additional_Info=additional_info,
            Journey_Day=journey_day,
            Journey_Month=journey_month,
            Dep_Hour=dep_hour,
            Dep_Minute=dep_minute,
            Arrival_Hour=arrival_hour,
            Arrival_Minute=arrival_minute,
            Duration_Minutes=duration_minutes,
            Total_Stops=total_stops,
        )
        st.success(f"Estimated Ticket Price: ₹ {fare:.2f}")


if __name__ == "__main__":
    main()

Overwriting app.py
