In [21]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
import time, datetime
import re
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error,mean_absolute_percentage_error
from IPython.display import display 
pd.set_option("display.max_columns",None)
import json,pickle,joblib

In [22]:
business_df = pd.read_csv(r"E:Flight recommendation Project\business.csv")
clean_df =pd.read_csv(r"E:Flight recommendation Project\Clean_Dataset.csv")
economy_df=pd.read_csv(r"E:Flight recommendation Project\economy.csv")

In [23]:
business_df["class"] = "business"
economy_df["class"] = "economy"

In [25]:
new = pd.concat([economy_df, business_df], ignore_index=True)
# Display the first row of the new DataFrame
new.head(1)

Unnamed: 0,date,airline,ch_code,num_code,dep_time,from,time_taken,stop,arr_time,to,price,class
0,11-02-2022,SpiceJet,SG,8709,18:55,Delhi,02h 10m,non-stop,21:05,Mumbai,5953,economy


In [26]:
new.num_code = new.num_code.astype("str")
new["flight"] = new["ch_code"] +"-"+ new["num_code"]
new.drop(["ch_code","num_code"],axis = 1,inplace = True)
new.head(1)

Unnamed: 0,date,airline,dep_time,from,time_taken,stop,arr_time,to,price,class,flight
0,11-02-2022,SpiceJet,18:55,Delhi,02h 10m,non-stop,21:05,Mumbai,5953,economy,SG-8709


In [27]:
new.rename({"dep_time": "departure_time", "from": "source_city", 
            "time_taken": "duration", "stop": "stops", "arr_time": "arrival_time",
           "to":"destination_city"}, axis = 1, inplace = True)
new.head(1)

Unnamed: 0,date,airline,departure_time,source_city,duration,stops,arrival_time,destination_city,price,class,flight
0,11-02-2022,SpiceJet,18:55,Delhi,02h 10m,non-stop,21:05,Mumbai,5953,economy,SG-8709


In [28]:
dd = pd.DataFrame(new["date"].str.split("-",expand = True).to_numpy().astype(int),columns = ["day","month","year"])
new["days_left"] = np.where(dd["month"] > 2, dd["day"] +18, np.where(dd["month"] == 2, dd["day"] -10, dd["day"]))
new.head(1)

Unnamed: 0,date,airline,departure_time,source_city,duration,stops,arrival_time,destination_city,price,class,flight,days_left
0,11-02-2022,SpiceJet,18:55,Delhi,02h 10m,non-stop,21:05,Mumbai,5953,economy,SG-8709,1


In [29]:
new.drop("date",axis = 1,inplace = True)

In [30]:
s = (pd.to_datetime(new["departure_time"]).dt.hour % 24 + 4) // 4 #give numbers from 1 to 6 #(return a series)
s.replace({1: 'Late Night', 2: 'Early Morning', 3: 'Morning', 
                      4: 'Afternoon', 5: 'Evening', 6: 'Night'} ,inplace = True) # to replace values 1:latenight to 6: night
new["departure_time"] = s 
new.head(1)

Unnamed: 0,airline,departure_time,source_city,duration,stops,arrival_time,destination_city,price,class,flight,days_left
0,SpiceJet,Evening,Delhi,02h 10m,non-stop,21:05,Mumbai,5953,economy,SG-8709,1


In [31]:
temp = pd.DataFrame(new["arrival_time"].str.split(":",expand = True).to_numpy().astype(int), 
                    columns = ["hour","minute"])
new["arrival_time"] = pd.cut(x = temp["hour"], bins = 6, labels = 
                             ["Late Night","Early Morning","Morning", "Afternoon", "Evening", "Night"])
new.head(1)

Unnamed: 0,airline,departure_time,source_city,duration,stops,arrival_time,destination_city,price,class,flight,days_left
0,SpiceJet,Evening,Delhi,02h 10m,non-stop,Night,Mumbai,5953,economy,SG-8709,1


In [32]:
temp = pd.DataFrame(new["duration"].str.split(expand = True).to_numpy().astype(str), 
                    columns = ["hour","minute"])
temp["hour"] = temp["hour"].apply(lambda x: re.sub("[^0-9]","",x)).astype(int)
temp["minute"] = temp["minute"].apply(lambda r: re.sub("[^0-9]","",r)) # we has no values in minute colum some where 
temp["minute"] = np.where(temp["minute"] == "", 0, temp["minute"]) # replaceing "" with 0
temp["minute"] = temp["minute"].astype(int) #converting data type
new["duration"] = np.around((temp["hour"] + (temp["minute"]/60)),2) #savin the total hours in duration feature
new.head(1)

Unnamed: 0,airline,departure_time,source_city,duration,stops,arrival_time,destination_city,price,class,flight,days_left
0,SpiceJet,Evening,Delhi,2.17,non-stop,Night,Mumbai,5953,economy,SG-8709,1


In [33]:
new["stops"] = new["stops"].apply(lambda r: re.sub("[^0-9]","",r)) # taking only digits
new["stops"] = np.where(new["stops"] == "", 0, new["stops"]) # replacign "" with 0
new["stops"] = new["stops"].astype(int) # changing object to int
new.head(1)

Unnamed: 0,airline,departure_time,source_city,duration,stops,arrival_time,destination_city,price,class,flight,days_left
0,SpiceJet,Evening,Delhi,2.17,0,Night,Mumbai,5953,economy,SG-8709,1


In [34]:
new["price"] = new["price"].apply(lambda r: re.sub("[^0-9]","",r)) # taking only digits
new["price"] = new["price"].astype(int) # changing object to int
new.head(1)

Unnamed: 0,airline,departure_time,source_city,duration,stops,arrival_time,destination_city,price,class,flight,days_left
0,SpiceJet,Evening,Delhi,2.17,0,Night,Mumbai,5953,economy,SG-8709,1


In [35]:
new = new[["airline","flight","source_city","departure_time","stops",
     "arrival_time","destination_city","class","duration","days_left","price"]]

In [36]:
new.to_csv("Clean_flight_data.csv", index = False)

In [37]:
df = new
df.drop_duplicates(inplace = True)

In [38]:
df1 = df.drop("flight", axis = 1)
df1.head(1)

Unnamed: 0,airline,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,SpiceJet,Delhi,Evening,0,Night,Mumbai,economy,2.17,1,5953


In [39]:
df1["departure_time"].replace({'Late Night':0,'Early Morning':1,'Morning':2,
                               'Afternoon':3,'Evening':4,'Night':5},inplace=True)
df1["arrival_time"].replace({'Late Night':0,'Early Morning':1,'Morning':2,
                             'Afternoon':3,'Evening':4,'Night':5},inplace=True)
df1["class"].replace({"economy":0,"business":1},inplace=True)
df1.head(3)

Unnamed: 0,airline,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,SpiceJet,Delhi,4,0,5,Mumbai,0,2.17,1,5953
1,SpiceJet,Delhi,1,0,2,Mumbai,0,2.33,1,5953
2,AirAsia,Delhi,1,0,1,Mumbai,0,2.17,1,5956


In [41]:
ohe = OneHotEncoder()
airline_encoded = ohe.fit_transform(df1[["airline"]]).toarray()

# Get the unique airline names to use as column names
airline_columns = list(df1["airline"].unique())

# Add the encoded columns to the DataFrame
df1[airline_columns] = airline_encoded

# Use pd.get_dummies for "destination_city" and "source_city"
df1 = pd.concat([df1, pd.get_dummies(df1["destination_city"], prefix="destination_city")], axis=1)
df1 = pd.concat([df1, pd.get_dummies(df1["source_city"], prefix="source_city")], axis=1)

# Drop the original columns
df1.drop(["airline", "source_city", "destination_city"], axis=1, inplace=True)

# Display the first row of the updated DataFrame
df1.head(1)

Unnamed: 0,departure_time,stops,arrival_time,class,duration,days_left,price,SpiceJet,AirAsia,Vistara,GO FIRST,Indigo,Air India,Trujet,StarAir,destination_city_Bangalore,destination_city_Chennai,destination_city_Delhi,destination_city_Hyderabad,destination_city_Kolkata,destination_city_Mumbai,source_city_Bangalore,source_city_Chennai,source_city_Delhi,source_city_Hyderabad,source_city_Kolkata,source_city_Mumbai
0,4,0,5,0,2.17,1,5953,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,False,False,False,False,False,True,False,False,True,False,False,False


In [42]:
x = df1.drop("price", axis = 1)
y = df1["price"]

In [43]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.33,random_state=42)

In [44]:
def get_prediction(model):
    model.fit(x_train,y_train)
    y_pred = model.predict(x_train)
    maep_tr = mean_absolute_percentage_error(y_train,y_pred)
    r2_tr  = r2_score(y_train,y_pred)
    y_pred1 = model.predict(x_test)
    maep_te = mean_absolute_percentage_error(y_test,y_pred1)
    r2_te  = r2_score(y_test,y_pred1)
    return print(f"""At Training: \n%MAE: {maep_tr}\nR2_Score: {r2_tr}
    \nAt Testing: \n%MAE: {maep_te}\nR2_Score: {r2_te}""")

In [45]:
lst = [("Linear Regression",LinearRegression()),("Decision Tree",DecisionTreeRegressor()),
      ("Random Forest",RandomForestRegressor()),("AdaBoost",AdaBoostRegressor()),
      ("XG Boost",XGBRegressor())]
for name,model in lst:
    print(f"The Performance of {name} without Scaling::")
    get_prediction(model)
    print("*"*50)

The Performance of Linear Regression without Scaling::
At Training: 
%MAE: 0.42915827741556495
R2_Score: 0.9093493429288637
    
At Testing: 
%MAE: 0.43094742584555423
R2_Score: 0.9101065276667425
**************************************************
The Performance of Decision Tree without Scaling::
At Training: 
%MAE: 0.0022414509454554417
R2_Score: 0.999382532773985
    
At Testing: 
%MAE: 0.07696001492174516
R2_Score: 0.9763885830307838
**************************************************
The Performance of Random Forest without Scaling::
At Training: 
%MAE: 0.028141072835190514
R2_Score: 0.9974834657666326
    
At Testing: 
%MAE: 0.0732605461832977
R2_Score: 0.9851887844051552
**************************************************
The Performance of AdaBoost without Scaling::
At Training: 
%MAE: 0.3556245379462249
R2_Score: 0.9334407607276116
    
At Testing: 
%MAE: 0.3588700346580736
R2_Score: 0.9330477335881461
**************************************************
The Performance of XG Boos

ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, the experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:arrival_time: category

In [46]:
std = StandardScaler()
x_train = pd.DataFrame(std.fit_transform(x_train), columns = x_train.columns)
x_test = pd.DataFrame(std.transform(x_test),columns = x_test.columns)

In [47]:
lst = [("Linear Regression",LinearRegression()),("Decision Tree",DecisionTreeRegressor()),
      ("Random Forest",RandomForestRegressor()),("AdaBoost",AdaBoostRegressor()),
      ("XG Boost",XGBRegressor())]
for name,model in lst:
    print(f"The Performance of {name} with Scaling::")
    get_prediction(model)
    print("*"*50)

The Performance of Linear Regression with Scaling::
At Training: 
%MAE: 0.42915827741548573
R2_Score: 0.9093493429288637
    
At Testing: 
%MAE: 0.43094742584547613
R2_Score: 0.9101065276667425
**************************************************
The Performance of Decision Tree with Scaling::
At Training: 
%MAE: 0.0022414509454554417
R2_Score: 0.999382532773985
    
At Testing: 
%MAE: 0.0764889138751138
R2_Score: 0.9766204990520959
**************************************************
The Performance of Random Forest with Scaling::
At Training: 
%MAE: 0.028084396414355173
R2_Score: 0.9974921953522866
    
At Testing: 
%MAE: 0.0729174630827932
R2_Score: 0.9852098427205469
**************************************************
The Performance of AdaBoost with Scaling::
At Training: 
%MAE: 0.357329511537765
R2_Score: 0.932364703292238
    
At Testing: 
%MAE: 0.36058979931650315
R2_Score: 0.9321265178532412
**************************************************
The Performance of XG Boost with Scaling

In [48]:
rf = RandomForestRegressor()
rf.fit(x_train,y_train)

In [49]:
y_pred = rf.predict(x_test)
y_pred

array([35652.92, 38099.  , 10968.83, ...,  1604.  , 10750.65, 64152.84],
      shape=(99086,))

In [50]:
r2     = r2_score(y_test,y_pred)
mape    = mean_absolute_percentage_error(y_test,y_pred)
print(f"The Performace of the model::\nR2_score: {r2}\n%MAE: {mape}")

The Performace of the model::
R2_score: 0.985325613094146
%MAE: 0.07293036986014217


In [51]:
# creating dictionary for feature information, we will export it in json file
dictionary = {"class":{"economy":0,"business":1},
               "departure_time":{'Late Night':0,'Early Morning':1,'Morning':2,'Afternoon':3,'Evening':4,'Night':5},
               "arrival_time":{'Late Night':0,'Early Morning':1,'Morning':2,'Afternoon':3,'Evening':4,'Night':5},
              "columns":list(x.columns)}
with open("json_data.json", "w") as f:
     json.dump(dictionary,f)

In [52]:
# exporting scaling model file
with open("std_sclr.pkl", "wb") as f:
     pickle.dump(std,f)
#exporting random forest model file
with open("model.pkl", "wb") as f:
     pickle.dump(rf,f)

In [77]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Load the dataset
df = pd.read_csv('Clean_flight_data.csv')

# Filter flights from Mumbai to Delhi
df = df[(df['source_city'] == 'Delhi') & (df['destination_city'] == 'Mumbai')]

# Select relevant features
features = ['airline', 'departure_time', 'stops', 'arrival_time', 'duration', 'days_left', 'class']
X = df[features]
y = df['price']

# Encode categorical variables
encoder = OneHotEncoder()
X_encoded = encoder.fit_transform(X[['airline', 'departure_time', 'arrival_time', 'class']]).toarray()

# Scale numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X[['duration', 'days_left']])

# Combine encoded and scaled features
X_processed = np.hstack((X_encoded, X_scaled))

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)

# Train Random Forest Regressor
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Predict prices
predicted_prices = rf.predict(X_processed)

# Sort flights by predicted price
df['predicted_price'] = predicted_prices
sorted_flights = df.sort_values(by='predicted_price')

# Display sorted flights
print(sorted_flights[['airline', 'flight', 'departure_time', 'stops', 'arrival_time', 'duration', 'days_left', 'price', 'predicted_price']])

         airline   flight departure_time  stops arrival_time  duration  \
9379    SpiceJet  SG-8723        Morning      0      Morning      2.17   
9381    SpiceJet  SG-8709        Evening      0        Night      2.33   
9382    SpiceJet  SG-8169        Evening      0        Night      2.33   
9380    SpiceJet  SG-8701  Early Morning      0      Morning      2.25   
9405     AirAsia   I5-482          Night      0        Night      2.25   
...          ...      ...            ...    ...          ...       ...   
206972   Vistara   UK-707        Evening      1        Night     26.83   
206983   Vistara   UK-705  Early Morning      1    Afternoon      6.50   
206986   Vistara   UK-819      Afternoon      1        Night      7.58   
206798   Vistara   UK-707        Evening      1        Night     26.83   
206895   Vistara   UK-707        Evening      1        Night     26.83   

        days_left  price  predicted_price  
9379           47   2281      2301.540000  
9381           47   228

In [82]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_absolute_error

# Load dataset
df = pd.read_csv(r'Clean_flight_data.csv')
df["departure_time"].replace({'Late Night':0,'Early Morning':1,'Morning':2,
                               'Afternoon':3,'Evening':4,'Night':5}, inplace=True)
df["arrival_time"].replace({'Late Night':0,'Early Morning':1,'Morning':2,
                             'Afternoon':3,'Evening':4,'Night':5}, inplace=True)
df["class"].replace({"economy":0, "business":1}, inplace=True)

# One-Hot Encoding for categorical variables
ohe = OneHotEncoder()
airline_encoded = ohe.fit_transform(df[["airline"]]).toarray()
airline_labels = ohe.categories_[0]
df_airline = pd.DataFrame(airline_encoded, columns=airline_labels)

df = pd.concat([df, df_airline], axis=1)
df = pd.get_dummies(df, columns=["source_city", "destination_city"], drop_first=True)

# Splitting data into training and testing sets
X = df.drop(["price", "airline", "flight"], axis=1)
y = df["price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the Random Forest model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predictions and evaluation
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae}")

# Filter flights from Delhi to Mumbai
df_filtered = df[(df["source_city_Delhi"] == 1) & (df["destination_city_Mumbai"] == 1)]

df_filtered = df_filtered.copy()
df_filtered["predicted_price"] = model.predict(df_filtered.drop(["price"], axis=1))

# Sort flights by predicted price
sorted_flights = df_filtered.sort_values(by="predicted_price")

# Display the results in tabular format
columns_to_display = ["airline", "flight", "departure_time", "stops", "arrival_time", "duration", "days_left", "class", "price", "predicted_price"]
print(sorted_flights[columns_to_display].to_string(index=False))


Mean Absolute Error: 1084.1205504026939


ValueError: could not convert string to float: 'SpiceJet'

In [85]:
!pip uninstall scikit-learn

^C


In [86]:
!pip install --upgrade scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.3.2-cp38-cp38-win_amd64.whl (9.3 MB)
Collecting joblib>=1.1.1
  Using cached joblib-1.4.2-py3-none-any.whl (301 kB)
Installing collected packages: joblib, scikit-learn
  Attempting uninstall: joblib
    Found existing installation: joblib 1.0.1
    Uninstalling joblib-1.0.1:
      Successfully uninstalled joblib-1.0.1
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 0.24.1
    Uninstalling scikit-learn-0.24.1:
      Successfully uninstalled scikit-learn-0.24.1
Successfully installed joblib-1.4.2 scikit-learn-1.3.2


In [7]:
from sklearn import __version__ as sklearn_version

In [20]:
import sklearn
print('numpy',np.__version__)
print('pandas',pd.__version__)
print('scikit-learn',sklearn.__version__)
import sys
print('python', sys.version)

numpy 2.2.3
pandas 2.2.3
scikit-learn 1.6.1
python 3.13.2 (tags/v3.13.2:4f8bb39, Feb  4 2025, 15:23:48) [MSC v.1942 64 bit (AMD64)]
