In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import time
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
import pickle
import matplotlib.pyplot as plt

In [None]:
# Load the preprocessed CSV into a DataFrame (absolute path, as in the original run)
dataset = pd.read_csv(filepath_or_buffer='/content/Clean_Dataset.csv')

# Plain-text preview of the first five rows
print(dataset.head(n=5))

   Unnamed: 0   airline   flight source_city departure_time stops  \
0           0  SpiceJet  SG-8709       Delhi        Evening  zero   
1           1  SpiceJet  SG-8157       Delhi  Early_Morning  zero   
2           2   AirAsia   I5-764       Delhi  Early_Morning  zero   
3           3   Vistara   UK-995       Delhi        Morning  zero   
4           4   Vistara   UK-963       Delhi        Morning  zero   

    arrival_time destination_city    class  duration  days_left  price  
0          Night           Mumbai  Economy      2.17          1   5953  
1        Morning           Mumbai  Economy      2.33          1   5953  
2  Early_Morning           Mumbai  Economy      2.17          1   5956  
3      Afternoon           Mumbai  Economy      2.25          1   5955  
4        Morning           Mumbai  Economy      2.33          1   5955  


In [None]:
# Display the column labels of the loaded frame
dataset.columns

Index(['Unnamed: 0', 'airline', 'flight', 'source_city', 'departure_time',
       'stops', 'arrival_time', 'destination_city', 'class', 'duration',
       'days_left', 'price'],
      dtype='object')

In [None]:
# Feature columns: the two numeric fields first, then the categorical ones.
feature_columns = [
    'duration', 'days_left',
    'airline', 'source_city', 'departure_time', 'stops',
    'arrival_time', 'destination_city', 'class',
]
indep_X = dataset[feature_columns]

In [None]:
# Target kept as a single-column DataFrame (double brackets), not a Series.
target_columns = ['price']
dep_Y = dataset[target_columns]

# Feature Selection

# Recursive Feature Elimination

In [None]:
def split_scalar(indep_X,dep_Y):
    """Split the data 75/25 (seed 0) and standardise the feature splits.

    The scaler is fitted on the training features only and then applied,
    unchanged, to the test features.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``; the two X entries are
        the scaler outputs, the two y entries come straight from the split.
    """
    features_train, features_test, target_train, target_test = train_test_split(
        indep_X, dep_Y, test_size=0.25, random_state=0
    )

    scaler = StandardScaler()
    scaled_train = scaler.fit_transform(features_train)
    scaled_test = scaler.transform(features_test)
    return scaled_train, scaled_test, target_train, target_test

In [None]:
def r2_prediction(regressor,X_test,y_test):
    """Return the R² score of ``regressor``'s predictions for ``X_test`` against ``y_test``."""
    predictions = regressor.predict(X_test)
    from sklearn.metrics import r2_score
    return r2_score(y_test, predictions)

In [None]:
def Linear(X_train,y_train,X_test,y_test=None):
    """Fit a LinearRegression on the training data and return its test R².

    Args:
        X_train: Training features.
        y_train: Training targets.
        X_test: Test features to predict on.
        y_test: Test targets to score against. Optional only for backward
            compatibility: when omitted, the notebook-level ``y_test``
            variable is used, as the original three-argument version did.

    Returns:
        The R² score computed by ``r2_prediction``.

    Raises:
        NameError: If ``y_test`` is omitted and no notebook-level ``y_test``
            exists.
    """
    from sklearn.linear_model import LinearRegression

    if y_test is None:
        # Legacy call style: the score used to depend silently on whatever
        # global ``y_test`` happened to exist. Prefer passing it explicitly.
        try:
            y_test = globals()["y_test"]
        except KeyError:
            raise NameError("name 'y_test' is not defined; pass y_test explicitly") from None

    regressor = LinearRegression()
    regressor.fit(X_train, y_train)
    return r2_prediction(regressor, X_test, y_test)

In [None]:
def Decision(X_train,y_train,X_test,y_test=None):
    """Fit a DecisionTreeRegressor (seed 0) and return its test R².

    Args:
        X_train: Training features.
        y_train: Training targets.
        X_test: Test features to predict on.
        y_test: Test targets to score against. Optional only for backward
            compatibility: when omitted, the notebook-level ``y_test``
            variable is used, as the original three-argument version did.

    Returns:
        The R² score computed by ``r2_prediction``.

    Raises:
        NameError: If ``y_test`` is omitted and no notebook-level ``y_test``
            exists.
    """
    from sklearn.tree import DecisionTreeRegressor

    if y_test is None:
        # Legacy call style: the score used to depend silently on whatever
        # global ``y_test`` happened to exist. Prefer passing it explicitly.
        try:
            y_test = globals()["y_test"]
        except KeyError:
            raise NameError("name 'y_test' is not defined; pass y_test explicitly") from None

    regressor = DecisionTreeRegressor(random_state = 0)
    regressor.fit(X_train, y_train)
    return r2_prediction(regressor, X_test, y_test)

In [None]:
def random(X_train,y_train,X_test,y_test=None):
    """Fit a 10-tree RandomForestRegressor (seed 0) and return its test R².

    Note: the function name shadows the standard-library ``random`` module
    inside this notebook; it is kept for compatibility with existing callers.

    Args:
        X_train: Training features.
        y_train: Training targets.
        X_test: Test features to predict on.
        y_test: Test targets to score against. Optional only for backward
            compatibility: when omitted, the notebook-level ``y_test``
            variable is used, as the original three-argument version did.

    Returns:
        The R² score computed by ``r2_prediction``.

    Raises:
        NameError: If ``y_test`` is omitted and no notebook-level ``y_test``
            exists.
    """
    from sklearn.ensemble import RandomForestRegressor

    if y_test is None:
        # Legacy call style: the score used to depend silently on whatever
        # global ``y_test`` happened to exist. Prefer passing it explicitly.
        try:
            y_test = globals()["y_test"]
        except KeyError:
            raise NameError("name 'y_test' is not defined; pass y_test explicitly") from None

    regressor = RandomForestRegressor(n_estimators = 10, random_state = 0)
    regressor.fit(X_train, y_train)
    return r2_prediction(regressor, X_test, y_test)

In [None]:
def xgboost(X_train,y_train,X_test,y_test=None):
    """Fit an XGBRegressor with the notebook's fixed settings and return its test R².

    Note: the function name matches the ``xgboost`` package name; it is kept
    for compatibility with existing callers.

    Args:
        X_train: Training features.
        y_train: Training targets.
        X_test: Test features to predict on.
        y_test: Test targets to score against. Optional only for backward
            compatibility: when omitted, the notebook-level ``y_test``
            variable is used, as the original three-argument version did.

    Returns:
        The R² score computed by ``r2_prediction``.

    Raises:
        NameError: If ``y_test`` is omitted and no notebook-level ``y_test``
            exists.
    """
    from xgboost import XGBRegressor

    if y_test is None:
        # Legacy call style: the score used to depend silently on whatever
        # global ``y_test`` happened to exist. Prefer passing it explicitly.
        try:
            y_test = globals()["y_test"]
        except KeyError:
            raise NameError("name 'y_test' is not defined; pass y_test explicitly") from None

    regressor = XGBRegressor(n_jobs=5,learning_rate=0.1,max_depth=10,random_state=1)
    regressor.fit(X_train, y_train)
    return r2_prediction(regressor, X_test, y_test)

In [None]:
def rfeFeature(indep_X, dep_Y, n):
    """Run Recursive Feature Elimination with four regressors and score each one.

    The data is split 75/25 (``random_state=0``) before anything is fitted.
    The categorical columns are one-hot encoded on each split and the two
    encoded frames are aligned to a common column set. For each of
    LinearRegression, DecisionTreeRegressor, RandomForestRegressor and
    XGBRegressor, an ``RFE`` selector keeping ``n`` features is fitted on the
    training split; the model is then refitted on the selected training
    columns and scored with R² on the selected test columns.

    Args:
        indep_X: Feature DataFrame containing the categorical columns listed
            in ``categorical_cols`` below.
        dep_Y: Target values. A single-column DataFrame is flattened to 1-D
            before fitting.
        n: Number of features RFE should keep.

    Returns:
        Tuple ``(rfelist, colnames_list, r2_values)`` with one entry per
        model, in the order linear, decision tree, random forest, XGBoost:
        ``rfelist`` holds ``(X_train_rfe, X_test_rfe)`` pairs,
        ``colnames_list`` the selected column names and ``r2_values`` the
        test-set R² scores.
    """
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.linear_model import LinearRegression
    from sklearn.tree import DecisionTreeRegressor
    from xgboost import XGBRegressor

    def _column_to_1d(y):
        # Flatten an (n, 1) target to shape (n,); leave any other shape untouched.
        arr = np.asarray(y)
        return arr.ravel() if arr.ndim == 2 and arr.shape[1] == 1 else y

    rfelist = []
    colnames_list = []
    r2_values = []

    # Split data before RFE to prevent data leakage
    X_train, X_test, y_train, y_test = train_test_split(indep_X, dep_Y, test_size=0.25, random_state=0)

    # Estimators expect a 1-D target; passing the single-column frame makes
    # them emit a conversion warning on every fit inside the RFE loop.
    y_train = _column_to_1d(y_train)
    y_test = _column_to_1d(y_test)

    # One-hot encode the categorical features of each split
    categorical_cols = ['airline', 'source_city', 'departure_time', 'stops',
                        'arrival_time', 'destination_city', 'class']
    X_train_encoded = pd.get_dummies(X_train, columns=categorical_cols, drop_first=True)
    X_test_encoded = pd.get_dummies(X_test, columns=categorical_cols, drop_first=True)

    # Align columns - this is important if some categories are not present in both train and test sets
    X_train_encoded, X_test_encoded = X_train_encoded.align(X_test_encoded, join='outer', axis=1, fill_value=0)

    rfemodellist = [
        LinearRegression(),
        DecisionTreeRegressor(random_state=0),
        RandomForestRegressor(n_estimators=10, random_state=0),
        XGBRegressor(n_jobs=5, learning_rate=0.1, max_depth=10, random_state=1),
    ]

    for model in rfemodellist:
        # Fit the selector on the encoded training data only
        selector = RFE(estimator=model, n_features_to_select=n)
        selector.fit(X_train_encoded, y_train)

        # Reduce both splits to the selected columns
        X_train_rfe = selector.transform(X_train_encoded)
        X_test_rfe = selector.transform(X_test_encoded)
        rfelist.append((X_train_rfe, X_test_rfe))

        # Names of the columns the selector kept
        selected_columns = [col for col, kept in zip(X_train_encoded.columns, selector.support_) if kept]
        colnames_list.append(selected_columns)

        # Refit the model on the selected training columns and score on the test columns
        model.fit(X_train_rfe, y_train)
        r2_values.append(r2_prediction(model, X_test_rfe, y_test))

    return rfelist, colnames_list, r2_values

# Run RFE, keeping 5 features per model
rfelist, colnames_list, r2_values = rfeFeature(indep_X, dep_Y, 5)

# Report, model by model, which columns were kept and the resulting test R2
model_labels = ["Linear", "Decision", "Random", "XGBoost"]
for model_name, selected_columns, r2_value in zip(model_labels, colnames_list, r2_values):
    print(f"Model: {model_name}")
    print("Selected Columns:", selected_columns)
    print(f"R2 Value: {r2_value}\n")

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **

Model: Linear
Selected Columns: ['airline_Vistara', 'source_city_Kolkata', 'stops_zero', 'destination_city_Kolkata', 'class_Economy']
R2 Value: 0.9028580218801604

Model: Decision
Selected Columns: ['duration', 'days_left', 'airline_Air_India', 'arrival_time_Evening', 'class_Economy']
R2 Value: 0.927990040035466

Model: Random
Selected Columns: ['duration', 'days_left', 'airline_Air_India', 'airline_Vistara', 'class_Economy']
R2 Value: 0.9335546169447858

Model: XGBoost
Selected Columns: ['duration', 'airline_Air_India', 'stops_two_or_more', 'destination_city_Delhi', 'class_Economy']
R2 Value: 0.9441820979118347



# Model Creation

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for evaluating the final models (fixed seed for a repeatable split)
x_train,x_test,y_train,y_test=train_test_split(indep_X, dep_Y, test_size=0.30, random_state=0)

# Apply one-hot encoding to categorical features
categorical_cols = ['airline', 'source_city', 'departure_time', 'stops', 'arrival_time', 'destination_city', 'class']
x_train_encoded = pd.get_dummies(x_train, columns=categorical_cols, drop_first=True)
x_test_encoded = pd.get_dummies(x_test, columns=categorical_cols, drop_first=True)

# Align columns - this is important if some categories are not present in both train and test sets
x_train_encoded, x_test_encoded = x_train_encoded.align(x_test_encoded, join='outer', axis=1, fill_value=0)


from sklearn.tree import DecisionTreeRegressor
# splitter='random' picks split points at random, so the seed is pinned;
# otherwise the fitted tree (and the model pickled from it) changes on every run.
regressor_dt=DecisionTreeRegressor(criterion='squared_error', splitter='random', random_state=0)
# Use the encoded data for training
regressor_dt=regressor_dt.fit(x_train_encoded,y_train)

In [None]:
# Predict the target for the encoded held-out rows with the fitted Decision Tree
y_pred=regressor_dt.predict(x_test_encoded)

In [None]:
from sklearn.metrics import r2_score
# R² of the current y_pred against the held-out targets
r_score=r2_score(y_test,y_pred)

In [None]:
# Display the most recently computed R² score
r_score

0.9849978611103386

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
# 500 shallow trees with a small learning rate; the seed is pinned so reruns give the same model
regressor_gbr = GradientBoostingRegressor(n_estimators=500,max_depth=4,min_samples_split=5,learning_rate=0.01,loss="squared_error",random_state=0)
# y_train is a single-column DataFrame; flatten it to 1-D so fit() does not
# emit the "column-vector y was passed" conversion warning.
regressor_gbr.fit(x_train_encoded, y_train.values.ravel())

  y = column_or_1d(y, warn=True)  # TODO: Is this still required?


In [None]:
# Predict the target for the encoded held-out rows with the fitted gradient-boosting model
y_pred=regressor_gbr.predict(x_test_encoded)

In [None]:
from sklearn.metrics import r2_score
# Recompute R² for the current y_pred, then display it as the cell output
r_score=r2_score(y_test,y_pred)
r_score

0.9849978611103386

In [None]:
from sklearn.ensemble import RandomForestRegressor
# 100-tree random forest with a fixed seed
regressor_rf = RandomForestRegressor(n_estimators=100,random_state=0)
# .values.ravel() hands the estimator a 1-D target instead of the single-column frame
regressor_rf.fit(x_train_encoded, y_train.values.ravel())

In [None]:
# Predict the target for the encoded held-out rows with the fitted random forest
y_pred=regressor_rf.predict(x_test_encoded)

In [None]:
# Score the Random Forest predictions. Without recomputing here, this cell
# would only re-display the stale r_score left over from the previous model.
from sklearn.metrics import r2_score
r_score=r2_score(y_test,y_pred)
r_score

0.9849978611103386

In [None]:
# Comparing the test-set R² scores, the Decision Tree regressor performs best (R² ≈ 0.98 in the recorded run), so it is the model saved below.

In [None]:
import pickle
# File name the chosen model is pickled to
Finalised_Model="Finalized_model.sav"

In [None]:
# Persist the fitted Decision Tree. The context manager closes the file handle
# (flushing the pickle to disk) even if serialisation raises.
with open(Finalised_Model, 'wb') as model_file:
    pickle.dump(regressor_dt, model_file)