# Seminar 1: Regression Models - Group 20 ( Jwan Mardini, Mohamad Alloush, Abshir Muhumed Abdi )

## Dataset5 - Air Travel - Price prediction

### Importing, merging and shuffling datasets 

In [7]:
import pandas as pd

In [9]:
# Load the two raw fare tables (one CSV per cabin class)
business_df = pd.read_csv('business.csv')
economy_df = pd.read_csv('economy.csv')

# Tag every row with its cabin class so the origin survives the merge
business_df['type'] = 'business'
economy_df['type'] = 'economy'

# Stack both tables into a single frame with a fresh 0..n-1 index
merged_df = pd.concat([business_df, economy_df], ignore_index=True)

# Shuffle all rows; the fixed seed keeps the row order identical on every
# Restart & Run All, so downstream outputs are reproducible
shuffled_df = merged_df.sample(frac=1, random_state=42).reset_index(drop=True)

shuffled_df.head(5)

Unnamed: 0,date,airline,ch_code,num_code,dep_time,from,time_taken,stop,arr_time,to,price,type
0,26-02-2022,Air India,AI,640,06:45,Bangalore,05h 30m,1-stop\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t...,12:15,Delhi,42521,business
1,13-03-2022,Vistara,UK,810,07:00,Bangalore,14h 15m,1-stop\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t...,21:15,Mumbai,7212,economy
2,29-03-2022,Vistara,UK,871,20:35,Delhi,26h 25m,1-stop\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t...,23:00,Mumbai,41281,business
3,01-03-2022,Air India,AI,569,06:20,Chennai,17h 20m,1-stop\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t...,23:40,Kolkata,54481,business
4,22-02-2022,Indigo,6E,269,09:55,Chennai,06h 40m,1-stop\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t...,16:35,Mumbai,8665,economy


### Preprocessing

#### Formatting and categorizing 

##### Formatting

In [14]:
import re

def convert_time_to_minutes(time_str):
    """Convert a duration string such as '2h 10m' into whole minutes.

    Recognised shapes are '2h 10m', '2h', '10m' and decimal hours such as
    '1.03h'. The hours figure is read together with its optional decimal
    part even when other text follows it (e.g. '1.03h m'), so the digits
    after the decimal point are never mistaken for a whole-hour count.

    Parameters
    ----------
    time_str : str
        Raw duration text; surrounding and repeated whitespace is tolerated.

    Returns
    -------
    int
        Duration in minutes (decimal hours are truncated to whole minutes);
        0 when neither an hours nor a minutes figure is present.
    """
    # Normalise whitespace so the patterns below see a compact string
    time_str = re.sub(r"\s+", " ", time_str.strip())

    total_minutes = 0

    # Hours: capture the full figure including an optional decimal part
    hours_match = re.search(r'(\d+(?:\.\d+)?)h', time_str)
    if hours_match:
        hours_text = hours_match.group(1)
        if '.' in hours_text:
            # Decimal hours, e.g. '1.03h' -> int(1.03 * 60)
            total_minutes += int(float(hours_text) * 60)
        else:
            total_minutes += int(hours_text) * 60

    # Minutes: plain integer directly followed by 'm'
    minutes_match = re.search(r'(\d+)m', time_str)
    if minutes_match:
        total_minutes += int(minutes_match.group(1))

    return total_minutes

# Derive the duration in minutes from the raw 'time_taken' text, row by row
shuffled_df['time_taken_min'] = shuffled_df['time_taken'].map(convert_time_to_minutes)


In [16]:
# The raw duration text is no longer needed once 'time_taken_min' exists
shuffled_df = shuffled_df.drop(columns=['time_taken'])

In [18]:
# Convert the duration from minutes to hours (the column is renamed in the next cell)
shuffled_df["time_taken_min"] = shuffled_df["time_taken_min"] / 60

In [20]:
# The column now holds hours, so give it back the neutral name 'time_taken'
shuffled_df = shuffled_df.rename(columns={'time_taken_min': 'time_taken'})

In [22]:
# Trim the 'stop' text and remove each newline together with the whitespace run that follows it
shuffled_df['stop_cleaned'] = (
    shuffled_df['stop']
    .str.strip()
    .str.replace(r'\n\s*', '', regex=True)
)

In [24]:
# 'stop_cleaned' supersedes the raw 'stop' column
shuffled_df = shuffled_df.drop(columns=['stop'])

In [26]:
# Strip thousands separators from the price text, then parse it as a float
shuffled_df['price'] = (
    shuffled_df['price']
    .str.replace(',', '', regex=False)
    .astype(float)
)

In [28]:
# Parse the day-month-year travel date
shuffled_df['date'] = pd.to_datetime(shuffled_df['date'], format='%d-%m-%Y')

# Parse both HH:MM clock columns and keep only the time-of-day part
for clock_col in ('dep_time', 'arr_time'):
    shuffled_df[clock_col] = pd.to_datetime(shuffled_df[clock_col], format='%H:%M').dt.time

##### Categorizing using OneHotEncoder

In [31]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the cleaned stop description.
# Note: despite its name, 'stop_cleaned_encoder' holds the encoded matrix;
# the fitted encoder object itself is 'cat_encoder'.
cat_encoder = OneHotEncoder()
stop_cleaned_cat = shuffled_df[["stop_cleaned"]]
stop_cleaned_encoder = cat_encoder.fit_transform(stop_cleaned_cat)

In [32]:
# Materialise the encoded matrix as a DataFrame with one named column per stop category
stop_encoded_df = pd.DataFrame(
    stop_cleaned_encoder.toarray(),
    columns=cat_encoder.get_feature_names_out(['stop_cleaned']),
)

# Append the indicator columns side by side, then discard the text column they replace
shuffled_df = pd.concat([shuffled_df, stop_encoded_df], axis=1).drop(columns=['stop_cleaned'])

In [34]:
# One-hot encode the airline name
airline_encoder = OneHotEncoder()
airline_cat = shuffled_df[["airline"]]
airline_encoded = airline_encoder.fit_transform(airline_cat)


In [36]:
# Materialise the encoded matrix as a DataFrame with one named column per airline
airline_encoded_df = pd.DataFrame(
    airline_encoded.toarray(),
    columns=airline_encoder.get_feature_names_out(['airline']),
)

# Append the indicator columns side by side, then discard the text column they replace
shuffled_df = pd.concat([shuffled_df, airline_encoded_df], axis=1).drop(columns=['airline'])

In [38]:
# One-hot encode the destination city
to_encoder = OneHotEncoder()
to_cat = shuffled_df[["to"]]
to_encoded = to_encoder.fit_transform(to_cat)

In [41]:
# Materialise the encoded matrix as a DataFrame with one named column per destination
to_encoded_df = pd.DataFrame(
    to_encoded.toarray(),
    columns=to_encoder.get_feature_names_out(['to']),
)

# Append the indicator columns side by side, then discard the text column they replace
shuffled_df = pd.concat([shuffled_df, to_encoded_df], axis=1).drop(columns=['to'])

In [42]:
# One-hot encode the origin city
from_encoder = OneHotEncoder()
from_cat = shuffled_df[["from"]]
from_encoded = from_encoder.fit_transform(from_cat)

In [45]:
# Materialise the encoded matrix as a DataFrame with one named column per origin
from_encoded_df = pd.DataFrame(
    from_encoded.toarray(),
    columns=from_encoder.get_feature_names_out(['from']),
)

# Append the indicator columns side by side, then discard the text column they replace
shuffled_df = pd.concat([shuffled_df, from_encoded_df], axis=1).drop(columns=['from'])

##### Categorizing using binary (0/1) encoding

In [48]:
# Encode the cabin class as a 0/1 flag in 'class': 1 for 'business' rows, 0 for every other value
shuffled_df["class"] = (shuffled_df["type"] == "business").astype("int64")

In [50]:
# Count rows per cabin-class flag (1 = business, 0 = economy) as a sanity check on the encoding
shuffled_df["class"].value_counts()

class
0    206774
1     93487
Name: count, dtype: int64

In [52]:
# 'class' now carries the cabin information, so the text column can go
shuffled_df = shuffled_df.drop(columns=["type"])

In [54]:
# Remove the flight-code columns; they are not used as model features
shuffled_df = shuffled_df.drop(columns=['ch_code', 'num_code'])

##### Custom categorizing - time and applying OneHotEncoder

In [57]:
import datetime

def categorize_time(time_str):
    """Map a clock time to a named part of the day.

    Accepts either a ``datetime.time`` or an 'HH:MM'-style string; only the
    hour is used. Buckets (by hour): 5-7 'Early Morning', 8-11 'Morning',
    12-15 'Afternoon', 16-19 'Evening', 20-22 'Night', everything else
    (23 and 0-4) 'Late Night'.
    """
    if isinstance(time_str, datetime.time):
        hour = time_str.hour
    else:
        hour = int(time_str.split(':')[0])

    # Ordered (exclusive upper bound, label) pairs; the first bound above the hour wins
    buckets = (
        (5, 'Late Night'),
        (8, 'Early Morning'),
        (12, 'Morning'),
        (16, 'Afternoon'),
        (20, 'Evening'),
        (23, 'Night'),
    )
    for upper_bound, label in buckets:
        if hour < upper_bound:
            return label

    # Hour 23 (or anything beyond the table) falls through to the catch-all bucket
    return 'Late Night'

# Bucket both the departure and the arrival clock times into part-of-day labels
for leg in ('dep', 'arr'):
    shuffled_df[f'{leg}_time_category'] = shuffled_df[f'{leg}_time'].apply(categorize_time)


In [59]:
# Spot-check the bucketing: raw times next to their assigned categories
preview_cols = ['dep_time', 'dep_time_category', 'arr_time', 'arr_time_category']
print(shuffled_df[preview_cols].head(10))

   dep_time dep_time_category  arr_time arr_time_category
0  06:45:00     Early Morning  12:15:00         Afternoon
1  07:00:00     Early Morning  21:15:00             Night
2  20:35:00             Night  23:00:00        Late Night
3  06:20:00     Early Morning  23:40:00        Late Night
4  09:55:00           Morning  16:35:00           Evening
5  15:30:00         Afternoon  17:40:00           Evening
6  20:55:00             Night  19:55:00           Evening
7  09:00:00           Morning  23:10:00        Late Night
8  14:55:00         Afternoon  10:15:00           Morning
9  07:00:00     Early Morning  20:20:00             Night


In [61]:
# The category columns replace the raw clock times
shuffled_df = shuffled_df.drop(columns=['dep_time', 'arr_time'])

In [63]:
# One-hot encode the departure part-of-day label
dep_time_category_encoder = OneHotEncoder()
dep_time_category_cat = shuffled_df[["dep_time_category"]]
dep_time_category_encoded = dep_time_category_encoder.fit_transform(dep_time_category_cat)

In [65]:
# Materialise the encoded matrix as a DataFrame with one named column per departure bucket
dep_time_category_encoded_df = pd.DataFrame(
    dep_time_category_encoded.toarray(),
    columns=dep_time_category_encoder.get_feature_names_out(['dep_time_category']),
)

# Append the indicator columns side by side, then discard the label column they replace
shuffled_df = pd.concat([shuffled_df, dep_time_category_encoded_df], axis=1).drop(
    columns=['dep_time_category']
)

In [66]:
# One-hot encode the arrival part-of-day label
arr_time_category_encoder = OneHotEncoder()
arr_time_category_cat = shuffled_df[["arr_time_category"]]
arr_time_category_encoded = arr_time_category_encoder.fit_transform(arr_time_category_cat)

In [69]:
# Materialise the encoded matrix as a DataFrame with one named column per arrival bucket
arr_time_category_encoded_df = pd.DataFrame(
    arr_time_category_encoded.toarray(),
    columns=arr_time_category_encoder.get_feature_names_out(['arr_time_category']),
)

# Append the indicator columns side by side, then discard the label column they replace
shuffled_df = pd.concat([shuffled_df, arr_time_category_encoded_df], axis=1).drop(
    columns=['arr_time_category']
)

In [71]:
# The travel date is not used as a model feature
shuffled_df = shuffled_df.drop(columns=['date'])

##### Final Data Frame

In [74]:
# Lift pandas' column display limit so wide frames are rendered in full
pd.options.display.max_columns = None

In [76]:
# Preview the first ten rows of the fully encoded feature table
shuffled_df.head(10)

Unnamed: 0,price,time_taken,stop_cleaned_1-stop,stop_cleaned_1-stopVia BBI,stop_cleaned_1-stopVia Bhubaneswar,stop_cleaned_1-stopVia Chennai,stop_cleaned_1-stopVia Delhi,stop_cleaned_1-stopVia GAU,stop_cleaned_1-stopVia GAY,stop_cleaned_1-stopVia GOP,stop_cleaned_1-stopVia Guwahati,stop_cleaned_1-stopVia HYD,stop_cleaned_1-stopVia Hyderabad,stop_cleaned_1-stopVia IDR,stop_cleaned_1-stopVia IXE,stop_cleaned_1-stopVia IXR,stop_cleaned_1-stopVia IXU,stop_cleaned_1-stopVia Indore,stop_cleaned_1-stopVia JGB,stop_cleaned_1-stopVia JRG,stop_cleaned_1-stopVia KLH,stop_cleaned_1-stopVia Kolhapur,stop_cleaned_1-stopVia Kolkata,stop_cleaned_1-stopVia Lucknow,stop_cleaned_1-stopVia MYQ,stop_cleaned_1-stopVia Mangalore,stop_cleaned_1-stopVia Mumbai,stop_cleaned_1-stopVia Mysore,stop_cleaned_1-stopVia NAG,stop_cleaned_1-stopVia NDC,stop_cleaned_1-stopVia Nagpur,stop_cleaned_1-stopVia PAT,stop_cleaned_1-stopVia Patna,stop_cleaned_1-stopVia RPR,stop_cleaned_1-stopVia Raipur,stop_cleaned_1-stopVia Ranchi,stop_cleaned_1-stopVia STV,stop_cleaned_1-stopVia Surat,stop_cleaned_1-stopVia VTZ,stop_cleaned_1-stopVia Vishakhapatnam,stop_cleaned_2+-stop,stop_cleaned_non-stop,airline_Air India,airline_AirAsia,airline_GO FIRST,airline_Indigo,airline_SpiceJet,airline_StarAir,airline_Trujet,airline_Vistara,to_Bangalore,to_Chennai,to_Delhi,to_Hyderabad,to_Kolkata,to_Mumbai,from_Bangalore,from_Chennai,from_Delhi,from_Hyderabad,from_Kolkata,from_Mumbai,class,dep_time_category_Afternoon,dep_time_category_Early Morning,dep_time_category_Evening,dep_time_category_Late Night,dep_time_category_Morning,dep_time_category_Night,arr_time_category_Afternoon,arr_time_category_Early Morning,arr_time_category_Evening,arr_time_category_Late Night,arr_time_category_Morning,arr_time_category_Night
0,42521.0,5.5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,7212.0,14.25,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,41281.0,26.416667,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,54481.0,17.333333,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,8665.0,6.666667,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
5,34472.0,2.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
6,9879.0,23.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
7,23478.0,14.166667,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
8,45185.0,19.333333,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
9,6122.0,13.333333,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [78]:
# List every column name in the final frame (target 'price' plus all features)
shuffled_df.columns

Index(['price', 'time_taken', 'stop_cleaned_1-stop',
       'stop_cleaned_1-stopVia BBI', 'stop_cleaned_1-stopVia Bhubaneswar',
       'stop_cleaned_1-stopVia Chennai', 'stop_cleaned_1-stopVia Delhi',
       'stop_cleaned_1-stopVia GAU', 'stop_cleaned_1-stopVia GAY',
       'stop_cleaned_1-stopVia GOP', 'stop_cleaned_1-stopVia Guwahati',
       'stop_cleaned_1-stopVia HYD', 'stop_cleaned_1-stopVia Hyderabad',
       'stop_cleaned_1-stopVia IDR', 'stop_cleaned_1-stopVia IXE',
       'stop_cleaned_1-stopVia IXR', 'stop_cleaned_1-stopVia IXU',
       'stop_cleaned_1-stopVia Indore', 'stop_cleaned_1-stopVia JGB',
       'stop_cleaned_1-stopVia JRG', 'stop_cleaned_1-stopVia KLH',
       'stop_cleaned_1-stopVia Kolhapur', 'stop_cleaned_1-stopVia Kolkata',
       'stop_cleaned_1-stopVia Lucknow', 'stop_cleaned_1-stopVia MYQ',
       'stop_cleaned_1-stopVia Mangalore', 'stop_cleaned_1-stopVia Mumbai',
       'stop_cleaned_1-stopVia Mysore', 'stop_cleaned_1-stopVia NAG',
       'stop_cleaned_1-

In [80]:
# Inspect the dtype of each column in the final frame
shuffled_df.dtypes

price                                 float64
time_taken                            float64
stop_cleaned_1-stop                   float64
stop_cleaned_1-stopVia BBI            float64
stop_cleaned_1-stopVia Bhubaneswar    float64
                                       ...   
arr_time_category_Early Morning       float64
arr_time_category_Evening             float64
arr_time_category_Late Night          float64
arr_time_category_Morning             float64
arr_time_category_Night               float64
Length: 75, dtype: object

### Training Regression Models

#### Train Test Split

In [84]:
from sklearn.model_selection import train_test_split

# Separate the target (price) from the predictor columns
X = shuffled_df.drop(columns=["price"])
y = shuffled_df["price"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split (and
# therefore every reported metric) stable across notebook re-runs
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#### Evaluate

In [96]:
from sklearn.metrics import mean_absolute_percentage_error, r2_score


def evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    """Fit ``model`` on the training split and report R² and MAPE on the test split.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``
        Fitted in place on ``(X_train, y_train)``.
    X_train, X_test : feature tables for the training and test rows
    y_train, y_test : targets matching ``X_train`` and ``X_test``
    model_name : str
        Label used in the printed report.

    Returns
    -------
    tuple
        ``(r2, mape)`` computed on the test split.

    Side effects
    ------------
    Prints the two metrics and, when the fitted model exposes
    ``feature_importances_``, its ten largest feature importances.
    """
    # Train the model
    model.fit(X_train, y_train)

    # Make predictions on the test data
    y_pred = model.predict(X_test)

    # Calculate evaluation metrics
    r2 = r2_score(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred)

    # Print results
    print(f"\n{model_name} - Performance:")
    print(f"R²: {r2:.4f}")
    print(f"MAPE: {mape:.4f}")

    # Get feature importances if supported
    if hasattr(model, 'feature_importances_'):
        feature_importances = model.feature_importances_

        # Label the importances with the columns the model was actually trained
        # on rather than a notebook-level global; fall back to positional
        # indices when the training data carries no column names
        feature_names = getattr(X_train, 'columns', None)
        if feature_names is None:
            feature_names = list(range(len(feature_importances)))

        feature_importance_df = pd.DataFrame({
            'Feature': feature_names,
            'Importance': feature_importances
        }).sort_values(by='Importance', ascending=False)
        print(f"\n{model_name} - Top 10 Feature Importances:\n", feature_importance_df.head(10))

    return r2, mape

#### Linear Regression Model

In [99]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares baseline, trained on the training split
linear_reg = LinearRegression()
linear_reg = linear_reg.fit(X_train, y_train)


In [101]:
# Predict prices for the held-out test rows with the fitted linear model
linear_price_y_pred = linear_reg.predict(X_test)

In [103]:
from sklearn.metrics import mean_squared_error, r2_score

# Refit the linear model inside evaluate_model and print its test-set R² and MAPE
r2, mape = evaluate_model(
    linear_reg,
    X_train,
    X_test,
    y_train,
    y_test,
    model_name="Linear Regression",
)



Linear Regression - Performance:
R²: 0.9048
MAPE: 0.5022


In [129]:
import pandas as pd
import numpy as np

# Coefficients of the fitted linear model, one per feature column
coefficients = linear_reg.coef_

# Rank features by absolute coefficient size, largest first.
# NOTE(review): magnitudes are presumably only comparable between features on
# the same scale (here mostly 0/1 indicators plus 'time_taken' in hours).
feature_importance = (
    pd.DataFrame({'Feature': X_train.columns, 'Importance': np.abs(coefficients)})
    .sort_values(by='Importance', ascending=False)
)

print(feature_importance)


                                  Feature    Importance
61                                  class  4.496671e+04
38  stop_cleaned_1-stopVia Vishakhapatnam  1.487315e+04
22         stop_cleaned_1-stopVia Lucknow  9.824651e+03
21         stop_cleaned_1-stopVia Kolkata  8.512696e+03
40                  stop_cleaned_non-stop  7.592589e+03
..                                    ...           ...
64              dep_time_category_Evening  7.646987e+01
0                              time_taken  5.270278e+01
67                dep_time_category_Night  3.049962e+01
72              arr_time_category_Morning  2.205616e+01
10             stop_cleaned_1-stopVia HYD  4.001777e-11

[74 rows x 2 columns]


In [207]:
# Score the fitted linear model on the held-out test split
linear_reg.score(X_test, y_test)

0.9060963053055201

#### Random Forest Regression Model

In [105]:
from sklearn.ensemble import RandomForestRegressor

# Random forest using all CPU cores; the fixed seed makes the fitted forest
# (and its reported metrics) reproducible across notebook re-runs
RFR_reg = RandomForestRegressor(n_jobs=-1, random_state=42)
RFR_reg.fit(X_train, y_train)

In [107]:
# Score the fitted random forest on the held-out test split
RFR_reg.score(X_test, y_test)

0.9761432169416728

In [114]:
# Predict prices for the held-out test rows with the fitted random forest
RFR_price_y_pred = RFR_reg.predict(X_test)

In [115]:
from sklearn.metrics import mean_squared_error, r2_score

# Refit the random forest inside evaluate_model and print its test-set metrics and top features
r2, mape = evaluate_model(
    RFR_reg,
    X_train,
    X_test,
    y_train,
    y_test,
    model_name="RF Regression",
)


RF Regression - Performance:
R²: 0.9761
MAPE: 0.2327

RF Regression - Top 10 Feature Importances:
               Feature  Importance
61              class    0.897284
0          time_taken    0.058428
48    airline_Vistara    0.005253
41  airline_Air India    0.004944
57         from_Delhi    0.003699
51           to_Delhi    0.003353
60        from_Mumbai    0.002117
54          to_Mumbai    0.001977
53         to_Kolkata    0.001825
59       from_Kolkata    0.001679


#### Decision Tree Regression Model

In [118]:
# import the regressor 
from sklearn.tree import DecisionTreeRegressor  
  
# Single decision tree; the fixed seed makes the fitted tree reproducible
DTR_reg = DecisionTreeRegressor(random_state=42)

# Train on the training split
DTR_reg.fit(X_train, y_train)

In [120]:
# Score the fitted decision tree on the held-out test split
DTR_reg.score(X_test, y_test)

0.9757864337065146

In [122]:
# Predict prices for the held-out test rows with the decision tree itself
# (not the random forest), so the variable name matches what it holds
DTR_price_y_pred = DTR_reg.predict(X_test)

In [124]:
from sklearn.metrics import mean_squared_error, r2_score
# Refit the decision tree inside evaluate_model and print its test-set metrics and top features
r2, mape = evaluate_model(
    DTR_reg,
    X_train,
    X_test,
    y_train,
    y_test,
    model_name="DT Regression",
)


DT Regression - Performance:
R²: 0.9758
MAPE: 0.2319

DT Regression - Top 10 Feature Importances:
                       Feature  Importance
61                      class    0.898123
0                  time_taken    0.057607
41          airline_Air India    0.009499
57                 from_Delhi    0.004195
51                   to_Delhi    0.003907
60                from_Mumbai    0.002261
53                 to_Kolkata    0.001941
59               from_Kolkata    0.001604
54                  to_Mumbai    0.001516
70  arr_time_category_Evening    0.001488


#### Analysis

##### Which of the models do you recommend and why?

- Random Forest Regression has the highest R² (0.9761) with a MAPE of 0.2327 (predictions are off by roughly 23% on average), so it explains the largest share of the variance in price.
  
- Decision Tree Regression is effectively tied with it: its R² (0.9758) is marginally lower, while its MAPE (0.2319) is marginally lower as well (i.e. slightly better), so the difference between the two tree-based models is negligible on this split.
  
- Linear Regression has a lower R² (0.9048) and a much higher MAPE (0.5022), indicating that it does not capture the data's complexity as well.
  
We recommend the Random Forest Regression: it matches the Decision Tree on both metrics, and because it averages many trees it is generally less prone to overfitting than a single tree, making it the most reliable model for predictions.