In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw flight-price CSV into `df`.
# NOTE(review): this is an absolute, machine-specific Windows path; the cell
# only runs where that exact file exists — consider a configurable data dir.
file_path = r"C:\Users\OMEN\OneDrive\Desktop\Flight_Price.csv"
df = pd.read_csv(file_path)

In [3]:
# Discard every row that has at least one missing value (mutates `df` in place).
df.dropna(axis=0, how='any', inplace=True)

In [4]:
# Parse the day-month-year journey strings once, store the parsed column back,
# and keep a plain calendar-date copy in 'Journey_Date'.
journey_parsed = pd.to_datetime(df['Date_of_Journey'], format='%d-%m-%Y')
df['Date_of_Journey'] = journey_parsed
df['Journey_Date'] = journey_parsed.dt.date


In [5]:
# 'Journey_Date' now carries the date, so the parsed source column is removed in place.
df.drop('Date_of_Journey', axis=1, inplace=True)

In [6]:
# Example: applying to the actual DataFrame column 'Arrival_Time'
def extract_arrival_parts(value):
    """Split a raw arrival string into its date, hour and minute parts.

    Two layouts are recognised (whitespace around the value is ignored):

    * ``"<date> <HH:MM>"`` - the first token is returned unchanged as the date.
    * ``"<HH:MM>"``        - no date token, so the date part is ``np.nan``.

    Parameters
    ----------
    value : str
        Raw cell content. Anything that is not a ``str`` (for example a
        missing value) is treated as unparseable.

    Returns
    -------
    pd.Series
        Three positional values ``[date_str, hour, minute]``. All three are
        ``np.nan`` when the value is not a string, has an unexpected number of
        tokens, has a time part that is not two ``:``-separated integers, or
        has an hour outside 0-23 / minute outside 0-59.
    """
    # Non-string input has no .split(); handle it explicitly instead of
    # relying on a catch-all exception handler.
    if not isinstance(value, str):
        return pd.Series([np.nan, np.nan, np.nan])

    # str.split() with no separator already ignores leading/trailing whitespace.
    parts = value.split()

    if len(parts) == 2:
        # Case: Date + Time
        date_str, time_str = parts
    elif len(parts) == 1 and ':' in parts[0]:
        # Case: Only Time
        date_str, time_str = np.nan, parts[0]
    else:
        return pd.Series([np.nan, np.nan, np.nan])

    try:
        # ValueError covers both non-numeric fields and a wrong field count.
        hour, minute = map(int, time_str.split(':'))
    except ValueError:
        return pd.Series([np.nan, np.nan, np.nan])

    # Reject impossible clock values such as "25:99" instead of passing them on.
    if not (0 <= hour <= 23 and 0 <= minute <= 59):
        return pd.Series([np.nan, np.nan, np.nan])

    return pd.Series([date_str, hour, minute])

# Run the parser over every Arrival_Time value and store the three returned
# parts as new columns.
arrival_columns = ['Arrival_Date', 'Arrival_Hour', 'Arrival_Minute']
df[arrival_columns] = df['Arrival_Time'].apply(extract_arrival_parts)


In [7]:
# Preview the first rows to check the new Arrival_Date / Arrival_Hour /
# Arrival_Minute columns.
df.head()

Unnamed: 0,Airline,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Journey_Date,Arrival_Date,Arrival_Hour,Arrival_Minute
0,IndiGo,Banglore,New Delhi,BLR ? DEL,22:20,23-03-2025 01:10,2h 50m,non-stop,No info,3897,2019-03-24,23-03-2025,1.0,10.0
1,Air India,Kolkata,Banglore,CCU ? IXR ? BBI ? BLR,05:50,13:15,7h 25m,2 stops,No info,7662,2019-05-01,,13.0,15.0
2,Jet Airways,Delhi,Cochin,DEL ? LKO ? BOM ? COK,09:25,10-06-2025 04:25,19h,2 stops,No info,13882,2019-06-09,10-06-2025,4.0,25.0
3,IndiGo,Kolkata,Banglore,CCU ? NAG ? BLR,18:05,23:30,5h 25m,1 stop,No info,6218,2019-05-12,,23.0,30.0
4,IndiGo,Banglore,New Delhi,BLR ? NAG ? DEL,16:50,21:35,4h 45m,1 stop,No info,13302,2019-03-01,,21.0,35.0


In [8]:
# Extract Dep_Hour and Dep_Minute from Dep_Time column
def extract_dep_parts(time_str):
    """Split a departure time of the form ``"HH:MM"`` into hour and minute.

    Parameters
    ----------
    time_str : str
        Raw cell content; surrounding whitespace is ignored. Anything that is
        not a ``str`` (for example a missing value) is treated as unparseable.

    Returns
    -------
    pd.Series
        Two positional values ``[hour, minute]``. Both are ``np.nan`` when the
        value is not a string, is not exactly two ``:``-separated integers, or
        has an hour outside 0-23 / minute outside 0-59.
    """
    # Explicit type guard replaces the former bare ``except:``, which also
    # swallowed KeyboardInterrupt and SystemExit.
    if not isinstance(time_str, str):
        return pd.Series([np.nan, np.nan])

    try:
        # ValueError covers both non-numeric fields and a wrong field count.
        hour, minute = map(int, time_str.strip().split(':'))
    except ValueError:
        return pd.Series([np.nan, np.nan])

    # Reject impossible clock values such as "24:00" or "10:75".
    if not (0 <= hour <= 23 and 0 <= minute <= 59):
        return pd.Series([np.nan, np.nan])

    return pd.Series([hour, minute])

# Run the parser over every Dep_Time value and store the hour and minute it
# returns as two new columns.
departure_columns = ['Dep_Hour', 'Dep_Minute']
df[departure_columns] = df['Dep_Time'].apply(extract_dep_parts)


In [9]:
# Preview the first rows to check the new Dep_Hour / Dep_Minute columns.
df.head()

Unnamed: 0,Airline,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Journey_Date,Arrival_Date,Arrival_Hour,Arrival_Minute,Dep_Hour,Dep_Minute
0,IndiGo,Banglore,New Delhi,BLR ? DEL,22:20,23-03-2025 01:10,2h 50m,non-stop,No info,3897,2019-03-24,23-03-2025,1.0,10.0,22,20
1,Air India,Kolkata,Banglore,CCU ? IXR ? BBI ? BLR,05:50,13:15,7h 25m,2 stops,No info,7662,2019-05-01,,13.0,15.0,5,50
2,Jet Airways,Delhi,Cochin,DEL ? LKO ? BOM ? COK,09:25,10-06-2025 04:25,19h,2 stops,No info,13882,2019-06-09,10-06-2025,4.0,25.0,9,25
3,IndiGo,Kolkata,Banglore,CCU ? NAG ? BLR,18:05,23:30,5h 25m,1 stop,No info,6218,2019-05-12,,23.0,30.0,18,5
4,IndiGo,Banglore,New Delhi,BLR ? NAG ? DEL,16:50,21:35,4h 45m,1 stop,No info,13302,2019-03-01,,21.0,35.0,16,50


In [10]:
# The raw time strings have been split into parts above; remove them in place.
df.drop(['Dep_Time', 'Arrival_Time'], axis='columns', inplace=True)

In [11]:
# Preview the first rows after Dep_Time and Arrival_Time were dropped.
df.head()


Unnamed: 0,Airline,Source,Destination,Route,Duration,Total_Stops,Additional_Info,Price,Journey_Date,Arrival_Date,Arrival_Hour,Arrival_Minute,Dep_Hour,Dep_Minute
0,IndiGo,Banglore,New Delhi,BLR ? DEL,2h 50m,non-stop,No info,3897,2019-03-24,23-03-2025,1.0,10.0,22,20
1,Air India,Kolkata,Banglore,CCU ? IXR ? BBI ? BLR,7h 25m,2 stops,No info,7662,2019-05-01,,13.0,15.0,5,50
2,Jet Airways,Delhi,Cochin,DEL ? LKO ? BOM ? COK,19h,2 stops,No info,13882,2019-06-09,10-06-2025,4.0,25.0,9,25
3,IndiGo,Kolkata,Banglore,CCU ? NAG ? BLR,5h 25m,1 stop,No info,6218,2019-05-12,,23.0,30.0,18,5
4,IndiGo,Banglore,New Delhi,BLR ? NAG ? DEL,4h 45m,1 stop,No info,13302,2019-03-01,,21.0,35.0,16,50


In [12]:
# Put the columns into a fixed order: airline and journey date, route
# information, departure parts, arrival parts, trip details, and the price last.
desired_order = [
    'Airline', 'Journey_Date',
    'Source', 'Destination', 'Route',
    'Dep_Hour', 'Dep_Minute',
    'Arrival_Date', 'Arrival_Hour', 'Arrival_Minute',
    'Duration', 'Total_Stops', 'Additional_Info',
    'Price',
]

# Selecting by label list raises KeyError if any listed column is missing.
df = df.loc[:, desired_order]

In [13]:
# Parse the day-month-year arrival strings into datetimes; with errors='coerce'
# anything that cannot be parsed (including missing values) becomes NaT.
df['Arrival_Date'] = pd.to_datetime(
    df['Arrival_Date'],
    errors='coerce',
    format='%d-%m-%Y',
)

In [14]:
# Preview the first rows after reordering and parsing Arrival_Date.
df.head()

Unnamed: 0,Airline,Journey_Date,Source,Destination,Route,Dep_Hour,Dep_Minute,Arrival_Date,Arrival_Hour,Arrival_Minute,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,2019-03-24,Banglore,New Delhi,BLR ? DEL,22,20,2025-03-23,1.0,10.0,2h 50m,non-stop,No info,3897
1,Air India,2019-05-01,Kolkata,Banglore,CCU ? IXR ? BBI ? BLR,5,50,NaT,13.0,15.0,7h 25m,2 stops,No info,7662
2,Jet Airways,2019-06-09,Delhi,Cochin,DEL ? LKO ? BOM ? COK,9,25,2025-06-10,4.0,25.0,19h,2 stops,No info,13882
3,IndiGo,2019-05-12,Kolkata,Banglore,CCU ? NAG ? BLR,18,5,NaT,23.0,30.0,5h 25m,1 stop,No info,6218
4,IndiGo,2019-03-01,Banglore,New Delhi,BLR ? NAG ? DEL,16,50,NaT,21.0,35.0,4h 45m,1 stop,No info,13302


In [15]:
# Preview the last rows of the same frame.
df.tail()

Unnamed: 0,Airline,Journey_Date,Source,Destination,Route,Dep_Hour,Dep_Minute,Arrival_Date,Arrival_Hour,Arrival_Minute,Duration,Total_Stops,Additional_Info,Price
10678,Air Asia,2019-04-09,Kolkata,Banglore,CCU ? BLR,19,55,NaT,22.0,25.0,2h 30m,non-stop,No info,4107
10679,Air India,2019-04-27,Kolkata,Banglore,CCU ? BLR,20,45,NaT,23.0,20.0,2h 35m,non-stop,No info,4145
10680,Jet Airways,2019-04-27,Banglore,Delhi,BLR ? DEL,8,20,NaT,11.0,20.0,3h,non-stop,No info,7229
10681,Vistara,2019-03-01,Banglore,New Delhi,BLR ? DEL,11,30,NaT,14.0,10.0,2h 40m,non-stop,No info,12648
10682,Air India,2019-05-09,Delhi,Cochin,DEL ? GOI ? BOM ? COK,10,55,NaT,19.0,15.0,8h 20m,2 stops,No info,11753


In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

In [14]:
# Load preprocessed dataset (rebinds `df`, replacing any earlier frame).
# NOTE(review): no visible cell writes Processed_Flight_Data.csv and `df_final`
# is not defined in the visible cells — confirm how this file is produced.
# NOTE(review): absolute, machine-specific path.
df = pd.read_csv(r"C:\Users\OMEN\OneDrive\Desktop\Processed_Flight_Data.csv")  # If saved, else use df_final

In [15]:
# Remove the Total_Stops column when it is present; errors="ignore" makes the
# call a no-op when it is absent. Only this one column is handled here.
df.drop(columns=["Total_Stops"], errors="ignore", inplace=True)


In [16]:
# Price is the prediction target; every other remaining column is a feature.
y = df["Price"]
X = df.drop("Price", axis=1)

In [17]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [18]:
# Create a LinearRegression estimator and fit it on the training split.
lr_model = LinearRegression()
lr_model.fit(X=X_train, y=y_train)

In [19]:
# Predict prices for the held-out rows with the fitted linear model.
y_pred = lr_model.predict(X=X_test)

In [24]:
# Score the held-out predictions: RMSE (square root of the MSE) and R².
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

In [25]:
# Print both metrics: RMSE to two decimals, R² to four.
for report_line in (
    f"Linear Regression RMSE: ₹{rmse:.2f}",
    f"Linear Regression R² Score: {r2:.4f}",
):
    print(report_line)

Linear Regression RMSE: ₹2863.63
Linear Regression R² Score: 0.6197


In [26]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

In [27]:
# Load preprocessed dataset again so this section starts from the file on disk
# (rebinds `df`).
# NOTE(review): no visible cell writes Processed_Flight_Data.csv and `df_final`
# is not defined in the visible cells — confirm how this file is produced.
# NOTE(review): absolute, machine-specific path.
df = pd.read_csv(r"C:\Users\OMEN\OneDrive\Desktop\Processed_Flight_Data.csv")  # If saved, else use df_final

In [28]:
# Collect the object-dtype columns first, announce each one, then remove them
# all with a single in-place drop (the frame is not modified while it is
# being scanned).
object_columns = [col for col in df.columns if df[col].dtype == 'object']
for col in object_columns:
    print(f"Dropping non-numeric column: {col}")
df.drop(columns=object_columns, inplace=True)


Dropping non-numeric column: Total_Stops


In [29]:
# Separate the Price target from the remaining feature columns.
target_column = "Price"
y = df[target_column]
X = df.drop(columns=[target_column])

In [30]:
# 80/20 train/test split with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [31]:
# Build a 100-tree random forest with a fixed seed and fit it on the training split.
rf_model = RandomForestRegressor(random_state=42, n_estimators=100)
rf_model.fit(X=X_train, y=y_train)

In [33]:
# Predict on the held-out rows, then compute RMSE (square root of the MSE) and R².
y_pred = rf_model.predict(X=X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

In [34]:
# Print both metrics: RMSE to two decimals, R² to four.
for report_line in (
    f"🌲 Random Forest RMSE: ₹{rmse:.2f}",
    f"🌲 Random Forest R² Score: {r2:.4f}",
):
    print(report_line)

🌲 Random Forest RMSE: ₹1964.48
🌲 Random Forest R² Score: 0.8210


In [35]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
import numpy as np

In [36]:
# Load preprocessed dataset again so this section starts from the file on disk
# (rebinds `df`).
# NOTE(review): no visible cell writes Processed_Flight_Data.csv and `df_final`
# is not defined in the visible cells — confirm how this file is produced.
# NOTE(review): absolute, machine-specific path.
df = pd.read_csv(r"C:\Users\OMEN\OneDrive\Desktop\Processed_Flight_Data.csv")  # If saved, else use df_final

In [37]:
# Find every object-dtype column up front, log each name, and drop them
# together in one in-place call.
non_numeric_columns = []
for col in df.columns:
    if df[col].dtype == 'object':
        non_numeric_columns.append(col)
        print(f"Dropping non-numeric column: {col}")
df.drop(columns=non_numeric_columns, inplace=True)

Dropping non-numeric column: Total_Stops


In [38]:
# Everything except Price is used as input; Price is what the model predicts.
X = df.drop(labels="Price", axis="columns")
y = df.loc[:, "Price"]

In [39]:
# Reserve one fifth of the rows for testing, seeded for a repeatable split.
split_kwargs = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)

In [40]:
# Configure a 100-round XGBoost regressor (learning rate 0.1, fixed seed) and
# fit it on the training split.
xgb_model = XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
)
xgb_model.fit(X=X_train, y=y_train)

In [42]:
# Predict on the held-out rows, then compute RMSE (square root of the MSE) and R².
y_pred = xgb_model.predict(X=X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

In [43]:
# Print both metrics: RMSE to two decimals, R² to four.
for report_line in (
    f"⚡ XGBoost RMSE: ₹{rmse:.2f}",
    f"⚡ XGBoost R² Score: {r2:.4f}",
):
    print(report_line)

⚡ XGBoost RMSE: ₹1828.12
⚡ XGBoost R² Score: 0.8450
