In [7]:
import os  # For reading the optional dataset-path override

import numpy as np  # For numerical operations
import pandas as pd  # For data manipulation
from sklearn.ensemble import RandomForestRegressor  # For Random Forest model
from sklearn.metrics import mean_absolute_error, mean_squared_error  # For model evaluation
from sklearn.model_selection import train_test_split  # For splitting the dataset
from sklearn.preprocessing import LabelEncoder  # For encoding categorical variables

In [8]:
# Load the dataset.
# The CSV location can be overridden with the CLEAN_DATASET_CSV environment
# variable so the notebook also runs on machines other than the author's.
# When the variable is unset, the original hard-coded path is used unchanged.
file_path = os.environ.get("CLEAN_DATASET_CSV", "C:/Users/HP/Downloads/Clean_Dataset.csv")
df = pd.read_csv(file_path)

# Checking the first few rows of the dataset to understand its structure
df.head()

Unnamed: 0.1,Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,0,SpiceJet,SG-8709,Delhi,Evening,zero,Night,Mumbai,Economy,2.17,1,5953
1,1,SpiceJet,SG-8157,Delhi,Early_Morning,zero,Morning,Mumbai,Economy,2.33,1,5953
2,2,AirAsia,I5-764,Delhi,Early_Morning,zero,Early_Morning,Mumbai,Economy,2.17,1,5956
3,3,Vistara,UK-995,Delhi,Morning,zero,Afternoon,Mumbai,Economy,2.25,1,5955
4,4,Vistara,UK-963,Delhi,Morning,zero,Morning,Mumbai,Economy,2.33,1,5955


In [9]:
# Print the column labels of the loaded frame so that later cells can refer to
# them by their exact names
print(df.columns)

Index(['Unnamed: 0', 'airline', 'flight', 'source_city', 'departure_time',
       'stops', 'arrival_time', 'destination_city', 'class', 'duration',
       'days_left', 'price'],
      dtype='object')


In [10]:
# Mean ticket price per airline (a Series indexed by airline)
avg_price_by_airline = df['price'].groupby(df['airline']).mean()

# Order the airlines from highest to lowest mean price and keep the top entry
ranked_by_price = avg_price_by_airline.sort_values(ascending=False)
most_expensive_airline = ranked_by_price.iloc[:1]

# Report the airline with the highest mean price
print("Most Expensive Airline:", most_expensive_airline, sep="\n")

Most Expensive Airline:
airline
Vistara    30396.536302
Name: price, dtype: float64


In [11]:
# Keep only the rows whose airline is Vistara
is_vistara = df['airline'].eq('Vistara')
vistara_df = df.loc[is_vistara]

# Preview the first few rows of the Vistara subset
vistara_df.head()

Unnamed: 0.1,Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
3,3,Vistara,UK-995,Delhi,Morning,zero,Afternoon,Mumbai,Economy,2.25,1,5955
4,4,Vistara,UK-963,Delhi,Morning,zero,Morning,Mumbai,Economy,2.33,1,5955
5,5,Vistara,UK-945,Delhi,Morning,zero,Afternoon,Mumbai,Economy,2.33,1,5955
6,6,Vistara,UK-927,Delhi,Morning,zero,Morning,Mumbai,Economy,2.08,1,6060
7,7,Vistara,UK-951,Delhi,Afternoon,zero,Evening,Mumbai,Economy,2.17,1,6060


In [12]:
# Prepare the Vistara subset for modelling: clean it, encode it, and split it
# into a feature matrix and a target vector.

# Drop the leftover 'Unnamed: 0' column from the Vistara subset.
# errors='ignore' keeps this cell re-runnable: because vistara_df is reassigned
# here, a second run would otherwise raise KeyError for the already-removed
# column.
vistara_df = vistara_df.drop(columns=['Unnamed: 0'], errors='ignore')

# Handle missing values (if any) BEFORE encoding. pd.get_dummies turns a
# missing categorical value into an all-zero dummy row (dummy_na defaults to
# False), so a dropna() placed after the encoding could no longer detect or
# remove such rows.
vistara_df_clean = vistara_df.dropna()

# One-hot encode the categorical variables; drop_first=True removes one dummy
# level per variable.
# NOTE(review): the 'flight' column (flight codes such as 'UK-995' in the
# preview) is encoded as well - confirm it is intended to be a feature.
vistara_df_encoded = pd.get_dummies(vistara_df_clean, drop_first=True)

# Define features (X) and target variable (y)
X_vistara = vistara_df_encoded.drop(columns='price')  # All features except 'price'
y_vistara = vistara_df_encoded['price']  # Target variable: price

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
(
    X_train_vistara,
    X_test_vistara,
    y_train_vistara,
    y_test_vistara,
) = train_test_split(
    X_vistara,
    y_vistara,
    test_size=0.2,
    random_state=42,
)

# Build a 100-tree Random Forest regressor and fit it on the training split
rf_vistara_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
).fit(X_train_vistara, y_train_vistara)

# Predict prices for the held-out rows
y_pred_vistara = rf_vistara_model.predict(X_test_vistara)

# Score the predictions with Mean Squared Error and report it
mse_vistara = mean_squared_error(y_true=y_test_vistara, y_pred=y_pred_vistara)
print(f"Mean Squared Error for Vistara: {mse_vistara}")

In [None]:
# Importance scores reported by the fitted Vistara model, one per feature column
importances_vistara = rf_vistara_model.feature_importances_

# Pair each feature name with its score and order the rows from most to least
# important
feature_importance_vistara_df = pd.DataFrame(
    {'Feature': X_vistara.columns, 'Importance': importances_vistara}
).sort_values(by='Importance', ascending=False)

# Show the highest-ranked features
print(
    "Top Features Influencing Vistara Flight Price:",
    feature_importance_vistara_df.head(),
    sep="\n",
)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor

# Tune the Random Forest with a small randomized hyperparameter search and
# predict test-set prices with the best model found.

# Define a smaller set of hyperparameters to search over
param_grid = {
    'n_estimators': [100, 200],  # Number of trees in the forest
    'max_depth': [None, 10, 20],  # Depth of each tree (controls overfitting)
    'min_samples_split': [2, 10],  # Minimum samples to split an internal node
    'min_samples_leaf': [1, 2],  # Minimum samples at each leaf node
    # Only bootstrap=True is searched. With bootstrap=False every tree is
    # built from the whole training set, and since max_features is left at its
    # default here the trees come out (near-)identical - spending search
    # iterations on what is effectively a single decision tree.
    'bootstrap': [True]
}

# Initialize Random Forest model
rf_model = RandomForestRegressor(random_state=77)

# Initialize RandomizedSearchCV to search over the parameter grid.
# scoring is set explicitly so that candidates are ranked by (negated) mean
# squared error - the same metric this notebook reports - instead of the
# estimator's default score.
random_search = RandomizedSearchCV(estimator=rf_model, param_distributions=param_grid,
                                   n_iter=5, cv=3, scoring='neg_mean_squared_error',
                                   verbose=2, random_state=77, n_jobs=-1)

# Fit the search on the training split only; the test split stays untouched
random_search.fit(X_train_vistara, y_train_vistara)

# Output the best parameters found by the search
print(f"Best Hyperparameters: {random_search.best_params_}")

# Get the best model from the search
best_rf_model = random_search.best_estimator_

# Predict test-set prices with the best model (metrics are computed from
# y_pred_best in later cells)
y_pred_best = best_rf_model.predict(X_test_vistara)

In [None]:
from sklearn.metrics import mean_squared_error

# Mean Squared Error of the tuned model's predictions against the test targets
mse_best = mean_squared_error(
    y_true=y_test_vistara,
    y_pred=y_pred_best,
)

# Report the score
print(f"Mean Squared Error after Hyperparameter Tuning: {mse_best}")

In [None]:
import pandas as pd

# Side-by-side table of test-set prices: the actual values (keeping their
# original row index) next to the tuned model's predictions
comparison_df = y_test_vistara.to_frame(name='Actual Price').assign(
    **{'Predicted Price': y_pred_best}
)

# Show the first few rows of the comparison
print(comparison_df.head())

In [None]:
import matplotlib.pyplot as plt

# Scatter plot of actual vs predicted prices, with a dashed y = x reference
# line marking where perfect predictions would fall.

# Compute the price bounds once with the vectorised Series methods instead of
# calling Python's built-in min()/max() four times, each of which iterates
# over the Series element by element.
price_min, price_max = y_test_vistara.min(), y_test_vistara.max()

fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test_vistara, y_pred_best, color='blue', alpha=0.6)  # Actual vs Predicted
ax.plot([price_min, price_max], [price_min, price_max], color='red', linestyle='--')  # Ideal line
ax.set_title("Actual vs Predicted Prices for Vistara Flights")
ax.set_xlabel("Actual Prices")
ax.set_ylabel("Predicted Prices")
plt.show()

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Mean Absolute Error of the tuned model's predictions against the test targets
mae = mean_absolute_error(
    y_true=y_test_vistara,
    y_pred=y_pred_best,
)
print(f"Mean Absolute Error (MAE): {mae}")

In [None]:
# Print the text representation of the best estimator returned by the
# hyperparameter search
print(best_rf_model)

In [None]:
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

# Visualise a single tree from the tuned Random Forest.
first_tree = best_rf_model.estimators_[0]

plt.figure(figsize=(20, 10))
plot_tree(
    first_tree,
    # Draw only the top levels: a fully grown forest tree has far too many
    # nodes to be legible on one figure and is very slow to render.
    max_depth=3,
    # Pass a plain list: some scikit-learn releases validate feature_names as
    # a list and reject a pandas Index.
    feature_names=list(X_train_vistara.columns),
    filled=True,
    rounded=True,
)
plt.show()