# **Linear Regression**


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import pandas as pd
import random
random.seed(42)

data = pd.read_csv("Clean_Dataset.csv")

# Drop unnecessary columns and select features
data = data.drop(columns=["Unnamed: 0", "flight"])

# One-hot encode categorical variables
categorical_cols = ["airline", "source_city", "departure_time", "stops", "arrival_time", "destination_city", "class"]
encoder = OneHotEncoder(drop='first', sparse_output=False)
encoded_categorical = encoder.fit_transform(data[categorical_cols])

# Combine encoded categorical variables with numerical ones
numerical_cols = ["duration", "days_left"]
X = np.hstack([encoded_categorical, data[numerical_cols].values])
y = data["price"]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions and evaluate the model
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

mse, r2

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import numpy as np
import pandas as pd

# Load the dataset
data = pd.read_csv("Clean_Dataset.csv")

# Drop unnecessary columns and select features
data = data.drop(columns=["Unnamed: 0", "flight"])

# Separate features and target
categorical_cols = ["airline", "source_city", "departure_time", "stops", "arrival_time", "destination_city", "class"]
numerical_cols = ["duration", "days_left"]
target_col = "price"

# Log transform the target variable to reduce skewness
data[target_col] = np.log1p(data[target_col])

# Define preprocessing pipeline for numerical and categorical features
numerical_transformer = Pipeline(steps=[
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("onehot", OneHotEncoder(drop="first", sparse_output=False))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, numerical_cols),
        ("cat", categorical_transformer, categorical_cols)
    ]
)

# Define function to evaluate models
def evaluate_model(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(np.expm1(y_test), np.expm1(y_pred))  # Reverse log transformation
    r2 = r2_score(np.expm1(y_test), np.expm1(y_pred))
    return mse, r2

# Split the data into train and test sets
X = data[categorical_cols + numerical_cols]
y = data[target_col]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create pipelines for Ridge, Lasso, and Random Forest
ridge_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", Ridge())
])

lasso_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", Lasso())
])

rf_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", RandomForestRegressor(random_state=42, n_estimators=100))
])

# Hyperparameter tuning for Ridge and Lasso
ridge_param_grid = {"model__alpha": [0.1, 1, 10, 100, 200]}
lasso_param_grid = {"model__alpha": [0.01, 0.1, 1, 10, 100]}

ridge_cv = GridSearchCV(ridge_pipeline, ridge_param_grid, scoring="neg_mean_squared_error", cv=5)
lasso_cv = GridSearchCV(lasso_pipeline, lasso_param_grid, scoring="neg_mean_squared_error", cv=5)

# Evaluate models
ridge_mse, ridge_r2 = evaluate_model(ridge_cv, X_train, X_test, y_train, y_test)
lasso_mse, lasso_r2 = evaluate_model(lasso_cv, X_train, X_test, y_train, y_test)
rf_mse, rf_r2 = evaluate_model(rf_pipeline, X_train, X_test, y_train, y_test)

# Print results
print("Best Ridge Alpha:", ridge_cv.best_params_["model__alpha"])
print("Ridge MSE:", ridge_mse)
print("Ridge R-squared:", ridge_r2)

print("Best Lasso Alpha:", lasso_cv.best_params_["model__alpha"])
print("Lasso MSE:", lasso_mse)
print("Lasso R-squared:", lasso_r2)

print("Random Forest MSE:", rf_mse)
print("Random Forest R-squared:", rf_r2)

# Select 10 random samples from the test set
random_indices = random.sample(range(len(X_test)), 10)
X_sample = X_test.iloc[random_indices]
y_sample_actual = np.expm1(y_test.iloc[random_indices])  # Reverse log transform

# Fit the Random Forest pipeline and predict prices
rf_pipeline.fit(X_train, y_train)
y_sample_predicted = np.expm1(rf_pipeline.predict(X_sample))  # Reverse log transform

# Create a DataFrame to display actual vs predicted prices
predicted_vs_actual = pd.DataFrame({
    "Actual Price": y_sample_actual.values,
    "Predicted Price": y_sample_predicted
})

predicted_vs_actual

ValueError: Input X contains NaN.
LinearRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

# **Random Forest**

In [None]:
import pandas as pd  # For data manipulation
import numpy as np  # For numerical operations
from sklearn.model_selection import train_test_split  # For splitting the dataset
from sklearn.ensemble import RandomForestRegressor  # For Random Forest model
from sklearn.metrics import mean_absolute_error, mean_squared_error  # For model evaluation
from sklearn.preprocessing import LabelEncoder  # For encoding categorical variables

# Load the dataset
file_path = "Clean_Dataset.csv"
df = pd.read_csv(file_path)

# Checking the first few rows of the dataset to understand its structure
df.head()

# Print column names to check
print(df.columns)

# Calculate the average price for each airline
avg_price_by_airline = df.groupby('airline')['price'].mean()

# Sort by price in descending order to find the most expensive airline
most_expensive_airline = avg_price_by_airline.sort_values(ascending=False).head(1)

# Print the result
print("Most Expensive Airline:")
print(most_expensive_airline)

# Subset the dataset to include only Vistara flights
vistara_df = df[df['airline'] == 'Vistara']

# Check the first few rows of the subsetted data
vistara_df.head()

# Drop the 'Unnamed: 0' column from the original dataframe
vistara_df = vistara_df.drop(columns=['Unnamed: 0'])

# One-hot encoding categorical variables
vistara_df_encoded = pd.get_dummies(vistara_df, drop_first=True)

# Handle missing values (if any)
vistara_df_encoded = vistara_df_encoded.dropna()

# Define features (X) and target variable (y)
X_vistara = vistara_df_encoded.drop('price', axis=1)  # All features except 'price'
y_vistara = vistara_df_encoded['price']  # Target variable: price

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Split the data into training and testing sets (80% train, 20% test)
X_train_vistara, X_test_vistara, y_train_vistara, y_test_vistara = train_test_split(X_vistara, y_vistara, test_size=0.2, random_state=42)

# Initialize the Random Forest Regressor model
rf_vistara_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
rf_vistara_model.fit(X_train_vistara, y_train_vistara)

# Make predictions
y_pred_vistara = rf_vistara_model.predict(X_test_vistara)

# Evaluate the model (Mean Squared Error)
mse_vistara = mean_squared_error(y_test_vistara, y_pred_vistara)
print(f"Mean Squared Error for Vistara: {mse_vistara}")

# Feature importance for Vistara flight ticket price prediction
importances_vistara = rf_vistara_model.feature_importances_

# Create a DataFrame with feature names and their importance scores
feature_importance_vistara_df = pd.DataFrame({
    'Feature': X_vistara.columns,
    'Importance': importances_vistara
})

# Sort the features by importance
feature_importance_vistara_df = feature_importance_vistara_df.sort_values(by='Importance', ascending=False)

# Print top features
print("Top Features Influencing Vistara Flight Price:")
print(feature_importance_vistara_df.head())

from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor

# Define a smaller set of hyperparameters to search over
param_grid = {
    'n_estimators': [100, 200],  # Number of trees in the forest
    'max_depth': [None, 10, 20],  # Depth of each tree (controls overfitting)
    'min_samples_split': [2, 10],  # Minimum samples to split an internal node
    'min_samples_leaf': [1, 2],  # Minimum samples at each leaf node
    'bootstrap': [True, False]  # Whether to sample data with replacement
}

# Initialize Random Forest model
rf_model = RandomForestRegressor(random_state=77)

# Initialize RandomizedSearchCV to search over parameter grid
random_search = RandomizedSearchCV(estimator=rf_model, param_distributions=param_grid,
                                   n_iter=5, cv=3, verbose=2, random_state=77, n_jobs=-1)

# Fit the model to the data
random_search.fit(X_train_vistara, y_train_vistara)

# Output the best parameters found by the search
print(f"Best Hyperparameters: {random_search.best_params_}")

# Get the best model from the search
best_rf_model = random_search.best_estimator_

# Evaluate the best model on the test data
y_pred_best = best_rf_model.predict(X_test_vistara)

from sklearn.metrics import mean_squared_error

# Calculate Mean Squared Error to evaluate the model's accuracy
mse_best = mean_squared_error(y_test_vistara, y_pred_best)

# Print the MSE
print(f"Mean Squared Error after Hyperparameter Tuning: {mse_best}")

import pandas as pd

# Create a DataFrame to compare actual vs predicted prices for the test data
comparison_df = pd.DataFrame({
    'Actual Price': y_test_vistara,  # Actual prices from the test data
    'Predicted Price': y_pred_best   # Predicted prices from the model
})

# Display the comparison of actual and predicted prices
print(comparison_df.head())  # Print the first few rows of the comparison

import matplotlib.pyplot as plt

# Scatter plot of actual vs predicted prices
plt.figure(figsize=(10, 6))
plt.scatter(y_test_vistara, y_pred_best, color='blue', alpha=0.6)  # Actual vs Predicted
plt.plot([min(y_test_vistara), max(y_test_vistara)], [min(y_test_vistara), max(y_test_vistara)], color='red', linestyle='--')  # Ideal line
plt.title("Actual vs Predicted Prices for Vistara Flights")
plt.xlabel("Actual Prices")
plt.ylabel("Predicted Prices")
plt.show()

from sklearn.metrics import mean_absolute_error, mean_squared_error

# Calculate MAE
mae = mean_absolute_error(y_test_vistara, y_pred_best)
print(f"Mean Absolute Error (MAE): {mae}")

print(best_rf_model)

from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

# Select a single tree from the Random Forest
plt.figure(figsize=(20, 10))
plot_tree(best_rf_model.estimators_[0], filled=True, feature_names=X_train_vistara.columns, rounded=True)
plt.show()

Index(['Unnamed: 0', 'airline', 'flight', 'source_city', 'departure_time',
       'stops', 'arrival_time', 'destination_city', 'class', 'duration',
       'days_left', 'price'],
      dtype='object')
Most Expensive Airline:
airline
Vistara    30396.536302
Name: price, dtype: float64
Mean Squared Error for Vistara: 10472040.299447142
Top Features Influencing Vistara Flight Price:
                    Feature  Importance
155           class_Economy    0.861760
0                  duration    0.061344
1                 days_left    0.020944
151  destination_city_Delhi    0.005136
135       source_city_Delhi    0.004289
Fitting 3 folds for each of 5 candidates, totalling 15 fits


# **Gradient Boost Regressor**

In [None]:
import pandas as pd

# Load the dataset again
file_path = "Clean_Dataset.csv"
df = pd.read_csv(file_path)

# Step 1: Apply filters
df_filtered = df[
    (df["stops"] == "zero") &  # Direct flights only
    (df["source_city"].isin(["Delhi", "Mumbai"])) &  # Major metro cities
    (df["class"] == "Economy")  # Economy class only
]

# Step 2: Limit to 50,000 rows if more are available
df_filtered = df_filtered.sample(n=min(50000, len(df_filtered)), random_state=42)

# Step 3: Drop unnecessary columns
df_filtered = df_filtered.drop(columns=["Unnamed: 0", "flight"])

# Step 4: Encode categorical variables using Label Encoding
from sklearn.preprocessing import LabelEncoder, StandardScaler

categorical_columns = ["airline", "source_city", "departure_time", "stops", "arrival_time", "destination_city", "class"]
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    df_filtered[col] = le.fit_transform(df_filtered[col])
    label_encoders[col] = le

# Step 5: Define features and target variable
X = df_filtered.drop(columns=["price"])
y = df_filtered["price"]

# Step 6: Scale numerical features
scaler = StandardScaler()
X[["duration", "days_left"]] = scaler.fit_transform(X[["duration", "days_left"]])

# Step 7: Train-test split (80% train, 20% test)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=45)

# Step 8: Train Gradient Boosting Regressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

gbr = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=45)
gbr.fit(X_train, y_train)


# Step 9: Make predictions
y_pred_gbr = gbr.predict(X_test)

# Step 10: Evaluate model performance
def evaluate_model(y_true, y_pred, model_name):
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    return {"Model": model_name, "MAE": mae, "MSE": mse, "R² Score": r2}

gbr_results = evaluate_model(y_test, y_pred_gbr, "Gradient Boosting Regressor (Filtered Data)")

# Display results using pandas directly
gbr_results_df

# **Support Vector Machine**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import LinearSVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

df = pd.read_csv("Clean_Dataset.csv")

df.columns

df = df.drop(columns = ['flight', 'Unnamed: 0'], axis = 1 )

# Define important trained variables (features) and target variable
trained_variables = ['airline', 'source_city', 'departure_time', 'stops', 'destination_city', 'class', 'duration', 'days_left']
target_variable = 'price'

# Apply one-hot encoding to categorical features
categorical_features = ['airline', 'source_city', 'departure_time', 'stops', 'destination_city', 'class','arrival_time']
ohe = OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore')
encoded_features = ohe.fit_transform(df[categorical_features])
encoded_df = pd.DataFrame(encoded_features, columns=ohe.get_feature_names_out(categorical_features))

# Reset index and ensure proper merging
df = df.drop(columns=categorical_features).reset_index(drop=True)
df = pd.concat([df, encoded_df], axis=1)

# Convert all columns to float64 to avoid type mismatch
df = df.astype('float64')

# Apply log transformation to the target variable
df[target_variable] = np.log1p(df[target_variable])

# Prepare dataset
X = df.drop(columns=[target_variable]).values
y = df[target_variable].values

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the dataset
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define an optimized Linear SVR model
svm_model = LinearSVR(C=1.0, max_iter=1000, random_state=42)

svm_model.fit(X_train, y_train)

# Predictions
y_pred = svm_model.predict(X_test)

# Evaluation
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse:.2f}")
print(f"R-squared Score: {r2:.2f}")

import matplotlib.pyplot as plt
import pandas as pd

# Add predictions to the test set for visualization
test_results = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})

# Function to plot predicted vs actual values
def plot_predicted_vs_actual(df, actual_column, predicted_column):
    plt.figure(figsize=(10, 6))
    plt.scatter(df[actual_column], df[predicted_column], alpha=0.6, color='blue', label='Predicted vs Actual')
    plt.plot([df[actual_column].min(), df[actual_column].max()],
             [df[actual_column].min(), df[actual_column].max()],
             color='red', lw=2, label='Ideal Fit')
    plt.title('Predicted vs Actual Values')
    plt.xlabel('Actual Values')
    plt.ylabel('Predicted Values')
    plt.legend()
    plt.grid(alpha=0.3)
    plt.show()

# Plot the predicted vs actual values
plot_predicted_vs_actual(test_results, 'Actual', 'Predicted')

import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report
import numpy as np

# Convert continuous predictions to binary class labels
y_pred_binary = np.where(y_pred >= 9, 1, 0)
y_test_binary = np.where(y_test >= 9, 1, 0)

# Generate confusion matrix
conf_matrix = confusion_matrix(y_test_binary, y_pred_binary)

# Function to plot the confusion matrix
def plot_confusion_matrix(conf_matrix, class_names):
    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    plt.show()


# Plot confusion matrix (code from earlier)
plot_confusion_matrix(conf_matrix, class_names=['Class 0', 'Class 1'])

# Print classification report
print("Classification Report:\n", classification_report(y_test_binary, y_pred_binary))


# **Neural Network**

In [None]:
#importing Required Libraries
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.impute import SimpleImputer
from tensorflow import keras
from tensorflow.keras.layers import Dropout,Input
from tensorflow.keras.regularizers import l2
from keras.models import Sequential
from keras.layers import Dense
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, BatchNormalization, LeakyReLU, Dropout

df = pd.read_csv("Clean_Dataset.csv")

df['price'].mean()
df = df.drop(columns = ['flight', 'Unnamed: 0'], axis = 1 )

# Define important trained variables (features) and target variable
trained_variables = ['airline', 'source_city', 'departure_time', 'stops', 'destination_city', 'class', 'duration', 'days_left']
target_variable = 'price'

# Apply one-hot encoding to categorical features
categorical_features = ['airline', 'source_city', 'departure_time', 'stops', 'destination_city', 'class','arrival_time']
ohe = OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore')
encoded_features = ohe.fit_transform(df[categorical_features])
encoded_df = pd.DataFrame(encoded_features, columns=ohe.get_feature_names_out(categorical_features))

# Combine encoded categorical features with numerical features
df = df.drop(columns=categorical_features)
df = pd.concat([df, encoded_df], axis=1)

# Apply log transformation to the target variable
df[target_variable] = np.log1p(df[target_variable])

# Prepare dataset
X = df.drop(columns=[target_variable]).values
y = df[target_variable].values

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the dataset
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define an improved neural network model
model = Sequential([
    Dense(256, input_shape=(X_train.shape[1],)),
    BatchNormalization(),
    LeakyReLU(),
    Dropout(0.2),
    Dense(128),
    BatchNormalization(),
    LeakyReLU(),
    Dropout(0.2),
    Dense(64),
    BatchNormalization(),
    LeakyReLU(),
    Dropout(0.2),
    Dense(1, activation='linear')  # Regression output
])

# Define an improved neural network model
model = Sequential([
    Dense(256, kernel_regularizer=l2(0.001), input_shape=(X_train.shape[1],)),
    BatchNormalization(),
    LeakyReLU(),
    Dropout(0.2),
    Dense(128, kernel_regularizer=l2(0.001)),
    BatchNormalization(),
    LeakyReLU(),
    Dropout(0.2),
    Dense(64, kernel_regularizer=l2(0.001)),
    BatchNormalization(),
    LeakyReLU(),
    Dropout(0.2),
    Dense(1, activation='linear')  # Regression output
])

# Compile model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005), loss='mse', metrics=['mae'])

# Model summary
model.summary()

# Train the model
# Train the model with Early Stopping
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=100, batch_size=64, callbacks=[callback])

# Evaluate the model
test_loss, test_mae = model.evaluate(X_test, y_test, verbose=1)
print(f"Test MAE: {test_mae}")

# Plot training and validation loss
plt.figure(figsize=(10, 5))
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Training vs Validation Loss')
plt.legend()
plt.show()


# Plot training and validation MAE
plt.figure(figsize=(10, 5))
plt.plot(history.history['mae'], label='Training MAE')
plt.plot(history.history['val_mae'], label='Validation MAE')
plt.xlabel('Epochs')
plt.ylabel('MAE')
plt.title('Training vs Validation MAE')
plt.legend()
plt.show()

