In [3]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import pandas as pd

# Read the dataset and discard the two columns that are not used as features.
data = pd.read_csv("Clean_Dataset.csv").drop(columns=["Unnamed: 0", "flight"])

# Column groups: categoricals are one-hot encoded, numericals are used as-is.
categorical_cols = [
    "airline",
    "source_city",
    "departure_time",
    "stops",
    "arrival_time",
    "destination_city",
    "class",
]
numerical_cols = ["duration", "days_left"]

# Dense one-hot encoding; drop="first" leaves out the first level of every
# categorical feature so the encoded columns are not perfectly collinear.
encoder = OneHotEncoder(drop="first", sparse_output=False)
encoded_categorical = encoder.fit_transform(data[categorical_cols])

# Feature matrix: encoded categorical columns first, raw numerical columns last.
X = np.concatenate((encoded_categorical, data[numerical_cols].values), axis=1)

# Target: the price column.
y = data["price"]

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit an ordinary least-squares model on the training portion (fit returns the estimator).
model = LinearRegression().fit(X_train, y_train)

# Score the held-out predictions with MSE and R^2, in that order.
y_pred = model.predict(X_test)
mse, r2 = (metric(y_test, y_pred) for metric in (mean_squared_error, r2_score))

# Last expression of the cell, so the notebook displays the (mse, r2) tuple.
(mse, r2)


(45720769.75632412, 0.9113048651706634)

In [8]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import numpy as np
import pandas as pd

# Read the dataset again and discard the two columns that are not used for modelling.
data = pd.read_csv("Clean_Dataset.csv").drop(columns=["Unnamed: 0", "flight"])

# Column roles: prediction target, numerical features, categorical features.
target_col = "price"
numerical_cols = ["duration", "days_left"]
categorical_cols = [
    "airline",
    "source_city",
    "departure_time",
    "stops",
    "arrival_time",
    "destination_city",
    "class",
]

# Replace the target with log(1 + price) to reduce skewness. Predictions made on
# this scale have to be mapped back with np.expm1 before they are read as prices.
data[target_col] = np.log1p(data[target_col])

# Numerical features: standardise with StandardScaler.
numerical_transformer = Pipeline([("scaler", StandardScaler())])

# Categorical features: dense one-hot encoding, leaving out the first level of each feature.
categorical_transformer = Pipeline(
    [("onehot", OneHotEncoder(drop="first", sparse_output=False))]
)

# Send each column group through its own transformer; the numerical group is listed first.
preprocessor = ColumnTransformer(
    [
        ("num", numerical_transformer, numerical_cols),
        ("cat", categorical_transformer, categorical_cols),
    ]
)

# Define function to evaluate models
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training data and score it on the test data.

    Both ``y_test`` and the predictions are passed through ``np.expm1`` before
    scoring, i.e. the targets are treated as being on a ``log1p`` scale and the
    metrics are computed on the back-transformed values.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict(X)``
        Fitted in place by this call.
    X_train, y_train
        Data the model is fitted on.
    X_test, y_test
        Data the fitted model is scored on.

    Returns
    -------
    tuple
        ``(mse, r2)`` computed on the back-transformed values.
    """
    model.fit(X_train, y_train)
    log_predictions = model.predict(X_test)

    # Reverse log transformation once, then reuse the values for both metrics.
    actual = np.expm1(y_test)
    predicted = np.expm1(log_predictions)

    return mean_squared_error(actual, predicted), r2_score(actual, predicted)

# Feature frame (categorical columns followed by numerical ones) and the
# log-transformed target.
X = data[[*categorical_cols, *numerical_cols]]
y = data[target_col]

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# One pipeline per estimator: preprocessing step first, model second.
# Note that all three pipelines reference the same `preprocessor` object.
ridge_pipeline = Pipeline([("preprocessor", preprocessor), ("model", Ridge())])

lasso_pipeline = Pipeline([("preprocessor", preprocessor), ("model", Lasso())])

rf_pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("model", RandomForestRegressor(random_state=42, n_estimators=100)),
    ]
)

# Candidate regularisation strengths for the "model" step of each linear pipeline.
ridge_param_grid = {"model__alpha": [0.1, 1, 10, 100, 200]}
lasso_param_grid = {"model__alpha": [0.01, 0.1, 1, 10, 100]}

# Grid search with cv=5, selecting alpha by negative mean squared error.
# The search scores against the target it is fitted on (the log-transformed price).
ridge_cv = GridSearchCV(
    estimator=ridge_pipeline,
    param_grid=ridge_param_grid,
    scoring="neg_mean_squared_error",
    cv=5,
)
lasso_cv = GridSearchCV(
    estimator=lasso_pipeline,
    param_grid=lasso_param_grid,
    scoring="neg_mean_squared_error",
    cv=5,
)

# Fit and score each candidate on the held-out split. The random forest is
# evaluated as-is, without a hyperparameter search.
ridge_mse, ridge_r2 = evaluate_model(
    model=ridge_cv, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test
)
lasso_mse, lasso_r2 = evaluate_model(
    model=lasso_cv, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test
)
rf_mse, rf_r2 = evaluate_model(
    model=rf_pipeline, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test
)

# Report: selected alpha (for the tuned models), then MSE and R-squared.
print("Best Ridge Alpha:", ridge_cv.best_params_["model__alpha"])
print("Ridge MSE:", ridge_mse)
print("Ridge R-squared:", ridge_r2)

print("Best Lasso Alpha:", lasso_cv.best_params_["model__alpha"])
print("Lasso MSE:", lasso_mse)
print("Lasso R-squared:", lasso_r2)

print("Random Forest MSE:", rf_mse)
print("Random Forest R-squared:", rf_r2)


Best Ridge Alpha: 0.1
Ridge MSE: 60801278.33402935
Ridge R-squared: 0.8820497203267926
Best Lasso Alpha: 0.01
Lasso MSE: 61191524.44361653
Lasso R-squared: 0.8812926698333093
Random Forest MSE: 7727635.090948485
Random Forest R-squared: 0.9850089217667035
