In [1]:
# Parameters
# Route, date window and event label for this run.
# Every name assigned here is assigned again by the next cell, so the values
# below are replaced before any later cell can read them.
# NOTE(review): this looks like the cell papermill generates for injected
# parameters. papermill places that cell at the very top when no cell carries
# the `parameters` tag - confirm the defaults cell is tagged so injected values
# land after the defaults instead of before them.
departure = "BUR"
arrival = "LAS"
start_date = "2025-04-15"
end_date = "2025-04-22"
event = "Foreign/Domestic Conflict"


In [2]:
# Parameters (required by papermill) - DO NOT MODIFY
# Default values for the five run parameters (airport codes, ISO-formatted
# date strings, and an event label where the string "None" stands for no event).
# NOTE(review): this cell runs after the "# Parameters" cell above and rebinds
# the same five names, so whatever that cell set is overwritten here. If that
# cell holds the caller's injected values, they never take effect - confirm
# this cell carries papermill's `parameters` tag.
departure = "BOS"
arrival = "EWR"
start_date = "2025-04-14"
end_date = "2025-04-30"
event = "None"

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# 1. Load the data
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so a column cannot end up with mixed types.
# NOTE(review): the warning echoed in this cell's saved output points at this
# read_csv line and is presumably the mixed-type DtypeWarning - confirm. If
# memory becomes a concern, pass an explicit dtype= mapping instead.
df = pd.read_csv(
    'visualizing-airfare-trends-backend/airline_data.csv',
    low_memory=False,
)

# Parameters (required by papermill) - DO NOT MODIFY
# These five assignments repeat the defaults cell earlier in the notebook
# verbatim; none of the names is read by the cells visible here.
# NOTE(review): because this copy sits inside the training cell, it rebinds the
# parameters to the defaults every time the cell runs, which would overwrite
# any values injected before this point. Consider keeping the defaults in one
# place only - confirm against the papermill setup before removing.
departure = "BOS"
arrival = "EWR"
start_date = "2025-04-14"
end_date = "2025-04-30"
event = "None"

# 2. Declare the modelling columns
target = 'fare'
categorical_features = ['airport_1', 'airport_2']
numeric_features = [
    'nsmiles',
    'passengers',
    'large_ms',
    'lf_ms',
    'fare_lg',
    'fare_low',
    'Year',
    'quarter',
]

# 3. Keep only complete rows
# A row is discarded when the target or any selected feature is missing;
# one dropna over all of those columns removes the same rows as dropping on
# the target first and on the features afterwards.
df = df.dropna(subset=[target] + numeric_features + categorical_features)

# Feature matrix (categoricals first, then numerics) and target vector
X = df[categorical_features + numeric_features]
y = df[target]

# 4. Train/Test Split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 5. Build Preprocessing Pipeline
# - airports are one-hot encoded; categories not seen during fit are ignored
#   instead of raising
# - numeric columns pass through a mean imputer (a safety net: rows with
#   missing feature values were already dropped)
preprocessor = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ('num', SimpleImputer(strategy='mean'), numeric_features),
])

# 6. Build the Final Pipeline: preprocessing followed by a Ridge regressor
model = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', Ridge(alpha=1.0))])

# 7. Train the Model on the training split only
model.fit(X_train, y_train)

# 8. Predict and Evaluate
# Score the fitted pipeline on the held-out split.
y_pred = model.predict(X_test)
r2, mae = r2_score(y_test, y_pred), mean_absolute_error(y_test, y_pred)

# Report both metrics, one per line.
print(
    f"R² Score: {r2:.4f}",
    f"Mean Absolute Error (MAE): ${mae:.2f}",
    sep="\n",
)


  df = pd.read_csv('visualizing-airfare-trends-backend/airline_data.csv')


R² Score: 0.9391
Mean Absolute Error (MAE): $11.00


In [4]:
# 9. Prediction Function
def predict_fare(origin_airport, destination_airport, year, quarter, distance, passengers,
                 large_market_share, lowfare_market_share, largest_fare, lowest_fare,
                 estimator=None):
    """
    Predict the airfare for a single route/quarter description.

    The arguments are packed into a one-row DataFrame whose column names match
    the features the pipeline was trained on, then passed to ``predict``.

    Parameters
    ----------
    origin_airport : value for the ``airport_1`` column.
    destination_airport : value for the ``airport_2`` column.
    year : value for the ``Year`` column.
    quarter : value for the ``quarter`` column.
    distance : value for the ``nsmiles`` column.
    passengers : value for the ``passengers`` column.
    large_market_share : value for the ``large_ms`` column.
    lowfare_market_share : value for the ``lf_ms`` column.
    largest_fare : value for the ``fare_lg`` column.
    lowest_fare : value for the ``fare_low`` column.
    estimator : optional object with a ``predict(DataFrame)`` method.
        Defaults to the notebook-level ``model``; pass a tuned estimator
        (for example a search's ``best_estimator_``) to use it without
        rebinding the global.

    Returns
    -------
    The first (and only) element of ``estimator.predict(...)``.

    Raises
    ------
    NameError
        If ``estimator`` is omitted and no global ``model`` exists yet.
    """
    # Resolve the global lazily so the function can be defined before training.
    if estimator is None:
        estimator = model

    # Single-row frame; keys must match the training feature names exactly.
    input_data = pd.DataFrame({
        'airport_1': [origin_airport],
        'airport_2': [destination_airport],
        'Year': [year],
        'quarter': [quarter],
        'nsmiles': [distance],
        'passengers': [passengers],
        'large_ms': [large_market_share],
        'lf_ms': [lowfare_market_share],
        'fare_lg': [largest_fare],
        'fare_low': [lowest_fare]
    })

    # predict() yields one value per input row; unwrap the single prediction.
    predicted_price = estimator.predict(input_data)[0]
    return predicted_price


In [5]:
# Example inputs: one hypothetical Atlanta -> New York JFK itinerary
example_inputs = {
    'origin_airport': 'ATL',        # Atlanta
    'destination_airport': 'JFK',   # New York JFK
    'year': 2023,
    'quarter': 2,
    'distance': 760,                # Atlanta to JFK in miles
    'passengers': 15000,            # Hypothetical number
    'large_market_share': 0.6,      # 60%
    'lowfare_market_share': 0.3,    # 30%
    'largest_fare': 250,            # $250
    'lowest_fare': 180,             # $180
}

# Unpack the mapping into keyword arguments and show the estimate in dollars.
predicted = predict_fare(**example_inputs)
print(f"Predicted Fare: ${predicted:.2f}")


Predicted Fare: $170.35


In [6]:
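# # NOTE: uncommenting this cell rebinds `model` to a pipeline that is never fitted
# # (GridSearchCV fits clones and exposes the winner as best_estimator_), so
# # predict_fare() will fail until `model` is refit or replaced by best_model.
# # 1. Build Pipeline again (without fixed alpha)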
# model = Pipeline(steps=[
#     ('preprocessor', preprocessor),
#     ('regressor', Ridge())  # No alpha specified yet
# ])

# # 2. Define Grid Search parameters
# param_grid = {
#     'regressor__alpha': [0.01, 0.1, 1, 10, 100]
# }

# # 3. GridSearchCV
# grid_search = GridSearchCV(
#     model,
#     param_grid,
#     cv=5,             # 5-fold cross-validation
#     scoring='r2',     # Maximize R²
#     n_jobs=-1
# )

# # 4. Fit GridSearchCV
# grid_search.fit(X_train, y_train)

# # 5. Get best model
# best_model = grid_search.best_estimator_

# # 6. Predict and Evaluate
# y_pred = best_model.predict(X_test)

# r2 = r2_score(y_test, y_pred)
# mae = mean_absolute_error(y_test, y_pred)

# print(f"Best Ridge alpha: {grid_search.best_params_['regressor__alpha']}")
# print(f"R² Score after tuning: {r2:.4f}")
# print(f"Mean Absolute Error after tuning: ${mae:.2f}")


In [7]:
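# # NOTE: `rf_model` is not defined in the cells shown here; build a Pipeline whose
# # 'regressor' step is a RandomForestRegressor (and import it) before uncommenting.
# # 1. Use a 20% sample of the training split (up from 10%)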
# X_train_small = X_train.sample(frac=0.2, random_state=42)
# y_train_small = y_train.loc[X_train_small.index]

# # 2. Slightly bigger Random Forest search
# rf_param_grid = {
#     'regressor__n_estimators': [50, 100],    # More trees
#     'regressor__max_depth': [10, 20, None],  # Deeper trees
#     'regressor__min_samples_split': [2, 5]   # Try bigger splits
# }

# rf_random_search = RandomizedSearchCV(
#     rf_model,
#     rf_param_grid,
#     n_iter=4,        # Try 4 random combinations
#     cv=2,            # 2 folds still
#     scoring='r2',
#     n_jobs=-1,
#     verbose=1,
#     random_state=42
# )

# # 3. Train
# rf_random_search.fit(X_train_small, y_train_small)

# # 4. Predict and Evaluate
# rf_best_model = rf_random_search.best_estimator_
# rf_y_pred = rf_best_model.predict(X_test)

# rf_r2 = r2_score(y_test, rf_y_pred)
# rf_mae = mean_absolute_error(y_test, rf_y_pred)

# print(f"Random Forest Best Params (Medium Mode): {rf_random_search.best_params_}")
# print(f"Random Forest R² Score (Medium Mode): {rf_r2:.4f}")
# print(f"Random Forest MAE (Medium Mode): ${rf_mae:.2f}")


In [8]:
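# # NOTE: `rf_model` is not defined in the cells shown here; it must be a Pipeline
# # with a 'regressor' step for the 'regressor__*' keys below to resolve.
# rf_param_grid = {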
#     'regressor__n_estimators': [100, 200, 300, 400],  # More trees
#     'regressor__max_depth': [15, 20, None],            # Deeper trees
#     'regressor__min_samples_split': [2, 5]             # Same splits
# }

# rf_random_search = RandomizedSearchCV(
#     rf_model,
#     rf_param_grid,
#     n_iter=10,          # Try 10 random configurations
#     cv=3,               # 3-fold cross-validation
#     scoring='r2',
#     n_jobs=-1,
#     verbose=1,
#     random_state=42
# )

# rf_random_search.fit(X_train, y_train)

# # Best model
# rf_best_model = rf_random_search.best_estimator_
# rf_y_pred = rf_best_model.predict(X_test)

# rf_r2 = r2_score(y_test, rf_y_pred)
# rf_mae = mean_absolute_error(y_test, rf_y_pred)

# print(f"Random Forest Best Params (Heavy Mode): {rf_random_search.best_params_}")
# print(f"Random Forest R² Score (Heavy Mode): {rf_r2:.4f}")
# print(f"Random Forest MAE (Heavy Mode): ${rf_mae:.2f}")


In [9]:
# from xgboost import XGBRegressor

# # 1. Build Pipeline (swap regressor to XGB)
# xgb_model = Pipeline(steps=[
#     ('preprocessor', preprocessor),
#     ('regressor', XGBRegressor(objective='reg:squarederror', random_state=42))
# ])

# # 2. Define hyperparameter grid for XGBoost
# xgb_param_grid = {
#     'regressor__n_estimators': [100, 200],
#     'regressor__max_depth': [3, 5, 7],
#     'regressor__learning_rate': [0.01, 0.1, 0.2]
# }

# # 3. GridSearchCV for XGBoost
# xgb_grid_search = GridSearchCV(
#     xgb_model,
#     xgb_param_grid,
#     cv=3,
#     scoring='r2',
#     n_jobs=-1,
#     verbose=1
# )

# # 4. Train XGBoost
# xgb_grid_search.fit(X_train, y_train)

# # 5. Best XGBoost model
# xgb_best_model = xgb_grid_search.best_estimator_

# # 6. Predict and Evaluate XGBoost
# xgb_y_pred = xgb_best_model.predict(X_test)

# xgb_r2 = r2_score(y_test, xgb_y_pred)
# xgb_mae = mean_absolute_error(y_test, xgb_y_pred)

# print(f"XGBoost Best Params: {xgb_grid_search.best_params_}")
# print(f"XGBoost R² Score: {xgb_r2:.4f}")
# print(f"XGBoost MAE: ${xgb_mae:.2f}")
