In [1]:
"""
Machine Learning Model for Airline Price Prediction 
by using xgboost
"""
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import xgboost as xgb
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import joblib
import logging
from typing import Dict, List, Tuple, Any, Optional
from pathlib import Path

In [2]:
# Configure the root logger once for the notebook so INFO-level messages are shown.
logging.basicConfig(level=logging.INFO)
# Module-level logger used by XGBoostAirlinePredictor for progress messages.
logger = logging.getLogger(__name__)

In [13]:
class XGBoostAirlinePredictor:
    """Simplified XGBoost model for airline price prediction.

    Thin wrapper around ``xgb.XGBRegressor`` that adds metric bookkeeping,
    plotting helpers, joblib persistence and a small validation-set grid search.
    """

    # Constructor hyperparameters that are persisted by save_model/load_model.
    _PARAM_NAMES = (
        'n_estimators',
        'max_depth',
        'learning_rate',
        'subsample',
        'colsample_bytree',
        'random_state',
    )

    def __init__(self, n_estimators=100, max_depth=6, learning_rate=0.1,
                 subsample=0.8, colsample_bytree=0.8, random_state=42):
        """
        Initialize XGBoost model with parameters

        Args:
            n_estimators: Number of boosting rounds
            max_depth: Maximum tree depth
            learning_rate: Learning rate
            subsample: Subsample ratio
            colsample_bytree: Feature subsample ratio
            random_state: Random state for reproducibility
        """
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.learning_rate = learning_rate
        self.subsample = subsample
        self.colsample_bytree = colsample_bytree
        self.random_state = random_state

        self.model = None           # underlying xgb.XGBRegressor, created in train()
        self.is_trained = False     # set to True once train() finishes fitting
        self.feature_names = None   # column names captured from the training data
        self.metrics = {}           # {'train': {...}, 'val': {...}} from the last fit

    def train(self, X_train, y_train, X_val=None, y_val=None, early_stopping_rounds=10, verbose=True) -> None:
        """
        Train the XGBoost model

        Metrics from any previous call are discarded before fitting, so
        ``self.metrics`` always describes the most recent fit only.

        Args:
            X_train: Training features
            y_train: Training target
            X_val: Validation features (optional)
            y_val: Validation target (optional)
            early_stopping_rounds: Early stopping rounds; only used when a
                validation set is supplied. A falsy value disables it.
            verbose: Whether to print training progress
        """
        logger.info("Training XGBoost model...")

        # Reset state from an earlier fit: otherwise a stale 'val' entry would be
        # reported after retraining without a validation set, and a failed re-fit
        # would leave is_trained=True while self.model is an unfitted estimator.
        self.metrics = {}
        self.is_trained = False

        # Store feature names
        if hasattr(X_train, 'columns'):
            self.feature_names = list(X_train.columns)
        else:
            # np.shape also handles nested lists, which have no .shape attribute
            self.feature_names = [f'feature_{i}' for i in range(np.shape(X_train)[1])]

        # Initialize XGBoost model
        self.model = xgb.XGBRegressor(
            n_estimators=self.n_estimators,
            max_depth=self.max_depth,
            learning_rate=self.learning_rate,
            subsample=self.subsample,
            colsample_bytree=self.colsample_bytree,
            random_state=self.random_state,
            n_jobs=-1,
            device='cpu'
        )

        has_validation = X_val is not None and y_val is not None

        if has_validation and early_stopping_rounds:
            # Recent XGBoost releases take early_stopping_rounds as an estimator
            # parameter rather than as a fit() argument.
            self.model.set_params(early_stopping_rounds=early_stopping_rounds)
            self.model.fit(
                X_train, y_train,
                eval_set=[(X_val, y_val)],
                verbose=verbose
            )
        else:
            # Simple training without early stopping
            self.model.fit(X_train, y_train, verbose=verbose)

        self.is_trained = True

        # Calculate training metrics
        train_pred = self.model.predict(X_train)
        self.metrics['train'] = self._calculate_metrics(y_train, train_pred)

        # Calculate validation metrics if provided
        if has_validation:
            val_pred = self.model.predict(X_val)
            self.metrics['val'] = self._calculate_metrics(y_val, val_pred)

        if verbose:
            print("Training completed!")
            print(f"Training RMSE: {self.metrics['train']['rmse']:.2f}")
            if 'val' in self.metrics:
                print(f"Validation RMSE: {self.metrics['val']['rmse']:.2f}")

        logger.info("XGBoost training completed")

    def predict(self, X):
        """Make predictions with the fitted model.

        Raises:
            ValueError: If the model has not been trained or loaded yet.
        """
        if not self.is_trained:
            raise ValueError("Model is not trained yet. Call train() first.")

        return self.model.predict(X)

    def _calculate_metrics(self, y_true, y_pred) -> Dict[str, float]:
        """Calculate evaluation metrics.

        Returns:
            Dict with keys 'mae', 'mse', 'rmse', 'r2' and 'mape' (in percent).
            MAPE is computed only over samples whose true value is non-zero;
            it is NaN when every true value is zero.
        """
        mse = mean_squared_error(y_true, y_pred)

        # Flatten to plain float arrays so lists, Series and single-column
        # frames all behave the same in the element-wise MAPE computation.
        true_arr = np.asarray(y_true, dtype=float).ravel()
        pred_arr = np.asarray(y_pred, dtype=float).ravel()

        # Exclude zero targets: dividing by them would turn MAPE into inf/nan.
        nonzero = true_arr != 0
        if nonzero.any():
            relative_error = (true_arr[nonzero] - pred_arr[nonzero]) / true_arr[nonzero]
            mape = float(np.mean(np.abs(relative_error)) * 100)
        else:
            mape = float('nan')

        return {
            'mae': mean_absolute_error(y_true, y_pred),
            'mse': mse,
            'rmse': np.sqrt(mse),
            'r2': r2_score(y_true, y_pred),
            'mape': mape
        }

    def evaluate(self, X, y) -> Dict[str, float]:
        """Evaluate model performance on given dataset.

        Prints MAE, RMSE, R² and MAPE and returns the full metrics dict.

        Raises:
            ValueError: If the model has not been trained or loaded yet.
        """
        if not self.is_trained:
            raise ValueError("Model is not trained yet.")

        y_pred = self.predict(X)
        metrics = self._calculate_metrics(y, y_pred)

        print("Model Performance:")
        print(f"MAE: {metrics['mae']:.2f}")
        print(f"RMSE: {metrics['rmse']:.2f}")
        print(f"R²: {metrics['r2']:.4f}")
        print(f"MAPE: {metrics['mape']:.2f}%")

        return metrics

    def get_feature_importance(self, plot=True, top_n=15) -> pd.DataFrame:
        """
        Get and visualize feature importance

        Args:
            plot: Whether to create a plot
            top_n: Number of top features to show in plot

        Returns:
            DataFrame with columns 'feature' and 'importance', sorted by
            importance in descending order.

        Raises:
            ValueError: If the model has not been trained or loaded yet.
        """
        if not self.is_trained:
            raise ValueError("Model is not trained yet.")

        # Create feature importance dataframe
        importance_df = pd.DataFrame({
            'feature': self.feature_names,
            'importance': self.model.feature_importances_
        }).sort_values('importance', ascending=False)

        if plot:
            # Imported lazily: the notebook never imports pyplot at module level,
            # and plot=False callers should not need matplotlib at all.
            import matplotlib.pyplot as plt

            top_features = importance_df.head(top_n)
            positions = range(len(top_features))

            fig, ax = plt.subplots(figsize=(10, 8))
            ax.barh(positions, top_features['importance'])
            ax.set_yticks(positions)
            ax.set_yticklabels(top_features['feature'])
            ax.set_xlabel('Feature Importance')
            ax.set_title(f'Top {top_n} Feature Importance - XGBoost')
            ax.invert_yaxis()  # most important feature at the top
            fig.tight_layout()
            plt.show()

        return importance_df

    def plot_predictions(self, X, y, title="XGBoost Predictions vs Actual") -> None:
        """Plot predictions vs actual values and the residuals, then print R² and RMSE.

        Raises:
            ValueError: If the model has not been trained or loaded yet.
        """
        if not self.is_trained:
            raise ValueError("Model is not trained yet.")

        # Imported lazily: the notebook never imports pyplot at module level.
        import matplotlib.pyplot as plt

        y_pred = self.predict(X)
        # Work on a flat array so lists and pandas objects are handled alike.
        y_actual = np.asarray(y).ravel()

        fig, (ax_scatter, ax_resid) = plt.subplots(1, 2, figsize=(10, 6))

        # Scatter plot with the y = x reference line
        ax_scatter.scatter(y_actual, y_pred, alpha=0.5)
        bounds = [y_actual.min(), y_actual.max()]
        ax_scatter.plot(bounds, bounds, 'r--', lw=2)
        ax_scatter.set_xlabel('Actual Prices')
        ax_scatter.set_ylabel('Predicted Prices')
        ax_scatter.set_title('Predicted vs Actual')

        # Residuals plot
        residuals = y_actual - y_pred
        ax_resid.scatter(y_pred, residuals, alpha=0.5)
        ax_resid.axhline(y=0, color='r', linestyle='--')
        ax_resid.set_xlabel('Predicted Prices')
        ax_resid.set_ylabel('Residuals')
        ax_resid.set_title('Residuals Plot')

        fig.suptitle(title)
        fig.tight_layout()
        plt.show()

        # Print metrics
        metrics = self._calculate_metrics(y_actual, y_pred)
        print(f"R² Score: {metrics['r2']:.4f}")
        print(f"RMSE: {metrics['rmse']:.2f}")

    def save_model(self, filepath) -> None:
        """Save the trained model, its metrics and hyperparameters with joblib.

        Parent directories of ``filepath`` are created if missing.

        Raises:
            ValueError: If the model has not been trained or loaded yet.
        """
        if not self.is_trained:
            raise ValueError("Model is not trained yet.")

        Path(filepath).parent.mkdir(parents=True, exist_ok=True)

        model_data = {
            'model': self.model,
            'feature_names': self.feature_names,
            'metrics': self.metrics,
            'is_trained': self.is_trained,
            'params': {name: getattr(self, name) for name in self._PARAM_NAMES}
        }

        joblib.dump(model_data, filepath)
        logger.info(f"Model saved to {filepath}")
        print(f"Model saved successfully to {filepath}")

    def load_model(self, filepath) -> None:
        """Load a model previously written by save_model into this instance."""
        # SECURITY NOTE: joblib.load unpickles the file, which can execute
        # arbitrary code. Only load files from a trusted source.
        model_data = joblib.load(filepath)

        self.model = model_data['model']
        self.feature_names = model_data['feature_names']
        self.metrics = model_data['metrics']
        self.is_trained = model_data['is_trained']

        # Load parameters
        params = model_data['params']
        for name in self._PARAM_NAMES:
            setattr(self, name, params[name])

        logger.info(f"Model loaded from {filepath}")
        print(f"Model loaded successfully from {filepath}")

    def hyperparameter_tuning(self, X_train, y_train, X_val, y_val,
                              param_grid: Optional[Dict[str, List[Any]]] = None
                              ) -> Tuple[Dict[str, Any], float, List[Dict[str, Any]]]:
        """
        Simple hyperparameter tuning using validation set

        Every combination in ``param_grid`` is trained on the training set and
        scored by RMSE on the validation set. Afterwards this instance is
        re-initialized with the best parameters, which clears any fitted model;
        call train() again to fit it.

        Args:
            X_train: Training features
            y_train: Training target
            X_val: Validation features
            y_val: Validation target
            param_grid: Dictionary of parameters to tune. Keys must be
                constructor arguments; a 'random_state' key overrides this
                instance's random_state for the search.

        Returns:
            Tuple ``(best_params, best_score, results)`` where ``results`` is a
            list of ``{'params': ..., 'val_rmse': ...}`` dicts, one per combination.
        """
        if param_grid is None:
            param_grid = {
                'n_estimators': [50, 100, 200],
                'max_depth': [3, 6, 9],
                'learning_rate': [0.01, 0.1, 0.2],
                'subsample': [0.8, 0.9, 1.0],
                'colsample_bytree': [0.8, 0.9, 1.0]
            }

        best_score = float('inf')
        best_params = {}
        results = []

        print("Starting hyperparameter tuning...")

        # Simple grid search
        from itertools import product

        param_names = list(param_grid.keys())
        param_values = list(param_grid.values())

        # int() keeps the count an integer even for an empty grid (np.prod([]) is 1.0)
        total_combinations = int(np.prod([len(v) for v in param_values]))
        print(f"Total combinations to test: {total_combinations}")

        for i, combination in enumerate(product(*param_values)):
            params = dict(zip(param_names, combination))

            # Merge instead of passing random_state as a separate keyword so a
            # grid that tunes 'random_state' does not raise a duplicate-keyword error.
            init_kwargs = {'random_state': self.random_state, **params}

            # Create temporary model with current parameters
            temp_model = XGBoostAirlinePredictor(**init_kwargs)
            temp_model.train(X_train, y_train, X_val, y_val, verbose=False)

            # Evaluate on validation set
            val_pred = temp_model.predict(X_val)
            val_rmse = np.sqrt(mean_squared_error(y_val, val_pred))

            results.append({
                'params': params.copy(),
                'val_rmse': val_rmse
            })

            if val_rmse < best_score:
                best_score = val_rmse
                best_params = params.copy()

            if (i + 1) % 10 == 0:
                print(f"Tested {i + 1}/{total_combinations} combinations. Best RMSE so far: {best_score:.4f}")

        print(f"Best parameters: {best_params}")
        print(f"Best validation RMSE: {best_score:.4f}")

        # Update model with best parameters (this also resets the fitted state)
        self.__init__(**{'random_state': self.random_state, **best_params})

        return best_params, best_score, results

In [14]:
if __name__ == "__main__":
    # Smoke test: fit the predictor on synthetic data, evaluate it on a held-out
    # split and verify that a save/load round trip reproduces the predictions.
    print("Creating sample data for testing...")

    # Generate sample dataset (seeded so the printed metrics are reproducible)
    np.random.seed(42)
    n_samples = 1000
    n_features = 15

    X_raw = np.random.randn(n_samples, n_features)
    # Synthetic target: linear in the first four features plus Gaussian noise
    # around a base price of 300. The factor labels are illustrative only.
    y_sample = (X_raw[:, 0] * 100 +  # route factor
                X_raw[:, 1] * 50 +   # seasonality
                X_raw[:, 2] * 30 +   # fuel prices
                X_raw[:, 3] * 20 +   # holidays
                np.random.normal(0, 20, n_samples) + 300)  # base price + noise

    # Convert to DataFrame
    feature_names = [f'feature_{i}' for i in range(n_features)]
    X_sample = pd.DataFrame(X_raw, columns=feature_names)

    # Split data 70% / 15% / 15% by position (rows are i.i.d., so no shuffle needed)
    split_idx = int(0.7 * len(X_sample))
    val_idx = int(0.85 * len(X_sample))

    X_train = X_sample.iloc[:split_idx]
    y_train = y_sample[:split_idx]
    X_val = X_sample.iloc[split_idx:val_idx]
    y_val = y_sample[split_idx:val_idx]
    X_test = X_sample.iloc[val_idx:]
    y_test = y_sample[val_idx:]

    # The three partitions must cover every row exactly once
    assert len(X_train) + len(X_val) + len(X_test) == n_samples
    assert len(y_train) + len(y_val) + len(y_test) == n_samples

    # Initialize and train model
    print("Initializing XGBoost model...")
    model = XGBoostAirlinePredictor(
        n_estimators=100,
        max_depth=6,
        learning_rate=0.1
    )

    # Train model
    print("Training model...")
    model.train(X_train, y_train, X_val, y_val)

    # Evaluate model
    print("\nEvaluating on test set:")
    test_metrics = model.evaluate(X_test, y_test)

    # Test save/load
    print("\nTesting save/load functionality...")
    model.save_model("test_xgb_model.pkl")

    # Create new model and load
    new_model = XGBoostAirlinePredictor()
    new_model.load_model("test_xgb_model.pkl")

    # Test prediction with loaded model
    test_pred = new_model.predict(X_test.iloc[:5])
    print(f"Sample predictions: {test_pred}")

    # The reloaded model must reproduce the original model's predictions
    assert np.allclose(test_pred, model.predict(X_test.iloc[:5])), \
        "Loaded model predictions differ from the original model"

    print("\nXGBoost model test completed successfully!")

INFO:__main__:Training XGBoost model...


Creating sample data for testing...
Initializing XGBoost model...
Training model...
[0]	validation_0-rmse:109.14125
[1]	validation_0-rmse:100.58039
[2]	validation_0-rmse:98.31126
[3]	validation_0-rmse:90.94879
[4]	validation_0-rmse:84.73283
[5]	validation_0-rmse:83.33077
[6]	validation_0-rmse:77.52579
[7]	validation_0-rmse:73.43990
[8]	validation_0-rmse:69.40930
[9]	validation_0-rmse:67.90851
[10]	validation_0-rmse:64.12301
[11]	validation_0-rmse:60.85854
[12]	validation_0-rmse:58.13647
[13]	validation_0-rmse:55.60187
[14]	validation_0-rmse:53.19185
[15]	validation_0-rmse:51.19771
[16]	validation_0-rmse:49.33250
[17]	validation_0-rmse:47.80231
[18]	validation_0-rmse:46.34741
[19]	validation_0-rmse:44.79926
[20]	validation_0-rmse:43.63541
[21]	validation_0-rmse:42.58767
[22]	validation_0-rmse:41.52879
[23]	validation_0-rmse:41.44672
[24]	validation_0-rmse:40.41998
[25]	validation_0-rmse:39.73207
[26]	validation_0-rmse:39.20685
[27]	validation_0-rmse:38.95698
[28]	validation_0-rmse:38.38

INFO:__main__:XGBoost training completed


Training completed!
Training RMSE: 4.90
Validation RMSE: 33.22

Evaluating on test set:
Model Performance:
MAE: 25.70
RMSE: 31.35
R²: 0.9364
MAPE: 13.83%

Top 10 Feature Importance:
       feature  importance
0    feature_0    0.549522
1    feature_1    0.133193
2    feature_2    0.057326
3    feature_3    0.050080
12  feature_12    0.026344
11  feature_11    0.025024
14  feature_14    0.024223
6    feature_6    0.023095
5    feature_5    0.020662
4    feature_4    0.020371

Testing save/load functionality...


INFO:__main__:Model saved to test_xgb_model.pkl


Model saved successfully to test_xgb_model.pkl


INFO:__main__:Model loaded from test_xgb_model.pkl


Model loaded successfully from test_xgb_model.pkl
Sample predictions: [243.79791 372.4136  360.92062 243.52837 427.9935 ]

XGBoost model test completed successfully!
