In [2]:
# 1. Libraries
import pandas as pd
import mlflow
import mlflow.sklearn

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor

# 2. Load dataset
df = pd.read_csv('data/preprocessed_airfare_data.csv')

# 3. Features & Target
# 'Price' is the regression target; every other column is a candidate feature.
X = df.drop(columns=['Price'])
y = df['Price']

# 4. Train/Test Split
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 5. Identify feature types
# The two lists must partition the columns of X: a ColumnTransformer built
# with its default remainder='drop' silently discards any column that is in
# neither list. Selecting by the generic 'number' kind (plus bool, which is
# routed through the numeric pipeline) and taking the exact complement as
# categorical guarantees full coverage regardless of the integer/float width
# or string dtype pandas infers for a column.
numeric_features = X.select_dtypes(include=['number', 'bool']).columns.tolist()
categorical_features = X.select_dtypes(exclude=['number', 'bool']).columns.tolist()

# 6. Preprocessing
# Numeric branch: fill missing values with the column mean, then standardize.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' is set so that categories not
# seen during fit do not raise at transform time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column list to its matching branch.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# 7. Final Pipeline
# Preprocessing and the regressor are chained so that fit/predict apply the
# same transformations to whatever data they are given.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(n_estimators=100, random_state=42)),
])

# 8. MLflow Tracking
mlflow.set_experiment("Airfare_Pipeline_Experiment")

# The context manager ends the run when the block exits (including when the
# body raises), so no explicit mlflow.end_run() call is needed afterwards.
with mlflow.start_run(run_name="RandomForest_Pipeline"):
    pipeline.fit(X_train, y_train)
    predictions = pipeline.predict(X_test)

    # Evaluation on the held-out test split
    mae = mean_absolute_error(y_test, predictions)
    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)

    # Log to MLflow
    # Record the estimator's class name together with its hyperparameters
    # (read from the fitted pipeline rather than hard-coded) so the run can
    # be compared and reproduced from the tracking data alone.
    estimator = pipeline.named_steps["model"]
    mlflow.log_param("model", type(estimator).__name__)
    mlflow.log_params(estimator.get_params())
    mlflow.log_metrics({"MAE": mae, "MSE": mse, "R2": r2})

    # Save model (the whole pipeline, so preprocessing travels with it)
    mlflow.sklearn.log_model(pipeline, "airfare_pipeline_model")

    print(f"MAE: {mae:.2f}, MSE: {mse:.2f}, R2: {r2:.2f}")


2025/04/09 11:27:13 INFO mlflow.tracking.fluent: Experiment with name 'Airfare_Pipeline_Experiment' does not exist. Creating a new experiment.




MAE: 588.14, MSE: 2228176.76, R2: 0.90
