In [None]:
import polars as pl
import numpy as np
import plotly.express as px
import joblib

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import root_mean_squared_error, r2_score
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import LassoCV
from sklearn.feature_selection import SelectFromModel

import warnings
warnings.filterwarnings('ignore')
pl.Config.set_tbl_rows(30)


In [None]:
SEED = 100622
TARGET = 'Salary'
ID = 'id'
N_FOLDS = 5

## Data read

In [None]:
def load_data(data_path="../data/"):
    """Load all required datasets."""
    df_salary = pl.read_csv(f"{data_path}salary.csv")
    df_people = pl.read_csv(f"{data_path}people.csv")
    df_descriptions = pl.read_csv(f"{data_path}descriptions.csv")
    return df_salary, df_people, df_descriptions

In [None]:
# prompt: fix the error below by adding drive to the environment

if 'google.colab' in str(get_ipython()):
    # Mount Google Drive if running in Google Colab
    from google.colab import drive
    drive.mount('/content/drive')
    df_salary_raw, df_people_raw, df_descriptions_raw = load_data(data_path="/content/drive/MyDrive/Postgrado ciencia de datos/pwc-challenge/data/")
else:
    # Load data from local directory
    df_salary_raw, df_people_raw, df_descriptions_raw = load_data(data_path="../data/")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
df_salary_people_raw = df_salary_raw.join(df_people_raw,
                                      on="id", how="inner")
df_salary_people_raw

id,Salary,Age,Gender,Education Level,Job Title,Years of Experience
i64,f64,f64,str,str,str,f64
0,90000.0,32.0,"""Male""","""Bachelor's""","""Software Engineer""",5.0
1,65000.0,28.0,"""Female""","""Master's""","""Data Analyst""",3.0
2,150000.0,45.0,"""Male""","""PhD""","""Senior Manager""",15.0
3,60000.0,36.0,"""Female""","""Bachelor's""","""Sales Associate""",7.0
4,200000.0,52.0,"""Male""","""Master's""","""Director""",20.0
5,55000.0,29.0,"""Male""","""Bachelor's""","""Marketing Analyst""",2.0
6,120000.0,42.0,"""Female""","""Master's""","""Product Manager""",12.0
7,80000.0,31.0,"""Male""","""Bachelor's""","""Sales Manager""",4.0
8,45000.0,26.0,"""Female""","""Bachelor's""","""Marketing Coordinator""",1.0
9,110000.0,38.0,"""Male""","""PhD""","""Senior Scientist""",10.0


## Feature engineer

In [None]:
num_cols = ['Age']
cat_cols = ['Gender', 'Education Level', 'Job Title', 'Years of Experience']

### Target variable treatment

In [None]:
def clean_target_variable(df, target_col=TARGET):
    """Remove rows with null values in target variable."""
    initial_count = df.shape[0]
    df_cleaned = df.filter(pl.col(target_col).is_not_null())
    removed_count = initial_count - df_cleaned.shape[0]

    if removed_count > 0:
        print(f"Removed {removed_count} rows with null {target_col} values")

    # Only one value for a salary of 350 for a junior analys
    # I assume this is an error and remove it
    df_cleaned = df_cleaned.filter(pl.col(target_col) > 400)

    method = 'log_transform878'

        # 3. Handle outliers based on the chosen method

    # set target as global
    global TARGET

    if method == 'log_transform':
        # Apply a log transformation to compress the range of the target variable.
        # This is useful when the distribution is skewed and has a long tail.
        # It doesn't remove outliers but reduces their impact.
        # Use log1p to handle cases where target_col can be 0 or small positive values.
        print(f"Applying log1p transformation to '{target_col}'")
        df_cleaned = df_cleaned.with_columns(
            (pl.col(target_col) + 1).log().alias(f'{TARGET}_log')
        )
        TARGET = f'{TARGET}_log'
        # Note: If you choose this, your model will predict log-transformed values.
        # You'll need to inverse transform them (exp_minus_1) after prediction.
        print(f"A new column '{target_col}_log' has been created. Use this for training.")
        print("Remember to inverse transform your predictions (exp(pred) - 1) later.")

    elif method == 'capping':
        # Winsorization: Cap values at a certain percentile (e.g., 1st and 99th percentile).
        lower_percentile = 0.01
        upper_percentile = 0.99
        lower_cap = df_cleaned.select(pl.col(target_col).quantile(lower_percentile)).item()
        upper_cap = df_cleaned.select(pl.col(target_col).quantile(upper_percentile)).item()

        print(f"Capping '{target_col}' values between {lower_cap:.2f} (1st percentile) and {upper_cap:.2f} (99th percentile).")
        df_cleaned = df_cleaned.with_columns(
            pl.col(target_col).clip(lower_bound=lower_cap, upper_bound=upper_cap).alias(target_col)
        )
        print(f"Values outside this range have been capped.")

    else:
        print(f"Warning: Unknown outlier handling method '{method}'. No outlier treatment applied.")


    return df_cleaned

### Age treatment

In [None]:
def engineer_age_features(df):
    """Engineer age-related features."""

    # Create age bins
    df = df.with_columns(
        pl.when(pl.col('Age').is_not_null()).then(
            pl.when(pl.col('Age') < 30).then(pl.lit('young'))
            .when(pl.col('Age') < 45).then(pl.lit('mid'))
            .otherwise(pl.lit('old')))
            .otherwise(None).alias('Age_bin')
    )

    cat_cols.append('Age_bin')

    return df

### Gender treatment

In [None]:
def engineer_gender_features(df):
    """Engineer gender-related features."""

    pass

    return df

### Education treatment

In [None]:
def engineer_ed_features(df):
    """engineer education-related features."""

    pass

    return df

### Job title treatment

In [None]:
def engineer_jt_features(df):
    """engineer job title-related features."""

    # Create a new column for job title categories
    df = df.with_columns(
        pl.when(pl.col('Job Title').str.contains('Director')).then(pl.lit('Director'))
        .when(pl.col('Job Title').str.contains('Manager')).then(pl.lit('Manager'))
        .when(pl.col('Job Title').str.contains('Coordinator')).then(pl.lit('Coordinator'))
        .when(pl.col('Job Title').str.contains('Analyst')).then(pl.lit('Analyst'))
        .when(pl.col('Job Title').str.contains('CEO')).then(pl.lit('Executive'))
        .when(pl.col('Job Title').str.contains('Chief')).then(pl.lit('Executive'))
        .when(pl.col('Job Title').str.contains('VP')).then(pl.lit('Executive'))
        .otherwise(pl.lit('Other')).alias('Job_Title_Category')
    )

    df = df.with_columns(
        pl.when(pl.col('Job Title').str.contains('Senior')).then(pl.lit('Senior'))
        .when(pl.col('Job Title').str.contains('Junior')).then(pl.lit('Junior'))
        .otherwise(pl.lit('Other')).alias('Job_Level')
    )

    df = df.drop('Job Title')
    cat_cols.remove('Job Title')

    cat_cols.append('Job_Title_Category')
    cat_cols.append('Job_Level')

    return df

### Years of experience treatment

In [None]:
def engineer_yoe_features(df):
    """engineer years of experience-related features."""

    # create years of experience bins
    df = df.with_columns(
        pl.when(pl.col('Years of Experience') < 1).then(pl.lit('junior'))
        .when(pl.col('Years of Experience') < 3).then(pl.lit('mid'))
        .otherwise(pl.lit('senior')).alias('Experience_bin')
    )
    cat_cols.append('Experience_bin')

    return df

### Add interaction features

In [None]:
def create_interaction_features(df):
    """Create interaction features"""
    ed_age = df.group_by('Education Level').agg(
        pl.col('Age').mean().alias('Education_Age_mean'),
        pl.col('Age').std().alias('Education_Age_std')
    )

    df = df.join(ed_age, on='Education Level', how='left')

    num_cols.extend(['Education_Age_mean', 'Education_Age_std'])

    df = df.with_columns(
        (pl.col('Years of Experience') / pl.col('Age')).alias('YoE_Age_ratio'),
    )
    num_cols.append('YoE_Age_ratio')

    df = df.with_columns(
        (pl.col('Years of Experience') + pl.col('Years of Experience')).alias('YoE_squared'),
        (pl.col('Years of Experience') * pl.col('Age')).alias('YoE_Age_interaction')
    )

    num_cols.extend(['YoE_squared', 'YoE_Age_interaction'])

    return df

In [None]:
def perform_feature_engineering(df):
    """Perform all feature engineering steps."""
    # Remove ID column

    # Clean target variable
    df = clean_target_variable(df)

    # Engineer features
    df = engineer_age_features(df)
    df = engineer_gender_features(df)
    df = engineer_ed_features(df)
    df = engineer_jt_features(df)
    df = engineer_yoe_features(df)
    df = create_interaction_features(df)

    ids = df[ID]
    df = df.drop(ID)
    return df, ids

df_salary_people, ids = perform_feature_engineering(df_salary_people_raw)

Removed 2 rows with null Salary values


## Data preprocessing

In [None]:
# Salary has outliers, need to decide how to handle them

# age has 3 null values, need to decide how to fill them

# gender has 3 null values, need to decide how to fill them

# Education Level has 3 null values, need to decide how to fill them

# Job Title has 3 null values, need to decide how to fill them
# Job title has high cardinality, experiment extracting keywords

In [None]:
def create_preprocessing_pipeline():
    """Create sklearn preprocessing pipeline."""
    # numerical imputer
    num_imputer = SimpleImputer(strategy='median')
    # numerical scaler
    scaler = StandardScaler()
    # categorical imputer
    cat_imputer = SimpleImputer(strategy='most_frequent')
    # categorical encoder
    ohe_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    oe_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

    # numerical pipeline
    num_pipeline = Pipeline([
        ('num_imputer', num_imputer),
        ('scaler', scaler)
    ])
    # categorical pipeline
    cat_pipeline = Pipeline([
        ('cat_imputer', cat_imputer),
        ('ohe_encoder', ohe_encoder)
    ])

    # column transformer

    pipeline = ColumnTransformer([
        ('num', num_pipeline, num_cols),
        ('cat_ohe', cat_pipeline, cat_cols),
        ])
    return pipeline


In [None]:
X = df_salary_people.drop(TARGET)
# y_base = df_salary_people.select('Salary')
y = df_salary_people[TARGET]

In [None]:
# Analyze correlated features
corr_matrix = X.select(num_cols).to_pandas().corr()
fig = px.imshow(corr_matrix, text_auto=True, aspect="auto", color_continuous_scale='RdBu_r')
fig.update_layout(title='Correlation Matrix of Numerical Features')
fig.show()

## Metric definition

In [None]:
def metrics(y_true, y_pred):
    rmse = root_mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    return {'rmse': rmse, 'r2': r2}

In [None]:
def bootstrap_confidence_intervals(y_true, y_pred, n_bootstrap=1000, confidence_level=0.95):
    """
    Calculate bootstrap confidence intervals for RMSE and R2.

    Args:
        y_true: True values
        y_pred: Predicted values
        n_bootstrap: Number of bootstrap samples
        confidence_level: Confidence level

    Returns:
        Dictionary with bootstrap confidence intervals
    """
    n_samples = len(y_true)
    bootstrap_rmse = []
    bootstrap_r2 = []

    np.random.seed(SEED)
    for _ in range(n_bootstrap):
        # Bootstrap sample indices
        indices = np.random.choice(n_samples, size=n_samples, replace=True)
        y_true_boot = y_true[indices]
        y_pred_boot = y_pred[indices]

        # Calculate metrics for bootstrap sample
        rmse_boot = root_mean_squared_error(y_true_boot, y_pred_boot)
        r2_boot = r2_score(y_true_boot, y_pred_boot)

        bootstrap_rmse.append(rmse_boot)
        bootstrap_r2.append(r2_boot)

    # Calculate percentile confidence intervals
    alpha = 1 - confidence_level
    lower_percentile = (alpha/2) * 100
    upper_percentile = (1 - alpha/2) * 100

    rmse_ci = np.percentile(bootstrap_rmse, [lower_percentile, upper_percentile])
    r2_ci = np.percentile(bootstrap_r2, [lower_percentile, upper_percentile])

    return {
        'rmse': {
            'mean': np.mean(bootstrap_rmse),
            'std': np.std(bootstrap_rmse),
            'lower_ci': rmse_ci[0],
            'upper_ci': rmse_ci[1]
        },
        'r2': {
            'mean': np.mean(bootstrap_r2),
            'std': np.std(bootstrap_r2),
            'lower_ci': r2_ci[0],
            'upper_ci': r2_ci[1]
        }
    }

In [None]:
def print_metrics_with_ci(results):
  print("\nBootstrap Confidence Intervals for Out-of-Fold Predictions:")
  print("=" * 60)
  print(f"RMSE: {results['rmse']['mean']:.2f}")
  print(f"      95% CI: [{results['rmse']['lower_ci']:.2f}, {results['rmse']['upper_ci']:.2f}]")
  print(f"R²:   {results['r2']['mean']:.3f}")
  print(f"      95% CI: [{results['r2']['lower_ci']:.3f}, {results['r2']['upper_ci']:.3f}]")

## Cross validation training

In [None]:
def feature_importance(pipeline, model):
    """Get feature importance from the model."""
    importances = []
    if hasattr(model, 'feature_importances_'):
        importances = model.feature_importances_
    elif hasattr(model, 'coef_'):
        importances = np.abs(model.coef_)
    else:
        importances = np.zeros(len(pipeline.get_feature_names_out()))

    fe_impo = pl.DataFrame({
        'feature': pipeline.get_feature_names_out(),
        'importance': importances
    }).sort('importance', descending=True)

    return fe_impo

In [None]:
def cross_validation(model, X, y, seed = SEED, verbose=True):

    kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=seed)
    results = []
    importances_table = None
    oof_preds = np.zeros(len(y))

    for fold, (train_index, val_index) in enumerate(kf.split(X)):

        # print(f"Fold {fold + 1}/{N_FOLDS}")

        X_train, X_val = X[train_index].clone(), X[val_index].clone()
        y_train, y_val = y[train_index].clone(), y[val_index].clone()

        # Create preprocessing pipeline
        preprocessing_pipeline = create_preprocessing_pipeline()

        X_train = preprocessing_pipeline.fit_transform(X_train)
        X_val = preprocessing_pipeline.transform(X_val)

        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        oof_preds[val_index] = y_pred

        result = metrics(y_val, y_pred)
        results.append(result)

        # Get feature importance for this fold
        fe_impo = feature_importance(preprocessing_pipeline, model)



        # Rename the importance column to include fold number
        fe_impo = fe_impo.with_columns(
            pl.col('importance').alias(f'fold_{fold + 1}')
        ).drop('importance')

        # Initialize or join the importances table
        if importances_table is None:
            # First fold - create the base table
            importances_table = fe_impo
        else:
            # Subsequent folds - join on feature names
            importances_table = importances_table.join(
                fe_impo,
                on='feature',
                how='inner'
            )

        if verbose:
            # Print metrics for the current fold
            print(f"Fold {fold + 1} - RMSE: {result['rmse']:.2f}, R2: {result['r2']:.2f}")


    # Add summary statistics across folds
    fold_columns = [f'fold_{i+1}' for i in range(N_FOLDS)]

    # Calculate mean and std across only the fold columns (numeric)
    importances_table = importances_table.with_columns([
        pl.mean_horizontal([pl.col(col).round(2) for col in fold_columns]).alias('mean_importance'),
        pl.concat_list([pl.col(col).round(2) for col in fold_columns]).list.std().alias('std_importance')
    ]).drop(fold_columns)

    importances_table = importances_table.with_columns(
        (pl.col("mean_importance") / pl.col("mean_importance").sum() * 100).alias("mean_importance_pct").round(1),
    )
    # Round numeric columns to 2 decimals
    numeric_cols = ['mean_importance', 'std_importance']
    importances_table = importances_table.with_columns([
        pl.col(col).round(2) for col in numeric_cols
    ])

    # Sort by mean importance
    importances_table = importances_table.sort('mean_importance', descending=True)

    if verbose:
    # print mean metrics across folds
        avg_rmse = np.mean([result['rmse'] for result in results])
        avg_r2 = np.mean([result['r2'] for result in results])
        print(f"\nAverage RMSE across folds: {avg_rmse:.2f}")
        print(f"Average R2 across folds: {avg_r2:.2f}")

    return results, importances_table, oof_preds

# evaluate model with cv using multple seeds to assess stability
seeds = np.random.RandomState(SEED).choice(range(10000), size=5, replace=False).tolist()

def evaluate_model_seeds(model, X, y, seeds=seeds):

    results = []
    for s in seeds:

        seed_results, _, _ = cross_validation(model, X, y, s, verbose=False)

        avg_rmse = np.mean([result['rmse'] for result in seed_results])
        avg_r2 = np.mean([result['r2'] for result in seed_results])

        results.append({
            'seed': s,
            'rmse': avg_rmse,
            'r2': avg_r2
        })

    eval_results = pl.DataFrame(results)

    eval_results = eval_results.with_columns(
        pl.col('seed').cast(pl.String).alias('seed'))

    # Calculate the mean of 'rmse' and 'r2' columns
    mean_rmse = eval_results['rmse'].mean()
    mean_r2 = eval_results['r2'].mean()

    # Create a summary row DataFrame
    summary_row = pl.DataFrame({
        'seed': 'Mean',  # Label for the summary row
        'rmse': mean_rmse,
        'r2': mean_r2
    })

    eval_results = eval_results.vstack(summary_row)

    return eval_results

## Model evaluation

In [None]:
def plot_error_vs_predicted(oof_preds_rf, y, ids):
    errors = pl.DataFrame({
        'id': ids,
        'actual': y,
        'predicted': oof_preds_rf
    })
    # calculate the mean and std of the errors
    errors = errors.with_columns(
        ((pl.col('predicted') - pl.col('actual')) ** 2).alias('squared_error')
    )

    rmse_value = np.sqrt(errors['squared_error'].mean())

    fig = px.scatter(
        errors.to_pandas(),
        x='predicted',
        y='actual',
        hover_data=['id'],
        labels={'predicted': 'Predicted Values', 'actual': 'Real Values'},
        title=f'Real vs Predicted Values\nMean Error: {rmse_value:.2f}'
    )
    # add a line at 45 degrees

    max_val = max(errors['predicted'].max(), errors['actual'].max())

    fig.add_shape(
        type='line',
        x0=0,
        y0=0,
        x1=max_val,
        y1=max_val,
        line=dict(
            color='Red',
            width=2,
            dash='dash',
        )
    )
    fig.update_layout(
        width=800,
        height=800,
        legend_title_text='',
        showlegend=False
    )
    fig.show()

In [None]:
model_dummy = DummyRegressor(strategy='median')
results_dummy, _, _ = cross_validation(model_dummy, X, y)

Fold 1 - RMSE: 56941.49, R2: -0.08
Fold 2 - RMSE: 44369.66, R2: -0.03
Fold 3 - RMSE: 45707.65, R2: -0.05
Fold 4 - RMSE: 48455.89, R2: -0.01
Fold 5 - RMSE: 45943.56, R2: -0.07

Average RMSE across folds: 48283.65
Average R2 across folds: -0.05


In [None]:
# pip install optuna

In [None]:
# hyperparameter optim
import optuna

def objective(trial):
  """
  Objective function for Optuna to minimize RMSE from cross-validation.
  """
  n_estimators = trial.suggest_int('n_estimators', 200, 400)
  max_depth = trial.suggest_int('max_depth', 6, 28)
  min_samples_split = trial.suggest_int('min_samples_split', 2, 5)
  min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 5)


  model = RandomForestRegressor(
      n_estimators=n_estimators,
      max_depth=max_depth,
      min_samples_split=min_samples_split,
      min_samples_leaf=min_samples_leaf,
      random_state=SEED,
      n_jobs=-1 # Use all available cores for faster training
  )

  # Run cross-validation and get the average RMSE
  results, _, _ = cross_validation(model, X, y, verbose=False)
  avg_rmse = np.mean([result['rmse'] for result in results])
  avg_r2 = np.mean([result['r2'] for result in results])

  return avg_r2

study = optuna.create_study(direction='maximize',  # Minimize RMSE
                                sampler=optuna.samplers.TPESampler(seed=SEED))
optuna.logging.set_verbosity(optuna.logging.WARNING)
study.optimize(objective, n_trials=50, show_progress_bar=True, )

best_params = study.best_params
best_params

  0%|          | 0/50 [00:00<?, ?it/s]

{'n_estimators': 327,
 'max_depth': 18,
 'min_samples_split': 2,
 'min_samples_leaf': 1}

In [None]:
# best_params = {'n_estimators': 298,
#  'max_depth': 8,
#  'min_samples_split': 2,
#  'min_samples_leaf': 1}

In [None]:
# Sort trials by value (RMSE) and print the top 10
# print("\nTop 10 trials sorted by RMSE:")
# sorted_trials = sorted(study.trials, key=lambda t: t.value)
# for i, trial in enumerate(sorted_trials[:10]):
#   print(f"Trial {i+1} (Value/RMSE: {trial.value:.2f}):")
#   print(f"n_estimators: {trial.params['n_estimators']} - max_depth: {trial.params['max_depth']} - min_samples_split: {trial.params['min_samples_split']} - min_samples_leaf: {trial.params['min_samples_leaf']}")



In [None]:
# rf regression model
model_rf = RandomForestRegressor(**best_params, random_state=SEED)
results_rf, importances_rf, oof_preds_rf = cross_validation(model_rf, X, y)
importances_rf.filter(pl.col('mean_importance') > 0)

Fold 1 - RMSE: 14828.02, R2: 0.93
Fold 2 - RMSE: 9591.73, R2: 0.95
Fold 3 - RMSE: 9081.01, R2: 0.96
Fold 4 - RMSE: 12807.00, R2: 0.93
Fold 5 - RMSE: 10095.58, R2: 0.95

Average RMSE across folds: 11280.67
Average R2 across folds: 0.94


feature,mean_importance,std_importance,mean_importance_pct
str,f64,f64,f64
"""num__YoE_Age_interaction""",0.32,0.04,32.9
"""num__Age""",0.3,0.13,30.9
"""num__YoE_squared""",0.18,0.08,18.1
"""num__YoE_Age_ratio""",0.09,0.05,8.8
"""cat_ohe__Job_Level_Senior""",0.03,0.01,3.3
"""cat_ohe__Job_Title_Category_Di…",0.02,0.01,1.6
"""cat_ohe__Job_Title_Category_Ex…",0.01,0.01,1.4
"""num__Education_Age_mean""",0.01,0.01,1.0
"""cat_ohe__Education Level_Bache…",0.01,0.0,0.8
"""cat_ohe__Job_Title_Category_Ma…",0.01,0.0,0.8


In [None]:
# Inverse log for oof preds
# oof_preds_rf = np.expm1(oof_preds_rf)

In [None]:
plot_error_vs_predicted(oof_preds_rf, y, ids)

In [None]:
rf_seed_results = evaluate_model_seeds(model_rf, X, y)
rf_seed_results

seed,rmse,r2
str,f64,f64
"""8431""",10911.790601,0.946754
"""176""",11580.600279,0.939202
"""3954""",11147.546051,0.943353
"""1946""",11094.049407,0.944101
"""1626""",11262.248325,0.940115
"""Mean""",11199.246933,0.942705


In [None]:
bootstrap_ci = bootstrap_confidence_intervals(np.array(y), oof_preds_rf)
print_metrics_with_ci(bootstrap_ci)


Bootstrap Confidence Intervals for Out-of-Fold Predictions:
RMSE: 11487.21
      95% CI: [9664.10, 13672.67]
R²:   0.942
      95% CI: [0.922, 0.959]


In [None]:
# export model

preprocessing_pipeline = create_preprocessing_pipeline()

X_train = preprocessing_pipeline.fit_transform(X)

final_rf_model = RandomForestRegressor(**best_params, random_state=SEED)
final_rf_model.fit(X_train, y)

# Save the model and preprocessing pipeline
# joblib.dump(final_rf_model, '../models/final_rf_model.pkl')
# joblib.dump(preprocessing_pipeline, '../models/rf_pipeline.pkl')


Stuff to do

1. Evaluate correlated features
1. Feature selection
1. Hyperparameter optimization
1. Modelblending
1. Model professionaldescription