In [156]:
import polars as pl
import numpy as np
import plotly.express as px

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import root_mean_squared_error, r2_score
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor

import warnings
warnings.filterwarnings('ignore')
pl.Config.set_tbl_rows(30)


polars.config.Config

In [157]:
SEED = 100622
TARGET = 'Salary'
ID = 'id'
N_FOLDS = 5

## Data read

In [158]:
def load_data(data_path="../data/"):
    """Load all required datasets."""
    df_salary = pl.read_csv(f"{data_path}salary.csv")
    df_people = pl.read_csv(f"{data_path}people.csv")
    df_descriptions = pl.read_csv(f"{data_path}descriptions.csv")
    return df_salary, df_people, df_descriptions

In [159]:
# prompt: fix the error below by adding drive to the environment

if 'google.colab' in str(get_ipython()):
    # Mount Google Drive if running in Google Colab
    from google.colab import drive
    drive.mount('/content/drive')
    df_salary_raw, df_people_raw, df_descriptions_raw = load_data(data_path="/content/drive/MyDrive/Postgrado ciencia de datos/pwc-challenge/data/")
else:
    # Load data from local directory
    df_salary_raw, df_people_raw, df_descriptions_raw = load_data(data_path="../data/")

In [160]:
df_salary_people_raw = df_salary_raw.join(df_people_raw,
                                      on="id", how="inner")
df_salary_people_raw

id,Salary,Age,Gender,Education Level,Job Title,Years of Experience
i64,f64,f64,str,str,str,f64
0,90000.0,32.0,"""Male""","""Bachelor's""","""Software Engineer""",5.0
1,65000.0,28.0,"""Female""","""Master's""","""Data Analyst""",3.0
2,150000.0,45.0,"""Male""","""PhD""","""Senior Manager""",15.0
3,60000.0,36.0,"""Female""","""Bachelor's""","""Sales Associate""",7.0
4,200000.0,52.0,"""Male""","""Master's""","""Director""",20.0
5,55000.0,29.0,"""Male""","""Bachelor's""","""Marketing Analyst""",2.0
6,120000.0,42.0,"""Female""","""Master's""","""Product Manager""",12.0
7,80000.0,31.0,"""Male""","""Bachelor's""","""Sales Manager""",4.0
8,45000.0,26.0,"""Female""","""Bachelor's""","""Marketing Coordinator""",1.0
9,110000.0,38.0,"""Male""","""PhD""","""Senior Scientist""",10.0


## Feature engineer

In [161]:
num_cols = ['Age']
cat_cols = ['Gender', 'Education Level', 'Job Title', 'Years of Experience']

### Target variable treatment

In [162]:
def clean_target_variable(df, target_col=TARGET):
    """Remove rows with null values in target variable."""
    initial_count = df.shape[0]
    df_cleaned = df.filter(pl.col(target_col).is_not_null())
    removed_count = initial_count - df_cleaned.shape[0]
    
    if removed_count > 0:
        print(f"Removed {removed_count} rows with null {target_col} values")

    # Only one value for a salary of 350 for a junior analys
    # I assume this is an error and remove it
    df_cleaned = df_cleaned.filter(pl.col(target_col) > 400)
    
    return df_cleaned

### Age treatment

In [163]:
def engineer_age_features(df):
    """Engineer age-related features."""
    
    # Create age bins
    df = df.with_columns(
        pl.when(pl.col('Age').is_not_null()).then(
            pl.when(pl.col('Age') < 30).then(pl.lit('young'))
            .when(pl.col('Age') < 45).then(pl.lit('mid'))
            .otherwise(pl.lit('old')))
            .otherwise(None).alias('Age_bin')
    )

    cat_cols.append('Age_bin')
    
    return df

### Gender treatment

In [164]:
def engineer_gender_features(df):
    """Engineer gender-related features."""
    
    pass
    
    return df

### Education treatment

In [165]:
def engineer_ed_features(df):
    """engineer education-related features."""

    pass
    
    return df

### Job title treatment

In [166]:
def engineer_jt_features(df):
    """engineer job title-related features."""

    # Create a new column for job title categories
    df = df.with_columns(
        pl.when(pl.col('Job Title').str.contains('Director')).then(pl.lit('Director'))
        .when(pl.col('Job Title').str.contains('Manager')).then(pl.lit('Manager'))
        .when(pl.col('Job Title').str.contains('Coordinator')).then(pl.lit('Coordinator'))
        .when(pl.col('Job Title').str.contains('Analyst')).then(pl.lit('Analyst'))
        .when(pl.col('Job Title').str.contains('CEO')).then(pl.lit('Executive'))
        .when(pl.col('Job Title').str.contains('Chief')).then(pl.lit('Executive'))
        .when(pl.col('Job Title').str.contains('VP')).then(pl.lit('Executive'))
        .otherwise(pl.lit('Other')).alias('Job_Title_Category')
    )

    df = df.with_columns(
        pl.when(pl.col('Job Title').str.contains('Senior')).then(pl.lit('Senior'))
        .when(pl.col('Job Title').str.contains('Junior')).then(pl.lit('Junior'))
        .otherwise(pl.lit('Other')).alias('Job_Level')
    )

    df = df.drop('Job Title')
    cat_cols.remove('Job Title')

    cat_cols.append('Job_Title_Category')
    cat_cols.append('Job_Level')
    
    return df

### Years of experience treatment

In [167]:
def engineer_yoe_features(df):
    """engineer years of experience-related features."""

    # create years of experience bins
    df = df.with_columns(
        pl.when(pl.col('Years of Experience') < 1).then(pl.lit('junior'))
        .when(pl.col('Years of Experience') < 3).then(pl.lit('mid'))
        .otherwise(pl.lit('senior')).alias('Experience_bin')
    )
    cat_cols.append('Experience_bin')
    
    return df

### Add interaction features

In [168]:
def create_interaction_features(df):
    """Create interaction features"""
    ed_age = df.group_by('Education Level').agg(
        pl.col('Age').mean().alias('Education_Age_mean'),
        pl.col('Age').std().alias('Education_Age_std')
    )
    
    df = df.join(ed_age, on='Education Level', how='left')

    num_cols.extend(['Education_Age_mean', 'Education_Age_std'])

    df = df.with_columns(
        (pl.col('Years of Experience') / pl.col('Age')).alias('YoE_Age_ratio'),
    )
    num_cols.append('YoE_Age_ratio')
    
    df = df.with_columns(
        (pl.col('Years of Experience') + pl.col('Years of Experience')).alias('YoE_squared'),
        (pl.col('Years of Experience') * pl.col('Age')).alias('YoE_Age_interaction')
    )

    num_cols.extend(['YoE_squared', 'YoE_Age_interaction'])

    return df

In [169]:
def perform_feature_engineering(df):
    """Perform all feature engineering steps."""
    # Remove ID column
       
    # Clean target variable
    df = clean_target_variable(df)
    
    # Engineer features
    df = engineer_age_features(df)
    df = engineer_gender_features(df)
    df = engineer_ed_features(df)
    df = engineer_jt_features(df)
    df = engineer_yoe_features(df)
    df = create_interaction_features(df)

    ids = df[ID]
    df = df.drop(ID)
    return df, ids

df_salary_people, ids = perform_feature_engineering(df_salary_people_raw)

Removed 2 rows with null Salary values


## Data preprocessing

In [170]:
# Salary has outliers, need to decide how to handle them

# age has 3 null values, need to decide how to fill them

# gender has 3 null values, need to decide how to fill them

# Education Level has 3 null values, need to decide how to fill them

# Job Title has 3 null values, need to decide how to fill them
# Job title has high cardinality, experiment extracting keywords

In [171]:
def create_preprocessing_pipeline():
    """Create sklearn preprocessing pipeline."""
    # numerical imputer
    num_imputer = SimpleImputer(strategy='median')
    # numerical scaler
    scaler = StandardScaler()
    # categorical imputer
    cat_imputer = SimpleImputer(strategy='most_frequent')
    # categorical encoder
    ohe_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

    # numerical pipeline
    num_pipeline = Pipeline([
        ('num_imputer', num_imputer),
        ('scaler', scaler)
    ])
    # categorical pipeline
    cat_pipeline = Pipeline([
        ('cat_imputer', cat_imputer),
        ('ohe_encoder', ohe_encoder)
    ])

    # column transformer

    pipeline = ColumnTransformer([
        ('num', num_pipeline, num_cols),
        ('cat_ohe', cat_pipeline, cat_cols),
        ])
    return pipeline


In [172]:
X = df_salary_people.drop(TARGET)
y = df_salary_people[TARGET]

In [173]:
# Analyze correlated features
corr_matrix = X.select(num_cols).to_pandas().corr()
fig = px.imshow(corr_matrix, text_auto=True, aspect="auto", color_continuous_scale='RdBu_r')
fig.update_layout(title='Correlation Matrix of Numerical Features')
fig.show()

## Metric definition

In [174]:
def metrics(y_true, y_pred):
    rmse = root_mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    return {'rmse': rmse, 'r2': r2}

## Cross validation training

In [175]:
def feature_importance(pipeline, model):
    """Get feature importance from the model."""
    importances = []
    if hasattr(model, 'feature_importances_'):
        importances = model.feature_importances_
    elif hasattr(model, 'coef_'):
        importances = np.abs(model.coef_)
    else:
        importances = np.zeros(len(pipeline.get_feature_names_out()))
    
    fe_impo = pl.DataFrame({
        'feature': pipeline.get_feature_names_out(),
        'importance': importances
    }).sort('importance', descending=True)
    
    return fe_impo

In [176]:
def cross_validation(model, X, y, seed = SEED, verbose=True):

    kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=seed)
    results = []
    importances_table = None
    oof_preds = np.zeros(len(y))

    for fold, (train_index, val_index) in enumerate(kf.split(X)):

        # print(f"Fold {fold + 1}/{N_FOLDS}")

        X_train, X_val = X[train_index].clone(), X[val_index].clone()
        y_train, y_val = y[train_index].clone(), y[val_index].clone()

        # Create preprocessing pipeline
        preprocessing_pipeline = create_preprocessing_pipeline()

        X_train = preprocessing_pipeline.fit_transform(X_train)
        X_val = preprocessing_pipeline.transform(X_val)

        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        oof_preds[val_index] = y_pred

        result = metrics(y_val, y_pred)
        results.append(result)

        # Get feature importance for this fold
        fe_impo = feature_importance(preprocessing_pipeline, model)

        
        
        # Rename the importance column to include fold number
        fe_impo = fe_impo.with_columns(
            pl.col('importance').alias(f'fold_{fold + 1}')
        ).drop('importance')
        
        # Initialize or join the importances table
        if importances_table is None:
            # First fold - create the base table
            importances_table = fe_impo
        else:
            # Subsequent folds - join on feature names
            importances_table = importances_table.join(
                fe_impo, 
                on='feature', 
                how='inner'
            )

        if verbose:
            # Print metrics for the current fold
            print(f"Fold {fold + 1} - RMSE: {result['rmse']:.2f}, R2: {result['r2']:.2f}")

    
    # Add summary statistics across folds
    fold_columns = [f'fold_{i+1}' for i in range(N_FOLDS)]
    
    # Calculate mean and std across only the fold columns (numeric)
    importances_table = importances_table.with_columns([
        pl.mean_horizontal([pl.col(col).round(2) for col in fold_columns]).alias('mean_importance'),
        pl.concat_list([pl.col(col).round(2) for col in fold_columns]).list.std().alias('std_importance')
    ]).drop(fold_columns)

    importances_table = importances_table.with_columns(
        (pl.col("mean_importance") / pl.col("mean_importance").sum() * 100).alias("mean_importance_pct").round(1),
    )
    # Round numeric columns to 2 decimals
    numeric_cols = ['mean_importance', 'std_importance']
    importances_table = importances_table.with_columns([
        pl.col(col).round(2) for col in numeric_cols
    ])

    # Sort by mean importance
    importances_table = importances_table.sort('mean_importance', descending=True)

    if verbose:
    # print mean metrics across folds
        avg_rmse = np.mean([result['rmse'] for result in results])
        avg_r2 = np.mean([result['r2'] for result in results])
        print(f"\nAverage RMSE across folds: {avg_rmse:.2f}")
        print(f"Average R2 across folds: {avg_r2:.2f}")
        
    return results, importances_table, oof_preds
# evaluate model with cv using multple seeds to assess stability
seeds = np.random.RandomState(SEED).choice(range(10000), size=5, replace=False).tolist()

def evaluate_model_seeds(model, X, y, seeds=seeds):

    results = []
    for s in seeds:

        seed_results, _, _ = cross_validation(model, X, y, s, verbose=False)

        avg_rmse = np.mean([result['rmse'] for result in seed_results]).round(2)
        avg_r2 = np.mean([result['r2'] for result in seed_results]).round(2)
        std_rmse = np.std([result['rmse'] for result in seed_results]).round(2)
        std_r2 = np.std([result['r2'] for result in seed_results]).round(2)


        results.append({
            'seed': s,
            'rmse': avg_rmse,
            'r2': avg_r2,
            'std_rmse': std_rmse,
            'std_r2': std_r2
        })

    eval_results = pl.DataFrame(results)

    return eval_results

## Model evaluation

In [177]:
def plot_error_vs_predicted(oof_preds_rf, y, ids):
    errors = pl.DataFrame({
        'id': ids,
        'actual': y,
        'predicted': oof_preds_rf
    })
    # calculate the mean and std of the errors
    errors = errors.with_columns(
        ((pl.col('predicted') - pl.col('actual')) ** 2).alias('squared_error')
    )
    errors = errors.with_columns(
        (pl.col('squared_error').sqrt()).alias('rmse')
    )

    mean_error = errors['rmse'].mean()
    std_error = errors['rmse'].std()

    fig = px.scatter(
        errors.to_pandas(),
        x='predicted',
        y='actual',
        hover_data=['id'],
        labels={'predicted': 'Predicted Values', 'actual': 'Real Values'},
        title=f'Real vs Predicted Values\nMean Error: {mean_error:.2f}, Std Error: {std_error:.2f}'
    )
    fig.add_shape(
        type='line',
        x0=errors['predicted'].min(),
        y0=errors['predicted'].min(),
        x1=errors['predicted'].max(),
        y1=errors['actual'].max(),
        line=dict(color='red', dash='dash')
    )
    fig.update_layout(
        width=800,
        height=800,
        legend_title_text='',
        showlegend=False
    )
    fig.show()

In [178]:
model_dummy = DummyRegressor(strategy='median')
results_dummy, _, _ = cross_validation(model_dummy, X, y)

Fold 1 - RMSE: 56941.49, R2: -0.08
Fold 2 - RMSE: 44369.66, R2: -0.03
Fold 3 - RMSE: 45707.65, R2: -0.05
Fold 4 - RMSE: 48455.89, R2: -0.01
Fold 5 - RMSE: 45943.56, R2: -0.07

Average RMSE across folds: 48283.65
Average R2 across folds: -0.05


In [179]:
# linear regression model
model_lr = LinearRegression()
results_lr, importances_lr, oof_preds_lr = cross_validation(model_lr, X, y)
importances_lr

Fold 1 - RMSE: 13416.38, R2: 0.94
Fold 2 - RMSE: 11765.45, R2: 0.93
Fold 3 - RMSE: 12475.77, R2: 0.92
Fold 4 - RMSE: 11426.19, R2: 0.94
Fold 5 - RMSE: 11488.77, R2: 0.93

Average RMSE across folds: 12114.51
Average R2 across folds: 0.93


feature,mean_importance,std_importance,mean_importance_pct
str,f64,f64,f64
"""cat_ohe__Job_Title_Category_Ex…",55216.31,10574.89,13.0
"""num__YoE_squared""",28807.05,11199.59,6.8
"""num__YoE_Age_ratio""",26997.0,7383.05,6.4
"""cat_ohe__Job_Title_Category_Co…",22510.27,3221.03,5.3
"""cat_ohe__Job_Title_Category_Ot…",20133.32,2374.56,4.7
"""cat_ohe__Job_Title_Category_An…",18689.08,2344.92,4.4
"""cat_ohe__Job_Title_Category_Di…",18057.52,3068.91,4.3
"""num__YoE_Age_interaction""",16006.98,7231.34,3.8
"""cat_ohe__Job_Level_Senior""",13910.82,888.24,3.3
"""cat_ohe__Years of Experience_1…",13173.43,2005.72,3.1


In [180]:
plot_error_vs_predicted(oof_preds_lr, y, ids)

In [181]:
lr_seed_results = evaluate_model_seeds(model_lr, X, y)
lr_seed_results

seed,rmse,r2,std_rmse,std_r2
i64,f64,f64,f64,f64
8431,12450.78,0.93,1024.42,0.01
176,11986.2,0.94,1326.56,0.01
3954,12422.59,0.93,1779.53,0.03
1946,14206.21,0.9,4819.77,0.07
1626,12611.81,0.93,1649.73,0.02


In [182]:
# lasso regression model
model_lasso = Lasso(alpha=20, random_state=SEED)
results_lasso, importances_lasso, oof_preds_lasso = cross_validation(model_lasso, X, y)
importances_lasso

Fold 1 - RMSE: 13188.70, R2: 0.94
Fold 2 - RMSE: 10811.77, R2: 0.94
Fold 3 - RMSE: 11932.23, R2: 0.93
Fold 4 - RMSE: 11529.97, R2: 0.94
Fold 5 - RMSE: 11028.56, R2: 0.94

Average RMSE across folds: 11698.25
Average R2 across folds: 0.94


feature,mean_importance,std_importance,mean_importance_pct
str,f64,f64,f64
"""cat_ohe__Job_Title_Category_Ex…",67480.9,10921.67,23.6
"""cat_ohe__Job_Title_Category_Di…",31563.8,3032.25,11.0
"""num__YoE_squared""",25632.1,5573.52,8.9
"""cat_ohe__Job_Level_Senior""",17199.91,1105.57,6.0
"""num__YoE_Age_ratio""",12723.05,2523.03,4.4
"""num__Age""",12226.39,2571.2,4.3
"""cat_ohe__Years of Experience_1…",10006.82,916.62,3.5
"""cat_ohe__Job_Title_Category_Co…",7916.34,1653.6,2.8
"""cat_ohe__Job_Level_Junior""",6644.84,917.44,2.3
"""cat_ohe__Gender_Male""",6594.13,610.53,2.3


In [183]:
plot_error_vs_predicted(oof_preds_lasso, y, ids)

In [184]:
lasso_seed_results = evaluate_model_seeds(model_lasso, X, y)
lasso_seed_results

seed,rmse,r2,std_rmse,std_r2
i64,f64,f64,f64,f64
8431,11642.18,0.94,686.54,0.01
176,11592.19,0.94,1108.82,0.01
3954,11874.95,0.93,1577.38,0.03
1946,11505.63,0.94,1239.33,0.01
1626,12071.57,0.93,1517.16,0.02


In [185]:
# ridge regression model
model_ridge = Ridge(alpha=1, random_state=SEED)
results_ridge, importances_ridge, oof_preds_ridge = cross_validation(model_ridge, X, y)
importances_ridge

Fold 1 - RMSE: 14553.44, R2: 0.93
Fold 2 - RMSE: 10892.96, R2: 0.94
Fold 3 - RMSE: 11919.41, R2: 0.93
Fold 4 - RMSE: 11962.83, R2: 0.94
Fold 5 - RMSE: 11104.56, R2: 0.94

Average RMSE across folds: 12086.64
Average R2 across folds: 0.93


feature,mean_importance,std_importance,mean_importance_pct
str,f64,f64,f64
"""cat_ohe__Job_Title_Category_Ex…",42190.9,9000.34,13.9
"""cat_ohe__Job_Title_Category_Co…",17985.46,2734.81,5.9
"""cat_ohe__Job_Title_Category_Di…",17277.52,2634.69,5.7
"""cat_ohe__Job_Title_Category_Ot…",16967.88,2063.73,5.6
"""cat_ohe__Job_Title_Category_An…",15317.29,2218.6,5.0
"""num__YoE_squared""",15110.77,4015.4,5.0
"""cat_ohe__Job_Level_Senior""",12719.11,919.87,4.2
"""num__Age""",11697.96,2221.97,3.9
"""cat_ohe__Job_Level_Junior""",9671.44,920.01,3.2
"""cat_ohe__Years of Experience_1…",9607.04,1594.89,3.2


In [186]:
plot_error_vs_predicted(oof_preds_ridge, y, ids)

In [187]:
ridge_seed_results = evaluate_model_seeds(model_ridge, X, y)
ridge_seed_results

seed,rmse,r2,std_rmse,std_r2
i64,f64,f64,f64,f64
8431,11893.54,0.94,868.49,0.01
176,11835.19,0.94,1351.58,0.01
3954,11910.56,0.93,1556.89,0.02
1946,11666.79,0.94,1503.59,0.01
1626,12257.09,0.93,1823.43,0.02


In [188]:
rfr = RandomForestRegressor(random_state=SEED)
results_rf, importances_rf, oof_preds_rf = cross_validation(rfr, X, y)
importances_rf

Fold 1 - RMSE: 14540.06, R2: 0.93
Fold 2 - RMSE: 9562.80, R2: 0.95
Fold 3 - RMSE: 8946.17, R2: 0.96
Fold 4 - RMSE: 13025.30, R2: 0.93
Fold 5 - RMSE: 10089.77, R2: 0.95

Average RMSE across folds: 11232.82
Average R2 across folds: 0.94


feature,mean_importance,std_importance,mean_importance_pct
str,f64,f64,f64
"""num__Age""",0.31,0.13,31.4
"""num__YoE_Age_interaction""",0.31,0.04,31.6
"""num__YoE_squared""",0.18,0.09,18.6
"""num__YoE_Age_ratio""",0.09,0.05,9.2
"""cat_ohe__Job_Level_Senior""",0.03,0.01,3.1
"""cat_ohe__Job_Title_Category_Di…",0.02,0.01,1.6
"""cat_ohe__Job_Title_Category_Ex…",0.01,0.01,1.4
"""num__Education_Age_mean""",0.01,0.01,1.4
"""cat_ohe__Education Level_Bache…",0.01,0.01,0.6
"""cat_ohe__Job_Title_Category_Ma…",0.01,0.0,0.8


In [189]:
plot_error_vs_predicted(oof_preds_rf, y, ids)

In [190]:
rfr_seed_results = evaluate_model_seeds(rfr, X, y)
rfr_seed_results

seed,rmse,r2,std_rmse,std_r2
i64,f64,f64,f64,f64
8431,11046.3,0.95,1404.25,0.01
176,11613.55,0.94,2194.38,0.02
3954,11098.38,0.94,1344.22,0.01
1946,11187.32,0.94,1658.73,0.01
1626,11303.38,0.94,2654.96,0.03


In [191]:
lgbm = LGBMRegressor(random_state=SEED,
                    n_estimators=7500,
                    learning_rate=0.01, 
                    verbosity=-1)
results_lgb, importances_lgbm, oof_preds_lgbm = cross_validation(lgbm, X, y)
importances_lgbm

Fold 1 - RMSE: 13703.52, R2: 0.94
Fold 2 - RMSE: 10357.49, R2: 0.94
Fold 3 - RMSE: 10241.49, R2: 0.95
Fold 4 - RMSE: 14057.66, R2: 0.92
Fold 5 - RMSE: 10056.77, R2: 0.95

Average RMSE across folds: 11683.38
Average R2 across folds: 0.94


feature,mean_importance,std_importance,mean_importance_pct
str,f64,f64,f64
"""num__YoE_Age_ratio""",19712.8,1939.59,25.1
"""num__YoE_Age_interaction""",13474.4,993.31,17.2
"""num__Age""",11358.2,846.53,14.5
"""num__YoE_squared""",4172.6,479.74,5.3
"""cat_ohe__Job_Title_Category_An…",3533.2,901.45,4.5
"""cat_ohe__Job_Title_Category_Ot…",3411.6,334.11,4.3
"""cat_ohe__Gender_Female""",3263.4,459.13,4.2
"""cat_ohe__Job_Title_Category_Ma…",2798.2,645.48,3.6
"""cat_ohe__Job_Title_Category_Di…",2664.0,167.23,3.4
"""cat_ohe__Job_Level_Senior""",2320.0,202.47,3.0


In [192]:
plot_error_vs_predicted(oof_preds_lgbm, y, ids)

In [193]:
lgbm_seed_results = evaluate_model_seeds(lgbm, X, y)
lgbm_seed_results

seed,rmse,r2,std_rmse,std_r2
i64,f64,f64,f64,f64
8431,11567.7,0.94,1726.5,0.01
176,12390.55,0.93,1804.6,0.02
3954,11691.68,0.94,1595.34,0.02
1946,11371.12,0.94,2145.83,0.02
1626,10991.12,0.94,1864.08,0.02


Stuff to do

1. Evaluate correlated features
1. Feature selection
1. Hyperparameter optimization
1. Modelblending
1. Model professionaldescription