In [86]:
import polars as pl
import numpy as np
import plotly.express as px

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor


In [87]:
SEED = 100622
TARGET = 'Salary'
ID = 'id'
N_FOLDS = 5

## Data read

In [88]:
df_salary_raw = pl.read_csv("../data/salary.csv")
df_people_raw = pl.read_csv("../data/people.csv")
df_descriptions_raw = pl.read_csv("../data/descriptions.csv")

In [89]:
fig = px.histogram(df_salary_raw,
                   x="Salary",
                   nbins=30,
                   title="Salary Distribution")
fig.show()

In [90]:
df_salary_people_raw = df_salary_raw.join(df_people_raw, 
                                      on="id", how="inner")
df_salary_people_raw

id,Salary,Age,Gender,Education Level,Job Title,Years of Experience
i64,f64,f64,str,str,str,f64
0,90000.0,32.0,"""Male""","""Bachelor's""","""Software Engineer""",5.0
1,65000.0,28.0,"""Female""","""Master's""","""Data Analyst""",3.0
2,150000.0,45.0,"""Male""","""PhD""","""Senior Manager""",15.0
3,60000.0,36.0,"""Female""","""Bachelor's""","""Sales Associate""",7.0
4,200000.0,52.0,"""Male""","""Master's""","""Director""",20.0
…,…,…,…,…,…,…
370,85000.0,35.0,"""Female""","""Bachelor's""","""Senior Marketing Analyst""",8.0
371,170000.0,43.0,"""Male""","""Master's""","""Director of Operations""",19.0
372,40000.0,29.0,"""Female""","""Bachelor's""","""Junior Project Manager""",2.0
373,90000.0,34.0,"""Male""","""Bachelor's""","""Senior Operations Coordinator""",7.0


In [91]:
df_salary_people_raw.null_count()

id,Salary,Age,Gender,Education Level,Job Title,Years of Experience
u32,u32,u32,u32,u32,u32,u32
0,2,5,5,5,5,2


In [92]:
df_salary_people_raw.filter(pl.col("Salary") < 3500)

id,Salary,Age,Gender,Education Level,Job Title,Years of Experience
i64,f64,f64,str,str,str,f64
259,350.0,29.0,"""Male""","""Bachelor's""","""Junior Business Operations Ana…",1.5


In [93]:
df_salary_people_raw['Job Title'].value_counts()

Job Title,count
str,u32
"""Junior Data Scientist""",1
"""Senior Data Analyst""",3
"""Sales Director""",1
"""Junior Web Developer""",1
"""Project Manager""",2
…,…
"""Director of Finance""",2
"""Event Coordinator""",2
"""Junior Marketing Manager""",3
"""Senior Software Developer""",3


In [94]:
df_salary_people_raw['Salary'].describe()

statistic,value
str,f64
"""count""",373.0
"""null_count""",2.0
"""mean""",100577.345845
"""std""",48240.013482
"""min""",350.0
"""25%""",55000.0
"""50%""",95000.0
"""75%""",140000.0
"""max""",250000.0


## Feature engineer

In [95]:
df_salary_people_raw.columns

['id',
 'Salary',
 'Age',
 'Gender',
 'Education Level',
 'Job Title',
 'Years of Experience']

In [96]:
def feature_eng(df):
    # drop id
    df = df.drop(ID)

    # Remove rows with null values in 'Salary' 2 totatl
    df = df.filter(pl.col("Salary").is_not_null())


    return df, num_cols, cat_cols

num_cols = ['Age']
cat_cols = ['Gender', 'Education Level', 'Job Title', 'Years of Experience']

df_salary_people, num_cols, cat_cols = feature_eng(df_salary_people_raw)

## Data preprocessing

In [97]:
# Salary has outliers, need to decide how to handle them

# age has 3 null values, need to decide how to fill them

# gender has 3 null values, need to decide how to fill them

# Education Level has 3 null values, need to decide how to fill them

# Job Title has 3 null values, need to decide how to fill them
# Job title has high cardinality, experiment extracting keywords

In [None]:
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

pipeline = ColumnTransformer([
    ('num', num_pipeline, num_cols),
    ('cat_oe', cat_pipeline, cat_cols),
])

## Train-test split

In [111]:
# Split the data into training and testing sets
X = df_salary_people.drop(TARGET).to_pandas()
y = df_salary_people[TARGET]
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

# fit and transform the training data, and transform the test data
X_train = pipeline.fit_transform(X_train_raw)
X_test = pipeline.transform(X_test_raw)

## Metric definition

In [105]:
def metrics(y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    return {'mse': mse, 'r2': r2}

## Cross validation training

In [106]:
def cross_calidation(model, X, y):

    kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)
    results = []

    for train_index, val_index in kf.split(X):

        X_train, X_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)

        result = metrics(y_val, y_pred)
        results.append(result)

    return results


In [116]:
def print_results(results):
    # Print results
    print("Cross-validation results:")
    for i, result in enumerate(results):
        print(f"Fold {i+1}: MSE = {result['mse']:.2f}, R² = {result['r2']:.4f}")

    # Calculate average performance
    avg_mse = np.mean([r['mse'] for r in results])
    avg_r2 = np.mean([r['r2'] for r in results])
    print(f"\nAverage MSE: {avg_mse:.2f}")
    print(f"Average R²: {avg_r2:.4f}")

## Model definition

In [None]:
# linear regression model
model_lr = LinearRegression()

## Model evaluation

In [117]:
results_lr = cross_calidation(model_lr, X_train, y_train)
print_results(results_lr)

Cross-validation results:
Fold 1: MSE = 197157744.96, R² = 0.8996
Fold 2: MSE = 171006517.94, R² = 0.9208
Fold 3: MSE = 220232116.93, R² = 0.9051
Fold 4: MSE = 167742674.07, R² = 0.9162
Fold 5: MSE = 360198382.70, R² = 0.8374

Average MSE: 223267487.32
Average R²: 0.8958


In [119]:
results_rf = cross_calidation(RandomForestRegressor(random_state=SEED), X_train, y_train)
print_results(results_rf)

Cross-validation results:
Fold 1: MSE = 120145196.35, R² = 0.9388
Fold 2: MSE = 221905798.75, R² = 0.8973
Fold 3: MSE = 224358473.50, R² = 0.9033
Fold 4: MSE = 188543271.00, R² = 0.9058
Fold 5: MSE = 411686308.85, R² = 0.8142

Average MSE: 233327809.69
Average R²: 0.8919
