In [2]:
import pickle
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, SplineTransformer, PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error, root_mean_squared_error
import mlflow
from mlflow.models import infer_signature

In [3]:
# Load the prepared dataset from a pickle file located relative to the
# notebook's working directory. Later cells call .rename(columns=...) and
# .drop(...) on the result, so it is presumably a pandas DataFrame — TODO confirm.
# NOTE(review): unpickling can execute arbitrary code; only load this file
# if it comes from a trusted source.
with open('../data/dataset.pkl', 'rb') as handle:
    df = pickle.load(handle)

In [4]:
# Give the label column a generic name, then separate the label (y)
# from the remaining feature columns (X).
df = df.rename(columns={'salary_in_usd': 'target'})
y = df['target']
X = df.drop(columns=['target'])

In [5]:
# Hold out 25% of the rows for evaluation. A fixed random_state makes the
# shuffle deterministic, so re-running the notebook from a fresh kernel
# produces the same train/test partition (and comparable metrics).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [6]:
# Split the feature column names by dtype: numeric columns and
# pandas 'category' columns. Both lists are printed for inspection.
num_features = X.select_dtypes(include='number').columns.tolist()
cat_features = X.select_dtypes(include='category').columns.tolist()
print(num_features)
print(cat_features)

['work_year', 'remote_ratio']
['experience_level', 'employment_type', 'job_title', 'employee_residence', 'company_location', 'company_size']


In [7]:
# Demonstrate a degree-2 polynomial expansion of the two numeric columns.
# The displayed array has six columns per row: a constant 1, the two
# original values, and their squares / pairwise product.
pf = PolynomialFeatures(degree=2)
pf.fit_transform(X_train.loc[:, ['work_year', 'remote_ratio']])

array([[1.000000e+00, 2.022000e+03, 1.000000e+02, 4.088484e+06,
        2.022000e+05, 1.000000e+04],
       [1.000000e+00, 2.020000e+03, 1.000000e+02, 4.080400e+06,
        2.020000e+05, 1.000000e+04],
       [1.000000e+00, 2.022000e+03, 1.000000e+02, 4.088484e+06,
        2.022000e+05, 1.000000e+04],
       ...,
       [1.000000e+00, 2.022000e+03, 1.000000e+02, 4.088484e+06,
        2.022000e+05, 1.000000e+04],
       [1.000000e+00, 2.023000e+03, 0.000000e+00, 4.092529e+06,
        0.000000e+00, 0.000000e+00],
       [1.000000e+00, 2.022000e+03, 1.000000e+02, 4.088484e+06,
        2.022000e+05, 1.000000e+04]])

In [8]:
# Demonstrate a cubic B-spline expansion (3 knots) of the numeric columns.
# fit_transform requires the input data; calling it with no arguments
# raises a TypeError, so pass the same numeric training columns used for
# the PolynomialFeatures demonstration.
sp = SplineTransformer(n_knots=3, degree=3)
sp.fit_transform(X_train[['work_year', 'remote_ratio']])