In [1]:
import pandas as pd
from pathlib import Path
from glob import glob
from pyprojroot import here

import numpy as np

In [2]:
p = Path('.')
df = pd.concat([pd.read_csv(x) for x in p.glob("../data/Comp_*.csv")])

In [3]:
comp_data = df.drop(columns=["SalaryPY", "BonusPY", "BonusCY", "ID"])
comp_data["SalaryCY"] = np.log10(comp_data["SalaryCY"])
comp_data_dummy = pd.get_dummies(comp_data)

In [4]:
y = comp_data_dummy.pop("SalaryCY")

In [5]:
comp_data_dummy.sample(10)

Unnamed: 0,Years,Reports,Floor,Region_AIPAC,Region_Europe,Region_North America,Region_Switzerland,Region_UK,Title_Analyst,Title_Associate,...,Level_Executive,Level_Junior,Level_VP,Career_Junior,Career_Senior,Office_Corner,Office_Cubicle,Office_Shared,Retirement_Eligible,Retirement_Ineligible
8955,2,0,42,0,0,0,0,1,1,0,...,0,1,0,1,0,0,1,0,0,1
5877,18,389,31,0,0,0,0,1,0,0,...,1,0,0,0,1,1,0,0,0,1
771,2,0,46,0,0,0,0,1,1,0,...,0,1,0,1,0,0,1,0,0,1
22207,2,0,7,0,0,1,0,0,1,0,...,0,1,0,1,0,0,1,0,0,1
15238,2,0,47,1,0,0,0,0,1,0,...,0,1,0,1,0,0,1,0,0,1
20154,3,0,12,0,0,0,1,0,1,0,...,0,1,0,1,0,0,1,0,0,1
3903,1,0,11,1,0,0,0,0,1,0,...,0,1,0,1,0,0,1,0,0,1
11568,4,4,22,1,0,0,0,0,0,1,...,0,1,0,1,0,0,1,0,0,1
18466,15,31,31,0,0,0,1,0,0,0,...,0,0,1,0,1,0,0,1,0,1
18392,3,0,5,0,0,0,1,0,1,0,...,0,1,0,1,0,0,1,0,0,1


In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    comp_data_dummy, y, test_size=.2, random_state=42
)

In [7]:
from sklearn.pipeline import Pipeline

from sklearn.feature_selection import VarianceThreshold
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import Normalizer
from sklearn.linear_model import ElasticNet

In [8]:
pipe = Pipeline([
    ("selector",  VarianceThreshold()),
    ("impute", IterativeImputer(max_iter=10, random_state=0)),
    ("scaler", Normalizer()),
    ("en", ElasticNet())
])

In [9]:
pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('selector', VarianceThreshold(threshold=0.0)),
                ('impute',
                 IterativeImputer(add_indicator=False, estimator=None,
                                  imputation_order='ascending',
                                  initial_strategy='mean', max_iter=10,
                                  max_value=None, min_value=None,
                                  missing_values=nan, n_nearest_features=None,
                                  random_state=0, sample_posterior=False,
                                  skip_complete=False, tol=0.001, verbose=0)),
                ('scaler', Normalizer(copy=True, norm='l2')),
                ('en',
                 ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True,
                            l1_ratio=0.5, max_iter=1000, normalize=False,
                            positive=False, precompute=False, random_state=None,
                            selection='cyclic', tol=0.0001,
          

In [10]:
pipe.score(X_test, y_test)

-2.099150320233889e-07

In [11]:
param_grid = {'en__alpha': [1],
              'en__l1_ratio': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]}

In [None]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(pipe, param_grid=param_grid)
grid.fit(X_train, y_train)

In [None]:
print("best cross-validation accuracy:", grid.best_score_)
print("test set score: ", grid.score(X_test, y_test))
print("best parameters: ", grid.best_params_)