In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import VarianceThreshold

from sklearn.metrics import accuracy_score

In [None]:
# Load the data, drop rows with missing values, and renumber the remaining rows from 0.
# drop = True discards the old index instead of keeping it as an extra 'index' column.
penguins = pd.read_csv('data/penguins.csv').dropna().reset_index(drop = True)

Let's start with a regular logistic regression model including all of our predictors and interaction terms.

In [None]:
variables = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g', 'sex']
categorical_variables = ['sex']

X = penguins[variables]
y = penguins['species']

# Stratify on the target so both splits keep the same mix of species.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 321, stratify = y)

pipe = Pipeline(
    steps = [
        # One-hot encode the categorical columns (dropping the first level) and pass the
        # remaining columns through unchanged.
        # sparse_threshold = 0 makes the ColumnTransformer always return a dense array.
        # It replaces OneHotEncoder(sparse = False): the `sparse` argument was deprecated
        # in scikit-learn 1.2 and removed in 1.4.
        ('ct', ColumnTransformer(
            transformers = [
                ('ohe', OneHotEncoder(drop = 'first'), categorical_variables)
            ],
            remainder = 'passthrough',
            sparse_threshold = 0)),
        # Add the pairwise interaction terms (no squared terms, no constant column).
        ('pf', PolynomialFeatures(interaction_only = True, include_bias = False)),
        # Remove any columns that are constant in the training data.
        ('vt', VarianceThreshold()),
        ('scaler', StandardScaler()),
        ('logistic', LogisticRegression())
    ]
)

pipe.fit(X_train, y_train)

In [None]:
# Rebuild the column names in the order the pipeline produces them: the one-hot encoded
# columns first, followed by the columns that were passed through.
# get_feature_names_out replaces get_feature_names, which was removed in scikit-learn 1.2.
ohe = pipe['ct'].named_transformers_['ohe']
features = list(ohe.get_feature_names_out(categorical_variables))
features += [x for x in X_train.columns if x not in categorical_variables]

# Expand the names to include the interaction terms, then keep only the columns
# that made it through the variance filter.
features = pipe['pf'].get_feature_names_out(features)
features = list(np.array(features)[pipe['vt'].get_support()])

# The model has one row of coefficients per class; pick the row for this species.
species = 'Chinstrap'
logistic = pipe['logistic']
idx = list(logistic.classes_).index(species)

coefficients = pd.DataFrame({
    'variable': ['intercept'] + features,
    'coefficient': [logistic.intercept_[idx]] + list(logistic.coef_[idx])
})

coefficients

Let's say that we want to use a lasso model so that we can get a simpler model. There is not a separate lasso classification model, but we can instead change the arguments to the LogisticRegression model. See the documentation for more information: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

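We need to change the penalty argument from the default of 'l2' (for ridge) to 'l1' (for lasso). We also have to change the solver that is used, because the default solver ('lbfgs') does not support the 'l1' penalty. Here we use 'saga', which does.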

Notice also that we need to scale our variables before passing them to the model. We'll do this with a StandardScaler.

In [None]:
variables = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g', 'sex']
categorical_variables = ['sex']

X = penguins[variables]
y = penguins['species']

# Stratify on the target so both splits keep the same mix of species.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 321, stratify = y)

pipe = Pipeline(
    steps = [
        # One-hot encode the categorical columns (dropping the first level) and pass the
        # remaining columns through unchanged.
        # sparse_threshold = 0 makes the ColumnTransformer always return a dense array.
        # It replaces OneHotEncoder(sparse = False): the `sparse` argument was deprecated
        # in scikit-learn 1.2 and removed in 1.4.
        ('ct', ColumnTransformer(
            transformers = [
                ('ohe', OneHotEncoder(drop = 'first'), categorical_variables)
            ],
            remainder = 'passthrough',
            sparse_threshold = 0)),
        # Add the pairwise interaction terms (no squared terms, no constant column).
        ('pf', PolynomialFeatures(interaction_only = True, include_bias = False)),
        # Remove any columns that are constant in the training data.
        ('vt', VarianceThreshold()),
        # The penalty acts on the size of the coefficients, so the features are put on a common scale first.
        ('scaler', StandardScaler()),
        # Lasso-style logistic regression: l1 penalty with the saga solver.
        ('logistic', LogisticRegression(penalty = 'l1', solver = 'saga', max_iter = 10000))
    ]
)

pipe.fit(X_train, y_train)

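We end up with a fairly simple model using just the default value of C = 1. (C is the inverse of the regularization strength, so smaller values of C mean a stronger penalty.)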

In [None]:
# Rebuild the column names in the order the pipeline produces them: the one-hot encoded
# columns first, followed by the columns that were passed through.
# get_feature_names_out replaces get_feature_names, which was removed in scikit-learn 1.2.
ohe = pipe['ct'].named_transformers_['ohe']
features = list(ohe.get_feature_names_out(categorical_variables))
features += [x for x in X_train.columns if x not in categorical_variables]

# Expand the names to include the interaction terms, then keep only the columns
# that made it through the variance filter.
features = pipe['pf'].get_feature_names_out(features)
features = list(np.array(features)[pipe['vt'].get_support()])

# The model has one row of coefficients per class; pick the row for this species.
species = 'Chinstrap'
logistic = pipe['logistic']
idx = list(logistic.classes_).index(species)

coefficients = pd.DataFrame({
    'variable': ['intercept'] + features,
    'coefficient': [logistic.intercept_[idx]] + list(logistic.coef_[idx])
})

# Show only the terms that the l1 penalty did not shrink all the way to zero.
coefficients[coefficients['coefficient'] != 0]

It might be the case that the default value of C is not the best possible one.

If we wanted to try out other values, we should use k-fold cross-validation. We can do that using the GridSearchCV class.

In [None]:
from sklearn.model_selection import GridSearchCV

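For this, we need to give it the estimator and a grid of hyperparameter values to try out. This grid needs to be a dictionary where the keys specify the hyperparameter and the values are a list of values to try out. Because our estimator is a pipeline, each key is written as the name of the step, two underscores, and then the name of the hyperparameter (for example, `logistic__C`).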

You can also specify the metric used to score each combination of hyperparameter values.

In [None]:
# Candidate values of C to try for the 'logistic' step of the pipeline.
param_grid = {'logistic__C': [1, 0.5, 0.1, 0.05, 0.01]}

# Each candidate is scored by its cross-validated accuracy.
gs = GridSearchCV(
    estimator = pipe,
    param_grid = param_grid,
    scoring = 'accuracy'
)

In [None]:
# Run the grid search using the training split only.
gs.fit(X_train, y_train)

After fitting, we can see the best parameters.

In [None]:
# The hyperparameter values that the grid search selected.
gs.best_params_

If we need to view additional information, we can access the `best_estimator_` attribute.

In [None]:
# The estimator that the grid search stored as its best one.
gs.best_estimator_

In [None]:
# Read everything from the estimator chosen by the grid search, so that the feature names,
# the variance filter and the class order all come from the same fitted pipeline as the
# coefficients (rather than from the separately fitted `pipe`).
best_pipe = gs.best_estimator_

# Rebuild the column names in the order the pipeline produces them: the one-hot encoded
# columns first, followed by the columns that were passed through.
# get_feature_names_out replaces get_feature_names, which was removed in scikit-learn 1.2.
ohe = best_pipe['ct'].named_transformers_['ohe']
features = list(ohe.get_feature_names_out(categorical_variables))
features += [x for x in X_train.columns if x not in categorical_variables]

# Expand the names to include the interaction terms, then keep only the columns
# that made it through the variance filter.
features = best_pipe['pf'].get_feature_names_out(features)
features = list(np.array(features)[best_pipe['vt'].get_support()])

# The model has one row of coefficients per class; pick the row for this species.
species = 'Gentoo'
logistic = best_pipe['logistic']
idx = list(logistic.classes_).index(species)

coefficients = pd.DataFrame({
    'variable': ['intercept'] + features,
    'coefficient': [logistic.intercept_[idx]] + list(logistic.coef_[idx])
})

# Show only the terms that the l1 penalty did not shrink all the way to zero.
coefficients[coefficients['coefficient'] != 0]

We can generate predictions using the `predict` method of the GridSearchCV object.

In [None]:
# Predict the held-out test set with the grid search object, then compare against the true labels.
y_pred = gs.predict(X_test)
accuracy_score(y_test, y_pred)