# Adding Interactions using scikit-learn

In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

In [None]:
# Read the penguins data, discard every row that has a missing value, and
# renumber the index so it has no gaps left behind by the dropped rows.
penguins = (
    pd.read_csv('data/penguins.csv')
    .dropna()
    .reset_index(drop = True)
)

In [None]:
# Preview the first rows of the columns this notebook works with.
penguins[[
    'species',
    'bill_length_mm',
    'bill_depth_mm',
    'flipper_length_mm',
    'sex',
    'body_mass_g',
]].head()

Let's start by building a model that predicts body mass from flipper length together with the species and sex variables.

In [None]:
# Predictors: one numeric measurement followed by the two categorical columns.
categorical_variables = ['species', 'sex']
variables = ['flipper_length_mm'] + categorical_variables

# Feature frame and target column.
X = penguins[variables]
y = penguins['body_mass_g']

# Hold out a test set. The split is stratified on species and uses a fixed
# seed so it is repeatable from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify = penguins['species'],
    random_state = 321,
)

We need to convert the categorical variables into dummy columns. To dummyize in a way that will work well with other scikit-learn tools, we can use the OneHotEncoder class from scikit-learn's preprocessing module.

In [None]:
from sklearn.preprocessing import OneHotEncoder

By default, this class returns a sparse matrix. We're going to turn the sparse output off so that it returns a regular (dense) array instead.

We'll also tell it to drop the first category that it sees so that we end up with one fewer column than category.

Notice that we have to pass in just the categorical variables.

In [None]:
# `sparse_output = False` makes the encoder return a regular (dense) NumPy array
# instead of a sparse matrix. The older spelling `sparse = False` was deprecated
# in scikit-learn 1.2 and removed in 1.4 (where it raises a TypeError), so this
# cell requires scikit-learn >= 1.2.
# `drop = 'first'` drops the first category of each variable, so each variable
# yields one fewer dummy column than it has categories.
ohe = OneHotEncoder(sparse_output = False, drop = 'first')

# Fit on the categorical columns only; the encoder learns its categories from
# the training data.
ohe.fit(X_train[categorical_variables])

In [None]:
# Apply the fitted encoder to the categorical training columns only.
ohe.transform(X_train.loc[:, categorical_variables])

If we want to get the resulting column names, we can do so using the `get_feature_names_out` method.

In [None]:
# Names of the dummy columns produced by the encoder, built from the original
# categorical column names.
ohe.get_feature_names_out(input_features = categorical_variables)

Now, what about our numeric variables? In order to be able to apply one-hot-encoding to some but not all of the columns, we can use the ColumnTransformer class.

The ColumnTransformer class lets us specify one or more transformations to apply to subsets of our columns.

In this case, we want to dummyize our categorical variables and leave everything else untouched. We'll tell it to do this by specifying to "passthrough" the remaining columns.

In [None]:
from sklearn.compose import ColumnTransformer

In [None]:
# One-hot encode only the categorical columns; `remainder = 'passthrough'`
# keeps every other column unchanged instead of dropping it.
# `sparse_output = False` replaces the `sparse = False` argument, which was
# deprecated in scikit-learn 1.2 and removed in 1.4 (requires scikit-learn >= 1.2).
ct = ColumnTransformer(
    transformers = [
        ('ohe', OneHotEncoder(sparse_output = False, drop = 'first'), categorical_variables),
    ],
    remainder = 'passthrough',
)

Notice that it outputs the dummy columns first followed by the remaining columns.

In [None]:
# Fit the column transformer on the training features and show the
# transformed array in one step.
ct.fit_transform(X = X_train)

We can extract out the OneHotEncoder using the named_transformers_ attribute of our column transformer.

In [None]:
# Look up the fitted encoder by the name it was registered under ('ohe') and
# ask it for the names of the dummy columns it produced.
ct.named_transformers_['ohe'].get_feature_names_out(input_features = categorical_variables)

Finally, we can combine this all together with our LinearRegression model using a Pipeline.

In [None]:
from sklearn.pipeline import Pipeline

In [None]:
# Preprocessing step: dummy-encode the categorical columns and pass the
# remaining columns through untouched.
# `sparse_output = False` replaces the `sparse = False` argument, which was
# deprecated in scikit-learn 1.2 and removed in 1.4 (requires scikit-learn >= 1.2).
column_transformer = ColumnTransformer(
    transformers = [
        ('ohe', OneHotEncoder(sparse_output = False, drop = 'first'), categorical_variables),
    ],
    remainder = 'passthrough',
)

# Chain the preprocessing with the regression so that fitting and predicting
# both run the same transformation.
pipe = Pipeline(
    steps = [
        ('ct', column_transformer),
        ('linreg', LinearRegression()),
    ]
)

pipe.fit(X_train, y_train)

In [None]:
# Score the fitted pipeline on the held-out test set. The predictions are
# computed once and reused for both metrics.
test_predictions = pipe.predict(X_test)

print(f'MSE: {mean_squared_error(y_test, test_predictions)}')
print(f'R2: {r2_score(y_test, test_predictions)}')

If we want to inspect the coefficients, we have to do a little bit of work to extract the column names.

In [None]:
# Rebuild the column labels in the order the ColumnTransformer emits them:
# the one-hot dummy columns first, then the passthrough (non-categorical) columns.
fitted_encoder = pipe['ct'].named_transformers_['ohe']
features = [
    *fitted_encoder.get_feature_names_out(categorical_variables),
    *(col for col in X_train.columns if col not in categorical_variables),
]

# Pair each label with its fitted coefficient, with the intercept listed first.
linreg = pipe['linreg']
coefficients = pd.DataFrame({
    'variable': ['intercept', *features],
    'coefficient': [linreg.intercept_, *linreg.coef_],
})
coefficients

**Question:** What does this model predict for a female Adelie penguin with a flipper length of 189 mm?

Now, let's add some interaction terms. We can do this using the PolynomialFeatures class.

In [None]:
from sklearn.preprocessing import PolynomialFeatures

The PolynomialFeatures class will create new features by multiplying our existing variables. For now, we'll specify `interaction_only = True`, which lets it know that we don't want to multiply a column by itself.

In [None]:
# Preprocessing step: dummy-encode the categorical columns and pass the
# remaining columns through untouched.
# `sparse_output = False` replaces the `sparse = False` argument, which was
# deprecated in scikit-learn 1.2 and removed in 1.4 (requires scikit-learn >= 1.2).
column_transformer = ColumnTransformer(
    transformers = [
        ('ohe', OneHotEncoder(sparse_output = False, drop = 'first'), categorical_variables),
    ],
    remainder = 'passthrough',
)

pipe = Pipeline(
    steps = [
        ('ct', column_transformer),
        # interaction_only = True: add products of distinct columns, no squares.
        # include_bias = False: no constant column; LinearRegression fits its
        # own intercept.
        ('pf', PolynomialFeatures(interaction_only = True, include_bias = False)),
        ('linreg', LinearRegression()),
    ]
)

pipe.fit(X_train, y_train)

# Predict once and reuse the result for both metrics.
test_predictions = pipe.predict(X_test)

print(f'MSE: {mean_squared_error(y_test, test_predictions)}')
print(f'R2: {r2_score(y_test, test_predictions)}')

Similar to the OneHotEncoder, the PolynomialFeatures class has a `get_feature_names_out` method.

In [None]:
# Names of the columns leaving the ColumnTransformer: dummy columns first,
# then the untouched passthrough columns.
dummy_names = list(pipe['ct'].named_transformers_['ohe'].get_feature_names_out(categorical_variables))
passthrough_names = [col for col in X_train.columns if col not in categorical_variables]

# PolynomialFeatures turns those input names into the names of its output
# columns (the original columns plus their pairwise products).
features = list(pipe['pf'].get_feature_names_out(dummy_names + passthrough_names))

# Intercept first, followed by one row per model column.
model = pipe['linreg']
coefficients = pd.DataFrame({
    'variable': ['intercept'] + features,
    'coefficient': [model.intercept_] + list(model.coef_),
})
coefficients

Notice that we have a column for species_Chinstrap * species_Gentoo. Since a penguin cannot be both Chinstrap and Gentoo, this column is always 0 and is therefore unnecessary. We can exclude it by using a VarianceThreshold, which will remove any columns that have variance 0 (or below whatever threshold we set).

In [None]:
from sklearn.feature_selection import VarianceThreshold

In [None]:
# Preprocessing step: dummy-encode the categorical columns and pass the
# remaining columns through untouched.
# `sparse_output = False` replaces the `sparse = False` argument, which was
# deprecated in scikit-learn 1.2 and removed in 1.4 (requires scikit-learn >= 1.2).
column_transformer = ColumnTransformer(
    transformers = [
        ('ohe', OneHotEncoder(sparse_output = False, drop = 'first'), categorical_variables),
    ],
    remainder = 'passthrough',
)

pipe = Pipeline(
    steps = [
        ('ct', column_transformer),
        ('pf', PolynomialFeatures(interaction_only = True, include_bias = False)),
        # With its default threshold, VarianceThreshold removes constant columns,
        # such as the product of two dummies from the same variable.
        ('vt', VarianceThreshold()),
        ('linreg', LinearRegression()),
    ]
)

pipe.fit(X_train, y_train)

# Predict once and reuse the result for both metrics.
test_predictions = pipe.predict(X_test)

print(f'MSE: {mean_squared_error(y_test, test_predictions)}')
print(f'R2: {r2_score(y_test, test_predictions)}')

In [None]:
# Step 1: names of the columns leaving the ColumnTransformer
# (dummy columns first, then the untouched passthrough columns).
ohe_names = pipe['ct'].named_transformers_['ohe'].get_feature_names_out(categorical_variables)
passthrough_names = [col for col in X_train.columns if col not in categorical_variables]

# Step 2: names after PolynomialFeatures has added the interaction columns.
expanded_names = list(pipe['pf'].get_feature_names_out(list(ohe_names) + passthrough_names))

# Step 3: keep only the names of the columns VarianceThreshold retained;
# get_support() is a boolean mask over its input columns.
kept_mask = pipe['vt'].get_support()
features = list(np.array(expanded_names)[kept_mask])

# Intercept first, followed by one row per surviving column.
coefficients = pd.DataFrame({
    'variable': ['intercept'] + features,
    'coefficient': [pipe['linreg'].intercept_] + list(pipe['linreg'].coef_),
})
coefficients

**Question:** What does this model predict for a male Adelie penguin with a flipper length of 194 mm?

Now, let's add some additional features.

In [None]:
# Widen the predictor set with the two bill measurements.
variables = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'species', 'sex']
categorical_variables = ['species', 'sex']

X = penguins[variables]
y = penguins['body_mass_g']

# Re-split with the same seed and stratification as before so the new feature
# set is evaluated with identical split settings.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 321, stratify = penguins['species'])

# Preprocessing step: dummy-encode the categorical columns and pass the
# remaining columns through untouched.
# `sparse_output = False` replaces the `sparse = False` argument, which was
# deprecated in scikit-learn 1.2 and removed in 1.4 (requires scikit-learn >= 1.2).
column_transformer = ColumnTransformer(
    transformers = [
        ('ohe', OneHotEncoder(sparse_output = False, drop = 'first'), categorical_variables),
    ],
    remainder = 'passthrough',
)

pipe = Pipeline(
    steps = [
        ('ct', column_transformer),
        ('pf', PolynomialFeatures(interaction_only = True, include_bias = False)),
        ('vt', VarianceThreshold()),
        ('linreg', LinearRegression()),
    ]
)

pipe.fit(X_train, y_train)

# Predict once and reuse the result for both metrics.
test_predictions = pipe.predict(X_test)

print(f'MSE: {mean_squared_error(y_test, test_predictions)}')
print(f'R2: {r2_score(y_test, test_predictions)}')

In [None]:
# Walk the column names through the same stages as the data:
# ColumnTransformer -> PolynomialFeatures -> VarianceThreshold.
encoder = pipe['ct'].named_transformers_['ohe']
numeric_columns = [name for name in X_train.columns if name not in categorical_variables]

# ColumnTransformer output order: dummy columns, then passthrough columns.
features = list(encoder.get_feature_names_out(categorical_variables)) + numeric_columns
# Expand to the interaction-term names.
features = list(pipe['pf'].get_feature_names_out(features))
# Drop the names of the columns VarianceThreshold filtered out.
features = list(np.array(features)[pipe['vt'].get_support()])

regression = pipe['linreg']
coefficients = pd.DataFrame({
    'variable': ['intercept', *features],
    'coefficient': [regression.intercept_, *regression.coef_],
})
coefficients

We can also add higher-degree terms using the PolynomialFeatures class by specifying `interaction_only = False`.
Notice, however, that we get a significantly more complex model for not a lot of gain.

In [None]:
# Preprocessing step: dummy-encode the categorical columns and pass the
# remaining columns through untouched.
# `sparse_output = False` replaces the `sparse = False` argument, which was
# deprecated in scikit-learn 1.2 and removed in 1.4 (requires scikit-learn >= 1.2).
column_transformer = ColumnTransformer(
    transformers = [
        ('ohe', OneHotEncoder(sparse_output = False, drop = 'first'), categorical_variables),
    ],
    remainder = 'passthrough',
)

pipe = Pipeline(
    steps = [
        ('ct', column_transformer),
        # interaction_only = False: in addition to the pairwise products, also
        # generate each column multiplied by itself (squared terms).
        ('pf', PolynomialFeatures(interaction_only = False, include_bias = False)),
        ('vt', VarianceThreshold()),
        ('linreg', LinearRegression()),
    ]
)

pipe.fit(X_train, y_train)

# Predict once and reuse the result for both metrics.
test_predictions = pipe.predict(X_test)

print(f'MSE: {mean_squared_error(y_test, test_predictions)}')
print(f'R2: {r2_score(y_test, test_predictions)}')

In [None]:
# Names entering PolynomialFeatures: dummy columns, then passthrough columns
# (the order in which the ColumnTransformer outputs them).
input_names = list(
    pipe['ct'].named_transformers_['ohe'].get_feature_names_out(categorical_variables)
)
input_names.extend(col for col in X_train.columns if col not in categorical_variables)

# Names after the polynomial expansion, then filtered down to the columns
# that VarianceThreshold kept.
polynomial_names = list(pipe['pf'].get_feature_names_out(input_names))
retained = pipe['vt'].get_support()
features = list(np.array(polynomial_names)[retained])

# Intercept first, followed by one row per retained column.
fitted_model = pipe['linreg']
coefficients = pd.DataFrame({
    'variable': ['intercept'] + features,
    'coefficient': [fitted_model.intercept_] + list(fitted_model.coef_),
})
coefficients

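Notice that, annoyingly, this also includes the squares of the one-hot encoded features alongside the features themselves. Because a 0/1 dummy column is equal to its own square, each such pair of columns is identical. There is not an easy way to remove them, but notice that the values of the coefficients are split between the regular and squared columns.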