# Transformers

In [None]:
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer

## Scale Numeric Data

In [None]:
# Build a tiny single-column dataset to experiment with.
df = pd.DataFrame(data=[1, 2, 3, 4], columns=["x"])

# Report the column's summary statistics; ddof=0 requests the population
# standard deviation instead of pandas' default sample estimate (ddof=1).
print(f"Mean: {df['x'].mean():5.2f}")
print(f"Std:  {df['x'].std(ddof=0):5.2f}")

In [None]:
# Create the scaler and fit it to the data.
# Fitting learns the statistics inspected in the next cells
# (mean_, var_ and scale_).
ss = StandardScaler()
ss.fit(df)

In [None]:
# Mean learned by the scaler during fit (one entry per column of df).
ss.mean_

In [None]:
# Variance learned by the scaler during fit (one entry per column of df).
ss.var_

In [None]:
# Scaling factor derived from the variance: per the scikit-learn docs this is
# generally sqrt(var_), i.e. the population standard deviation of the column.
ss.scale_

In [None]:
# Transform the data using the statistics learned during fit.
ss.transform(df)

In [None]:
# Check output: reproduce the scaling by hand as (x - mean) / scale.
# The values should match what ss.transform(df) returned.
(df["x"] - ss.mean_) / ss.scale_

In [None]:
# Switch the scaler to pandas output and transform again.
# set_output() configures the scaler and returns it, so the calls can be chained.
ss.set_output(transform="pandas").transform(df)

## Vectorize Text Data

In [None]:
# A handful of short sentences to vectorize (one of them contains an accent).
df = pd.DataFrame(
    data=[
        "I agree completely.",
        "I completely agree.",
        "I don't agree at all.",
        "My name is Zoë.",
    ],
    columns=["texts"],
)

In [None]:
# Create the vectorizer.
# strip_accents="unicode" removes accents during preprocessing, so an accented
# word such as "Zoë" is counted without its diacritic.
cv = CountVectorizer(strip_accents="unicode")

In [None]:
# Fit to the texts column to learn the vocabulary.
cv.fit(df["texts"])

In [None]:
# Show the learned vocabulary: a dict mapping each term to its column index
# in the count matrix (the values are positions, not word frequencies).
cv.vocabulary_

In [None]:
# Transform the data into a document-term count matrix.
# transform() returns a SciPy sparse matrix; toarray() converts it to a plain
# NumPy ndarray for display (todense() would give the discouraged numpy.matrix).
cv.transform(df["texts"]).toarray()

In [None]:
# Get the feature names (the learned terms), one per column of the count matrix.
cv.get_feature_names_out()

In [None]:
# Combine to DataFrame.
# Note: CountVectorizer does not support set_output(), so the DataFrame is
# assembled by hand from the counts and the feature names.
pd.DataFrame(
    # toarray() yields a regular ndarray; todense() returns numpy.matrix,
    # which NumPy no longer recommends.
    data=cv.transform(df["texts"]).toarray(),
    columns=cv.get_feature_names_out(),
    # Reuse the source index so each row stays aligned with its text.
    index=df.index,
)