# Feature engineering

Воспользуемся корпусом `Adult`, в котором нужно предсказать уровень дохода человека (больше или меньше 50k).

In [4]:
import pandas as pd
import numpy as np

TARGET = "income"

# "?" in the CSV is read as a missing value.
data = pd.read_csv("data/adult.csv", na_values=["?"])
# Impute gaps: numeric columns get their column mean; whatever is still
# missing afterwards (the non-numeric columns) gets the most frequent value
# of its column.
# NOTE(review): these statistics are computed over all rows, before the
# train/test split performed later in the notebook, so held-out rows
# influence the imputation values.
data = data.fillna(data.mean(numeric_only=True))
data = data.fillna(data.mode().iloc[0])
# Binary target: 1 for ">50K", 0 otherwise. Uses TARGET rather than repeating
# the column name so the constant stays the single source of truth.
data[TARGET] = (data[TARGET] == ">50K").astype(np.int64)
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,0
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,0
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,1
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,1
4,18,Private,103497,Some-college,10,Never-married,Prof-specialty,Own-child,White,Female,0,0,30,United-States,0


Подготовим категориальные признаки.

In [5]:
# One-hot encode the listed categorical columns; each becomes a set of
# "<column>_<value>" indicator columns, the remaining columns are kept as is.
categorical_columns = [
    "workclass",
    "education",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "gender",
    "native-country",
]
data = pd.get_dummies(data, columns=categorical_columns)
data.head()

Unnamed: 0,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week,income,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
0,25,226802,7,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,38,89814,9,0,0,50,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,28,336951,12,0,0,40,1,0,1,0,...,0,0,0,0,0,0,0,1,0,0
3,44,160323,10,7688,0,40,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,18,103497,10,0,0,30,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


Выделим тестовый корпус.

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test part; the fixed random_state makes
# the split reproducible between runs.
data_train, data_test = train_test_split(data, random_state=0, test_size=0.2)

# Features are every column except the target; the target is kept separately.
X_train = data_train.drop(columns=[TARGET])
X_test = data_test.drop(columns=[TARGET])
y_train, y_test = data_train[TARGET], data_test[TARGET]

print("Train size", X_train.shape[0])
print("Test size", X_test.shape[0])

Train size 39073
Test size 9769


Обучим линейную модель.

In [7]:
from pandas.api.types import is_numeric_dtype
from sklearn.linear_model import LogisticRegression


def run(X_train, y_train, X_test, y_test):
    """Fit logistic regression on the numeric features and print its accuracy.

    Columns of ``X_train`` that are not numeric are dropped from both frames
    (the number dropped is printed), then a ``LogisticRegression`` with
    ``max_iter=10000`` is fitted on the training part and the share of
    correct predictions is printed for the train and the test part.

    Args:
        X_train: Training features (DataFrame).
        y_train: Training labels (Series).
        X_test: Test features with the same columns as ``X_train``.
        y_test: Test labels.

    Returns:
        None. All results are printed.
    """
    numeric = [name for name in X_train.columns if is_numeric_dtype(X_train[name])]
    n_dropped = len(X_train.columns) - len(numeric)
    if n_dropped > 0:
        print("Drop", n_dropped, "categorical features")

    # The column list comes from the training frame and is applied to both.
    train_frame, test_frame = X_train[numeric], X_test[numeric]
    print("Train size", len(train_frame))
    print("Test size", len(test_frame))
    print("Num features", len(train_frame.columns))

    train_x, train_y = train_frame.to_numpy(), y_train.to_numpy().flatten()
    test_x, test_y = test_frame.to_numpy(), y_test.to_numpy().flatten()

    model = LogisticRegression(max_iter=10000)
    model.fit(train_x, train_y)

    # Accuracy = fraction of samples whose predicted label equals the true one.
    for template, features, labels in (
        ("Train accuracy: {}", train_x, train_y),
        ("Test accuracy: {}", test_x, test_y),
    ):
        hits = model.predict(features) == labels
        print(template.format(hits.mean()))
    
# Baseline: train and evaluate on the one-hot encoded features as they are,
# without any scaling.
run(X_train, y_train, X_test, y_test)

Train size 39073
Test size 9769
Num features 105
Train accuracy: 0.7979167199856678
Test accuracy: 0.7986487869792199


**Задание 1.** Отнормируйте признаки.

*Подсказка.* Можно воспользоваться функциями `<df>.mean()` и `<df>.std()`.

In [8]:
def normalize(X_train, X_test, eps=1e-6):
    """Standardize numeric features using statistics of the training frame.

    Every numeric column is shifted by its training mean and divided by its
    training standard deviation plus ``eps``. The same training statistics
    are applied to both frames, so equal raw values map to equal normalized
    values in train and test.

    Non-numeric columns are returned unchanged. Previously such a column made
    the subtraction raise ``TypeError`` even though the statistics were
    already computed with ``numeric_only=True``.

    Args:
        X_train: Training features (DataFrame); source of mean and std.
        X_test: Test features (DataFrame), scaled with the training statistics.
        eps: Small constant added to the standard deviation so that constant
            columns (std == 0) do not cause a division by zero.

    Returns:
        Tuple ``(X_train_new, X_test_new)`` of new DataFrames; the inputs are
        not modified.
    """
    # Local import keeps the function usable on its own.
    from pandas.api.types import is_numeric_dtype

    mean = X_train.mean(numeric_only=True)
    scale = X_train.std(numeric_only=True) + eps

    def _standardize(df):
        # Scale one frame with the training statistics captured above.
        passthrough = [
            name for name, dtype in df.dtypes.items() if not is_numeric_dtype(dtype)
        ]
        if not passthrough:
            # All-numeric frame: exactly the original expression.
            return (df - mean) / scale
        scaled = (df.drop(columns=passthrough) - mean) / scale
        # Start from a copy so non-numeric columns keep their values and
        # position, then overwrite the numeric ones with their scaled version.
        result = df.copy()
        for name in scaled.columns:
            result[name] = scaled[name]
        return result

    return _standardize(X_train), _standardize(X_test)

# Checks.
X_train_new, X_test_new = normalize(X_train, X_test)
# Train row 2516 and test row 38113 must end up with the same normalized
# "educational-num" (presumably their raw values are equal — TODO confirm
# against the data), which only holds if both frames are scaled with the
# same statistics.
assert abs(X_train_new.loc[2516, "educational-num"] - X_test_new.loc[38113, "educational-num"]) < 1e-6
# Re-train on the normalized features to compare with the unscaled run.
run(X_train_new, y_train, X_test_new, y_test)

Train size 39073
Test size 9769
Num features 105
Train accuracy: 0.8529931154505669
Test accuracy: 0.8486027228989661


Качество выросло?

**Задание 2.** Попробуйте добавить признаков в датасет и увеличить точность до 85.5%+ на тестовом корпусе.

In [9]:
def generate_features(df, reference=None):
    """Add simple non-linear transforms of every multi-valued column to ``df``.

    For each column with more than three distinct values, five columns are
    appended:

    * ``<col>-s2``   -- the value squared
    * ``<col>-s3``   -- the value cubed
    * ``<col>-mabs`` -- absolute deviation from the column mean
    * ``<col>-log``  -- ``log(max(1, value))``
    * ``<col>-z``    -- whether the value equals zero

    The transforms are computed as vectorized float64 operations, so large
    integers cannot wrap around in the powers.

    Args:
        df: Frame to extend. It is modified in place and also returned.
        reference: Optional frame that supplies the statistics (which columns
            have more than three distinct values, and the mean used by
            ``-mabs``). Pass the untransformed training frame when expanding
            a test frame so both get the same features from the same
            statistics. By default ``df`` itself is used, so two frames can
            end up with different column sets and different means.

    Returns:
        The same ``df`` object with the new columns added.
    """
    stats = df if reference is None else reference
    # The list is fixed before any column is added, so derived columns are
    # never expanded again within the same call.
    columns = [
        column
        for column in stats.columns
        if column in df.columns and stats[column].nunique(dropna=False) > 3
    ]
    for column in columns:
        values = df[column].astype("float64")
        df[column + "-s2"] = values ** 2
        df[column + "-s3"] = values ** 3
        df[column + "-mabs"] = (values - stats[column].mean()).abs()
        # where() keeps values above 1 and replaces everything else with 1,
        # i.e. max(1, x) element-wise, so the logarithm is never negative.
        df[column + "-log"] = np.log(values.where(values > 1, 1.0))
        df[column + "-z"] = values == 0
    return df

# generate_features adds its columns to the frame it receives, so it is given
# copies and the original X_train / X_test stay untouched.
X_train_new = generate_features(X_train.copy())
X_test_new = generate_features(X_test.copy())
# Scale the extended feature set, then train and evaluate on it.
X_train_new, X_test_new = normalize(X_train_new, X_test_new)
run(X_train_new, y_train, X_test_new, y_test)

Train size 39073
Test size 9769
Num features 135
Train accuracy: 0.8609269828270161
Test accuracy: 0.8563824342307299
