# Feature engineering

Воспользуемся датасетом `Adult`, в котором нужно предсказать уровень дохода человека (больше 50K или нет).

In [None]:
import pandas as pd
import numpy as np

# Name of the label column; later cells use it to separate labels from features.
TARGET = "income"

# "?" is parsed as a missing value (NaN) while reading the CSV.
data = pd.read_csv("data/adult.csv", na_values=["?"])
# Impute numeric columns with their column mean ...
data = data.fillna(data.mean(numeric_only=True))
# ... and every value that is still missing with the most frequent value of its column.
data = data.fillna(data.mode().iloc[0])
# Binarize the label: 1 where the raw value is ">50K", 0 otherwise.
data[TARGET] = (data[TARGET] == ">50K").astype(np.int64)
data.head()

Подготовим категориальные признаки.

In [None]:
# One-hot encode the listed columns; all other columns are passed through as they are.
data = pd.get_dummies(
    data,
    columns=[
        "workclass",
        "education",
        "marital-status",
        "occupation",
        "relationship",
        "race",
        "gender",
        "native-country",
    ],
)
data.head()

Выделим тестовый корпус.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test split; random_state is fixed at 0.
data_train, data_test = train_test_split(data, test_size=0.2, random_state=0)

# Split each part into features (everything except TARGET) and labels (TARGET).
X_train, y_train = data_train.drop(columns=[TARGET]), data_train[TARGET]
X_test, y_test = data_test.drop(columns=[TARGET]), data_test[TARGET]

print("Train size", len(X_train))
print("Test size", len(X_test))

Обучим линейную модель.

In [None]:
from pandas.api.types import is_numeric_dtype
from sklearn.linear_model import LogisticRegression


def run(X_train, y_train, X_test, y_test):
    """Fit a logistic regression on the numeric features and print its accuracy.

    Columns of ``X_train`` whose dtype is not numeric are dropped, and the
    same column selection is applied to ``X_test``. A ``LogisticRegression``
    is then fitted on the training split, and the train and test accuracies
    are printed.

    Args:
        X_train: Training features as a pandas DataFrame.
        y_train: Training labels; must expose ``to_numpy()``.
        X_test: Test features; must contain every numeric column of
            ``X_train``.
        y_test: Test labels; must expose ``to_numpy()``.

    Returns:
        None. Sizes, feature count and accuracies are reported via ``print``.
    """
    numeric_columns = [
        column for column in X_train.columns if is_numeric_dtype(X_train[column])
    ]
    dropped = len(X_train.columns) - len(numeric_columns)
    if dropped > 0:
        print("Drop", dropped, "categorical features")

    # The column list comes from the training frame so that both splits end up
    # with exactly the same features in the same order.
    X_train = X_train[numeric_columns]
    X_test = X_test[numeric_columns]
    print("Train size", len(X_train))
    print("Test size", len(X_test))
    print("Num features", len(X_train.columns))

    # Cast to float explicitly: a frame that mixes bool and integer columns
    # would otherwise be converted to an object-dtype array.
    X_train = X_train.to_numpy(dtype=float)
    X_test = X_test.to_numpy(dtype=float)
    y_train = y_train.to_numpy().ravel()
    y_test = y_test.to_numpy().ravel()

    model = LogisticRegression(max_iter=10000)
    model.fit(X_train, y_train)

    # Accuracy is the fraction of predictions equal to the true label.
    train_accuracy = (model.predict(X_train) == y_train).mean()
    print("Train accuracy: {}".format(train_accuracy))
    test_accuracy = (model.predict(X_test) == y_test).mean()
    print("Test accuracy: {}".format(test_accuracy))
    
# Baseline: fit and evaluate on the features as prepared above, before any normalization.
run(X_train, y_train, X_test, y_test)

**Задание 1.** Отнормируйте признаки.

*Подсказка.* Можно воспользоваться функциями `<df>.mean()` и `<df>.std()`.

In [None]:
def normalize(X_train, X_test):
    """Exercise stub (Task 1): return normalized versions of both splits.

    The body is intentionally left for the reader to fill in. As written,
    nothing inside the function assigns ``X_train_new`` or ``X_test_new``,
    so the ``return`` statement relies on the reader's code defining them.

    Args:
        X_train: Training features.
        X_test: Test features.

    Returns:
        The pair ``(X_train_new, X_test_new)``.
    """
    # NOTE(review): the check cell indexes the results with
    # ``.loc[row_label, column_name]``, so they are presumably expected to stay
    # DataFrames with the original index and columns -- confirm.
    #
    # Your code here.
    #
    
    return X_train_new, X_test_new

# Checks.
X_train_new, X_test_new = normalize(X_train, X_test)
# The normalized "educational-num" of train row 2516 must equal that of test
# row 38113 to within 1e-6.
# NOTE(review): presumably these two rows share the same raw value, so the
# check only passes when both splits are scaled with the same statistics -- confirm.
assert abs(X_train_new.loc[2516, "educational-num"] - X_test_new.loc[38113, "educational-num"]) < 1e-6
run(X_train_new, y_train, X_test_new, y_test)

Качество выросло?

**Задание 2.** Попробуйте добавить признаков в датасет и увеличить точность до 85.5%+ на тестовом корпусе.

In [None]:
def generate_features(df):
    """Exercise stub (Task 2): add engineered features to ``df``.

    The body is intentionally left for the reader to fill in. As written,
    the function returns ``df`` unchanged.

    Args:
        df: Feature table to extend.

    Returns:
        The feature table to use downstream (currently ``df`` itself).
    """
    #
    # Your code here.
    #
    return df

# Generate features on copies, so the frames passed to generate_features are
# not the original X_train / X_test objects.
X_train_new = generate_features(X_train.copy())
X_test_new = generate_features(X_test.copy())

# Normalize the extended feature tables, then fit and evaluate the model on them.
X_train_new, X_test_new = normalize(X_train_new, X_test_new)
run(X_train_new, y_train, X_test_new, y_test)