In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import load_wine
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

pd.set_option('display.max_columns', 500)

#### Нормализация

###### Standard Scaler

$$z = \frac{x-\mu}{\sigma}$$
$$\mu \text{ - среднее по фиче, }\sigma \text{ - ее стандартное отклонение}$$

In [None]:
from sklearn.preprocessing import StandardScaler

# Ten random integers in [0, 100) as a single-feature column.
# A dedicated seeded generator keeps the demo values (and the plots below)
# identical across "Restart & Run All" without touching NumPy's global RNG state.
arr = np.random.RandomState(45).randint(0, 100, 10).reshape(-1, 1)
arr

In [None]:
plt.scatter(range(10), arr)

In [None]:
StandardScaler().fit_transform(arr)

In [None]:
# Keep the fitted scaler around (instead of a one-shot fit_transform) so the
# transformation can be undone with inverse_transform in the next cell.
scaler = StandardScaler().fit(arr)
arr_scaled = scaler.transform(arr)

In [None]:
scaler.inverse_transform(arr_scaled)

In [None]:
# Manual z-score from the formula above, for comparison with the
# StandardScaler output; arr.std() is NumPy's default (population) std.
(arr - arr.mean())/arr.std()

In [None]:
plt.scatter(range(10), StandardScaler().fit_transform(arr))

###### MinMax Scaler

$$X_{norm} = \frac{X-X_{min}}{X_{max}-X_{min}}$$

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
MinMaxScaler().fit_transform(arr)

In [None]:
# Manual min-max scaling from the formula above, for comparison with the
# MinMaxScaler output in the previous cell.
(arr - arr.min())/(arr.max() - arr.min())

In [None]:
plt.scatter(range(10), MinMaxScaler().fit_transform(arr))

###### Тестируем

In [None]:
# Load the wine data set into a labelled DataFrame and hold out a test split.
dataset = load_wine()

features = dataset['feature_names']
X = pd.DataFrame(dataset['data'], columns=features)
y = dataset['target']

# Fixed seed: the scores printed below are compared across several cells, so
# the split must be the same on every "Restart & Run All".
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=45)

In [None]:
X_train.head()

In [None]:
y

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Baseline: fit each model on the raw (unscaled) features and report accuracy.
for clf_ in [DecisionTreeClassifier, RandomForestClassifier, KNeighborsClassifier]:
    try:
        clf = clf_(random_state=45)
    except TypeError:
        # Estimators without a random_state parameter (KNeighborsClassifier)
        # reject the keyword; any other error should surface, not be swallowed.
        clf = clf_()
    clf.fit(X_train, y_train)
    print(f'{clf}: Train score: {clf.score(X_train, y_train):0.2f}, Test score: {clf.score(X_test, y_test):0.2f}')

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = (scaler.transform(split) for split in (X_train, X_test))

In [None]:
pd.DataFrame(X_train_scaled, columns=features).head()

In [None]:
# Same comparison as before, now on the standardised features.
for clf_ in [DecisionTreeClassifier, RandomForestClassifier, KNeighborsClassifier]:
    try:
        clf = clf_(random_state=45)
    except TypeError:
        # Estimators without a random_state parameter (KNeighborsClassifier)
        # reject the keyword; any other error should surface, not be swallowed.
        clf = clf_()
    clf.fit(X_train_scaled, y_train)
    print(f'{clf}: Train score: {clf.score(X_train_scaled, y_train):0.2f}, '
          f'Test score: {clf.score(X_test_scaled, y_test):0.2f}')

In [None]:
from sklearn.decomposition import PCA

# Project the *unscaled* training features onto two principal components.
# NOTE(review): without scaling the components are presumably dominated by the
# features with the largest raw variance; compare with the scaled projection
# computed further down.
X_train_pca = PCA(n_components=2).fit_transform(X_train)
X_train_pca[:5]

In [None]:
plt.scatter(X_train_pca[:,0], X_train_pca[:,1], c=y_train)

In [None]:
# Fit a 2-component PCA on the standardised training features and keep the
# fitted object so the test split can be projected with the same components.
pca = PCA(n_components=2).fit(X_train_scaled)
X_train_scaled_pca = pca.transform(X_train_scaled)
X_train_scaled_pca[:5]

In [None]:
plt.scatter(X_train_scaled_pca[:,0], X_train_scaled_pca[:,1], c=y_train)

In [None]:
# Project the scaled test split once (the original transformed it twice, once
# per axis) using the PCA fitted on the training split, then plot by class.
X_test_scaled_pca = pca.transform(X_test_scaled)
plt.scatter(X_test_scaled_pca[:, 0], X_test_scaled_pca[:, 1], c=y_test)

In [None]:
# Same comparison on the 2-D PCA projection of the scaled features.
# The test projection does not depend on the model, so compute it once.
X_test_scaled_pca = pca.transform(X_test_scaled)
for clf_ in [DecisionTreeClassifier, RandomForestClassifier, KNeighborsClassifier]:
    try:
        # Seed the stochastic models exactly like the two earlier comparison
        # loops so the three sets of scores are comparable and reproducible.
        clf = clf_(random_state=45)
    except TypeError:
        # KNeighborsClassifier has no random_state parameter.
        clf = clf_()
    clf.fit(X_train_scaled_pca, y_train)
    print(f'{clf}: Train score: {clf.score(X_train_scaled_pca, y_train):0.2f}, '
          f'Test score: {clf.score(X_test_scaled_pca, y_test):0.2f}')

#### Кодирование фичей

данные https://www.kaggle.com/c/cat-in-the-dat-ii/data?select=train.csv

In [None]:
data = pd.read_csv('train.csv')

In [None]:
data.head()

In [None]:
# Baseline attempt on the raw, not yet encoded frame.
y = data['target']
X = data.drop('target', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y)

# The raw frame still holds string categories (e.g. 'T'/'F', 'Novice'), which
# the estimator cannot convert to floats. Show that failure as output instead
# of letting the exception abort "Run All"; the encoding sections below fix it.
try:
    clf = RandomForestClassifier().fit(X_train, y_train)
    # classification_report returns a multi-line string, so print it.
    print(classification_report(y_test, clf.predict(X_test)))
except ValueError as error:
    print(f'Cannot fit on the raw data: {error}')

In [None]:
def description(df):
    """Print the frame's shape and return a per-column summary table.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with the columns ``Name``, ``dtypes``,
        ``Missing`` (number of nulls), ``Uniques`` (``df.nunique()``) and
        ``First Value`` / ``Second Value`` / ``Third Value`` holding the values
        of the first three rows (NaN when ``df`` has fewer rows).
    """
    print(f"Dataset Shape: {df.shape}")
    # Build the table in one go rather than adding columns to a column-subset
    # of another frame, which can trigger SettingWithCopyWarning.
    summary = pd.DataFrame({
        'Name': df.columns,
        'dtypes': df.dtypes.values,
        'Missing': df.isnull().sum().values,
        'Uniques': df.nunique().values,
    })
    sample_labels = ('First Value', 'Second Value', 'Third Value')
    for position, label in enumerate(sample_labels):
        # Frames with fewer than three rows used to raise IndexError here;
        # pad the missing sample columns with NaN instead.
        summary[label] = df.iloc[position].values if position < len(df) else np.nan
    return summary
description(data)

###### Binary Feature Encoding

In [None]:
data['bin_4'].value_counts()

In [None]:
# Turn the two string-valued binary flags into 1/0; values not listed in a
# mapping (including missing ones) are left untouched by replace().
binary_maps = {
    'bin_3': {'T':1, 'F':0},
    'bin_4': {'Y':1, 'N':0},
}
for column, mapping in binary_maps.items():
    data[column] = data[column].replace(mapping)

In [None]:
data[[col for col in data.columns if 'bin_' in col]]

###### Nominal Features

In [None]:
data[[col for col in data.columns if 'nom_' in col]]

In [None]:
data['nom_0'].value_counts()

###### Dummy (one-hot) encoding

In [None]:
pd.get_dummies(data[['nom_0', 'nom_1', 'nom_2', 'nom_3', 'nom_4']])

In [None]:
# Replace nom_0..nom_4 with one indicator column per category value.
# NOTE: the source columns are consumed, so re-running this cell on the
# already-encoded frame fails because they no longer exist.
data = pd.get_dummies(data, columns=['nom_0', 'nom_1', 'nom_2', 'nom_3', 'nom_4'])

###### Frequency encoding

In [None]:
data['nom_5'].value_counts()

In [None]:
# Frequency encoding: share of all rows (rows with a missing nom_5 still count
# in the denominator) that carry each nom_5 value.
freq_encode = data.groupby('nom_5').size().div(len(data))
freq_encode

In [None]:
# Replace each nom_5 category with its frequency.
# Plain column assignment is used instead of `data.loc[:, 'nom_5'] = ...`:
# on pandas >= 2.0 the .loc form writes into the existing object-dtype column
# in place, so the encoded floats would stay stored as `object`.
data['nom_5'] = data['nom_5'].map(freq_encode)

###### Mean target encoding

In [None]:
# Mean target encoding: per-category mean of `target` for nom_6.
# NOTE(review): this is computed on the full frame before the train/test split
# further down, so every row's own label (and the future test rows) feed into
# its encoding -- keep that target leakage in mind when reading the scores.
mean_encode = data.groupby('nom_6')['target'].mean()
mean_encode

In [None]:
# Replace each nom_6 category with its mean target value.
# Plain column assignment is used instead of `data.loc[:, 'nom_6'] = ...`:
# on pandas >= 2.0 the .loc form writes into the existing object-dtype column
# in place, so the encoded floats would stay stored as `object`.
data['nom_6'] = data['nom_6'].map(mean_encode)

###### Smoothed target encoding

In [None]:
# Smoothed target encoding: blend each category mean with the global mean,
# weighting the global mean as if it were `weight` extra observations.
weight = 100
mean = data['target'].mean()
agg = data.groupby('nom_7')['target'].agg(['count', 'mean'])
counts, means = agg['count'], agg['mean']
smooth = (means * counts + mean * weight) / (weight + counts)
smooth

In [None]:
# Replace each nom_7 category with its smoothed target mean.
# Plain column assignment is used instead of `data.loc[:, 'nom_7'] = ...`:
# on pandas >= 2.0 the .loc form writes into the existing object-dtype column
# in place, so the encoded floats would stay stored as `object`.
data['nom_7'] = data['nom_7'].map(smooth)

###### Weight of Evidence encoding

conda install -c conda-forge category_encoders

In [None]:
import category_encoders as ce

$$\text{a - доля истинных меток у значения фичи}$$
$$\text{b - доля ложных меток у значения фичи}$$
$$WoE = \ln\frac{a}{b}$$

In [None]:
# Weight-of-evidence encode nom_8 against the target.
# The encoder output is flattened and assigned as a whole new column (instead
# of `data.loc[:, 'nom_8'] = ...`): on pandas >= 2.0 the .loc form writes into
# the existing object-dtype column in place and the floats would stay `object`.
data['nom_8'] = np.ravel(ce.WOEEncoder().fit_transform(data['nom_8'], data['target']))

###### Leave-one-out encoding

$$x_{i}^{k}=\frac{\sum_{j \ne i}y_{j}\cdot(x_{j}==k)}{\sum_{j \ne i}(x_{j}==k)}$$

In [None]:
# Leave-one-out encode nom_9 into a separate column.
# The raw nom_9 values are deliberately left untouched: the CatBoost cell below
# encodes the same column, and overwriting it here would make that encoder run
# on already-encoded numbers instead of the original categories.
data['nom_9_loo'] = np.ravel(ce.LeaveOneOutEncoder().fit_transform(data['nom_9'], data['target']))

###### Catboost encoding

Как LOO, но лучше)

In [None]:
# CatBoost (ordered target) encode nom_9.
# The encoder output is flattened and assigned as a whole new column (instead
# of `data.loc[:, 'nom_9'] = ...`): on pandas >= 2.0 the .loc form writes into
# the existing object-dtype column in place and the floats would stay `object`.
data['nom_9'] = np.ravel(ce.CatBoostEncoder().fit_transform(data['nom_9'], data['target']))

###### Ordinal Feature Encoding

In [None]:
data[[col for col in data.columns if 'ord_' in col]]

In [None]:
for col in data.columns:
    if 'ord_' in col:
        display(data[col].value_counts())

In [None]:
# Ordinal levels listed from lowest to highest. The ranks are generated from
# the position in the list so they are consecutive (the hand-written ord_1
# mapping skipped rank 3, leaving an arbitrary gap after 'Contributor').
ord_1_levels = ['Novice', 'Contributor', 'Expert', 'Master', 'Grandmaster']
ord_2_levels = ['Freezing', 'Cold', 'Warm', 'Hot', 'Boiling Hot', 'Lava Hot']

ord_1 = {level: rank for rank, level in enumerate(ord_1_levels, start=1)}
ord_2 = {level: rank for rank, level in enumerate(ord_2_levels, start=1)}

# Values missing from a mapping (including NaN) become NaN.
data['ord_1'] = data['ord_1'].map(ord_1)
data['ord_2'] = data['ord_2'].map(ord_2)

In [None]:
data['ord_3_by_ord'] = data['ord_3'].map(ord, na_action='ignore')

In [None]:
# Rank the observed (non-missing) ord_3 values by their sort order, from 0.
ord3_levels = sorted(data['ord_3'].dropna().unique())
map_ord3 = dict(zip(ord3_levels, range(len(ord3_levels))))
map_ord3

In [None]:
data['ord_3'] = data['ord_3'].map(map_ord3)

In [None]:
data['ord_4_by_ord'] = data['ord_4'].map(ord, na_action='ignore')

In [None]:
# Rank the observed (non-missing) ord_4 values by their sort order, from 0,
# and replace the column with those ranks.
ord4_levels = sorted(data['ord_4'].dropna().unique())
map_ord4 = {level: rank for rank, level in enumerate(ord4_levels)}
data['ord_4'] = data['ord_4'].map(map_ord4)

In [None]:
# Code points of the first and second character of each ord_5 value
# (missing values are skipped and stay missing).
for position, column in enumerate(('ord_5_1', 'ord_5_2')):
    data[column] = data['ord_5'].map(lambda string, i=position: ord(string[i]), na_action='ignore')

# Then replace ord_5 itself with the rank of the whole value in sorted order.
ord5_levels = sorted(data['ord_5'].dropna().unique())
map_ord5 = dict(zip(ord5_levels, range(len(ord5_levels))))
data['ord_5'] = data['ord_5'].map(map_ord5)

In [None]:
description(data)

In [None]:
# Re-run the baseline now that the categorical columns have been encoded.
# NOTE(review): missing values are only handled in the next section -- confirm
# that the installed scikit-learn accepts NaN inputs for this estimator.
y = data['target']
X = data.drop('target', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y)

clf = RandomForestClassifier().fit(X_train, y_train)
# classification_report returns a multi-line string; print it so the table is
# rendered instead of a repr full of '\n' escapes.
print(classification_report(y_test, clf.predict(X_test)))

###### Пропущенные значения

In [None]:
from sklearn.impute import SimpleImputer

In [None]:
X = np.array([[np.nan, 2, 3], [4, np.nan, 6], [10, np.nan, 9], [15, 3, np.nan]])
X

###### Mean

In [None]:
# Fill every NaN with the mean of its column, learned from the toy matrix.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean').fit(X)
imputer.transform(X)

In [None]:
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
data.loc[:, ['nom_5', 'nom_6']] = imputer.fit_transform(data[['nom_5', 'nom_6']])

###### Median

In [None]:
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
imputer.fit(X)
imputer.transform(X)

In [None]:
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
data.loc[:, ['nom_7']] = imputer.fit_transform(data[['nom_7']])

###### Most frequent

In [None]:
data['bin_0'].value_counts(), data['bin_1'].value_counts()

In [None]:
# Fill the gaps in every binary flag with that column's most frequent value.
bin_cols = ['bin_0', 'bin_1', 'bin_2', 'bin_3', 'bin_4']
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
data.loc[:, bin_cols] = imputer.fit_transform(data[bin_cols])

In [None]:
data['bin_0'].value_counts(), data['bin_1'].value_counts()

###### Most frequent modification

In [None]:
data['day'].value_counts()

In [None]:
# Record which rows had no `day` before that column is imputed
# (1 = value was missing, 0 = value was present).
data['day_imputed'] = np.where(data['day'].isnull(), 1, 0)
data['day_imputed'].value_counts()

In [None]:
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
data.loc[:, ['day']] = imputer.fit_transform(data[['day']])

In [None]:
data['day'].value_counts()

###### New class

In [None]:
data['month'].value_counts()

In [None]:
data.loc[:, 'month'] = data['month'].fillna(0)

In [None]:
data['month'].value_counts()

###### Iterative

Давайте каждую фичу рассмотрим как таргет

In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [None]:
# Impute the three ordinal columns jointly with the iterative imputer.
ord_cols = ['ord_0', 'ord_1', 'ord_2']
imputer = IterativeImputer(random_state=45).fit(data[ord_cols])
data.loc[:, ord_cols] = imputer.transform(data[ord_cols])

###### KNN

In [None]:
from sklearn.impute import KNNImputer

In [None]:
description(data)

In [None]:
old_len = data.shape[0]
old_len

In [None]:
# Drop every row that still has a missing value and report the share lost.
data = data.dropna()
# The share is measured against the ORIGINAL row count (dividing by the rows
# that remain overstates the loss); the guard avoids dividing by zero.
lost_share = (old_len - data.shape[0]) / old_len if old_len else 0.0
print(f'we lost {lost_share * 100:.2f}% of data')

In [None]:
# Final feature/target split on the fully encoded and imputed frame.
y = data['target']
X = data.drop('target', axis=1)

# Fixed seed so the reports printed below are reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=45)

In [None]:
# NOTE(review): this fit uses the unscaled features, and `clf` is reassigned in
# the next cell before any report is printed, so this model is never evaluated.
clf = RandomForestClassifier().fit(X_train, y_train)

In [None]:
from sklearn.linear_model import LogisticRegression
# Standardise with statistics from the training split, overwrite both splits
# with their scaled versions (later cells reuse these names), then fit the
# logistic regression on the scaled training data.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

clf = LogisticRegression().fit(X_train, y_train)

In [None]:
print(classification_report(y_test, clf.predict(X_test)))

In [None]:
clf = RandomForestClassifier().fit(X_train, y_train)

In [None]:
print(classification_report(y_test, clf.predict(X_test)))