**Задача:** взять лучший алгоритм классифицирующий аномалии алгоритм и посчитать другие метрики

In [29]:
import pandas as pd
import numpy as np
import datetime

import matplotlib.pyplot as plt
import seaborn as sns 

import os
from pathlib import Path

#get my written functions
from userfuncs import prepare_dataframe, find_borders_nan_intervals, count_frequency, count_missing
from userfuncs import get_best_distribution, distribution_by_season, get_metrics

import warnings
warnings.filterwarnings("ignore")

#algorithms
#from sklearn.linear_model import LogisticRegression
#from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier

from sklearn.ensemble import IsolationForest

#scaler
from sklearn.preprocessing import StandardScaler

#split
from sklearn.model_selection import train_test_split, TimeSeriesSplit

#metrics
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, roc_curve

#tqdm
from tqdm.notebook import tqdm

from collections import defaultdict

In [30]:
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 100)

In [31]:
from matplotlib import style
style.use('fivethirtyeight')

In [32]:
path_to_data = Path('../../data')
df = pd.read_csv(path_to_data / 'prepared_data_prioksk.csv')
ELEMENTS = df.columns
df.head(2)

Unnamed: 0,TSP,SO2,PB,CD
0,9.0,0.02,21.0,0.54
1,5.0,0.03,20.0,0.53


In [33]:
def code_mean(data, cat_feature, real_feature):
    """Simple target encoding"""
    return dict(data.groupby(cat_feature)[real_feature].mean())

def prepare_data(df, all_targets, lag_start=1, lag_end=15, test_size=0.2, is_split=True):
    
    start_date = pd.to_datetime('1987-10-01', format='%Y-%m-%d') #дата с которой идет ряд
    #так как данные примерно полные были, то заполним таким индексом для создания фичей дальше
    #полезно в первую очередь в обучающих целях
    df.index = pd.date_range(start=start_date, periods=df.shape[0], freq='D')

    test_index = int(len(df)*(1-test_size))
    df_lag = None
    for col in all_targets:
        ts = df[[col]]
        for i in range(lag_start, lag_end):
            ts[f"{col}_lag_{i}"] = ts[col].shift(i)
        if df_lag is None:
            df_lag = ts
        else:
            df_lag = df_lag.join(ts)
            
    extra_cols = list(set(df.columns) - set(df_lag.columns))
    df_lag = df_lag.join(df[extra_cols])
            
    df_lag["weekday"] = df_lag.index.weekday
    df_lag['is_weekend'] = df_lag.weekday.isin([5,6])*1
    
    for col in all_targets:
        df_lag[f'{col}_weekday_average'] = list(map(code_mean(df_lag[:test_index], 'weekday', col).get, df_lag.weekday))
        
    if 'weekday' in df_lag.columns:
        df_lag = df_lag.drop(["weekday"], axis=1)
    df_lag = df_lag.dropna()
    
    df_lag = df_lag.reset_index(drop=True)
    
    if is_split:
        train = df_lag.loc[:test_index]
        test = df_lag.loc[test_index:]
        return train, test
    else:
        return df_lag
    

def find_optimal_cutoff(y_true, y_pred_prob):
    '''Get optimal threshold of roc_curve to maximize roc_auc_score
    
    Binary classification
    
    Parameters:
        y_true - vector of true values 0 and 1
        y_pred_prob - vector of predicted probabilities
        
    Return:
        cutoff
    '''
    
    fpr, tpr, threshold = roc_curve(y_true, y_pred_prob)
    i = np.arange(len(tpr)) 
    roc = pd.DataFrame({'tf' : pd.Series(tpr-(1-fpr), index=i), 'threshold' : pd.Series(threshold, index=i)})
    roc_t = roc.iloc[roc['tf'].abs().argsort()[:1]]
    return roc_t['threshold'].values[0]

In [34]:
ALGORITHMS = ['Lightgbm']
METRICS = ['accuracy', 'precision', 'recall', 'f1']

In [35]:
#to keep results of auc for all elements for train and test parts
index = METRICS.copy()
multiindex = pd.MultiIndex.from_tuples([(elem, ix) for elem in ELEMENTS for ix in index])
stat_auc = pd.DataFrame(index=multiindex, columns=['Train', 'Test'])

In [36]:
useful_columns = ['is_weekend']
all_targets = list(df.columns)

In [37]:
##Isolation Forest##
iforest = IsolationForest(n_estimators=300, contamination=0.03, random_state=42, n_jobs=-1)
#1 - outliers, 0 - no outlier
for col in all_targets:
    df[f'{col}_anomalies'] = [1 if x == -1 else 0 for x in iforest.fit_predict(df[[col]])]

In [38]:
df = prepare_data(df, all_targets, lag_start=1, lag_end=14, test_size=0.2, is_split=False)

### Algorithms

In [39]:
def predict_by_lgbm(X_train, y_train, X_test, **kwargs):
    '''Return predicted values on train and test by lgbm'''
    
    y_train = y_train.values.reshape(1, -1).squeeze()
    max_scale_pos_weight = np.bincount(y_train)[0]/np.bincount(y_train)[1]
    
    params = {
        'n_estimators': 1000
        , 'learning_rate': 0.05
        , 'max_leaves': 31
        , 'max_depth': -1
        , 'subsample': 0.8
        , 'colsample_bytree': 0.9
        , 'scale_pos_weight': max_scale_pos_weight
        
        , 'n_jobs': -1
        , 'random_state': 777
    }
    
    model = LGBMClassifier(**params)
    model.fit(X_train, y_train, eval_metric=['auc'], verbose=False,
             eval_set=[(X_test, y_test)], early_stopping_rounds=100)
    
    
    pred_test = model.predict_proba(X_test)[:, 1]
    pred_train = model.predict_proba(X_train)[:, 1]
    
    return pred_train, pred_test

### TSP

In [40]:
target_name = 'TSP_anomalies'

#to take lag features
target_cols = [name for name in df.columns if name.startswith(target_name.split('_')[0])]
#take lag features and some other
train_columns = target_cols + all_targets + useful_columns
useless_cols = ([target_name]
#+ [f'{target_name.split("_")[0]}_weekday_average', 'is_weekend']
               )
train_columns = list(set(train_columns) - set(useless_cols))

#словарь для хранения результатов
metrics = defaultdict(dict)
for algo in ALGORITHMS:
    for metric in METRICS:
            metrics[algo][metric] = {'Train': [], 'Test': []}

In [41]:
tcsv = TimeSeriesSplit(n_splits=5)
#get best algorithms
auc_l = {algo : {'Train': [], 'Test': []} for algo in ALGORITHMS}

y_pred_train = None
for train_index, test_index in tqdm(tcsv.split(df), total=tcsv.n_splits):
    train, test = df.iloc[train_index], df.iloc[test_index]
    X_train, X_test = train.drop([target_name], axis=1), test.drop([target_name], axis=1)
    y_train, y_test = train[[target_name]], test[[target_name]]

    VAL = 0.5
    algo = 'Lightgbm'
    pred_train, pred_test = predict_by_lgbm(X_train=X_train, y_train=y_train, X_test=X_test)
    try:
        auc_l[algo]['Train'].append(roc_auc_score(y_train, pred_train))
    except:
        auc_l[algo]['Train'].append(VAL)
    
    try:
        auc_l[algo]['Test'].append(roc_auc_score(y_test, pred_test))
    except:
        auc_l[algo]['Test'].append(VAL)
    
    if y_pred_train is None:
        y_pred_train = np.hstack([pred_train, pred_test])
    else:
        y_pred_train = np.hstack([y_pred_train, pred_test])
        
        
#оптимальный порог для разбиения auc на 2 класса (нужно для рассчета других метрик)
THR = find_optimal_cutoff(df[target_name], y_pred_train)

tcsv = TimeSeriesSplit(n_splits=5)

for train_index, test_index in tqdm(tcsv.split(df), total=tcsv.n_splits):
    train, test = df.iloc[train_index], df.iloc[test_index]
    X_train, X_test = train.drop([target_name], axis=1), test.drop([target_name], axis=1)
    y_train, y_test = train[[target_name]], test[[target_name]]

    algo = 'Lightgbm'
    pred_train, pred_test = predict_by_lgbm(X_train=X_train, y_train=y_train, X_test=X_test)
    
    
    pred_train = np.where(pred_train > THR, 1, 0)
    pred_test = np.where(pred_test > THR, 1, 0)
    tmp = get_metrics(pred_train, y_train, metrics=METRICS)
    for k, v in tmp.items():
        metrics[algo][k.lower()]['Train'].append(v)
    tmp = get_metrics(pred_test, y_test, metrics=METRICS)
    for k, v in tmp.items():
        metrics[algo][k.lower()]['Test'].append(v)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




In [42]:
#аггрегируем метрику и записываем в датафрейм
tmp = defaultdict(dict)
for algo, metric_sample in metrics.items():
    for m, sample in metric_sample.items():
        tmp[algo][m] = {'Train': np.mean(sample['Train']), 'Test': np.mean(sample['Test'])}
metrics = tmp.copy()
for algo in ALGORITHMS: #just fo this case is okay
    for m in METRICS:
        stat_auc.loc[(target_name.split('_')[0], m), 'Train'] = metrics[algo][m]['Train']
        stat_auc.loc[(target_name.split('_')[0], m), 'Test'] = metrics[algo][m]['Test']

In [43]:
stat_auc.loc[(target_name.split('_')[0])]

Unnamed: 0,Train,Test
accuracy,0.982184,0.987121
precision,1.0,0.9
recall,0.713974,0.780214
f1,0.820438,0.795556


### SO2

In [44]:
target_name = 'SO2_anomalies'

#to take lag features
target_cols = [name for name in df.columns if name.startswith(target_name.split('_')[0])]
#take lag features and some other
train_columns = target_cols + all_targets + useful_columns
useless_cols = ([target_name]
#+ [f'{target_name.split("_")[0]}_weekday_average', 'is_weekend']
               )
train_columns = list(set(train_columns) - set(useless_cols))

#словарь для хранения результатов
metrics = defaultdict(dict)
for algo in ALGORITHMS:
    for metric in METRICS:
            metrics[algo][metric] = {'Train': [], 'Test': []}

In [45]:
tcsv = TimeSeriesSplit(n_splits=5)
#get best algorithms
auc_l = {algo : {'Train': [], 'Test': []} for algo in ALGORITHMS}

y_pred_train = None
for train_index, test_index in tqdm(tcsv.split(df), total=tcsv.n_splits):
    train, test = df.iloc[train_index], df.iloc[test_index]
    X_train, X_test = train.drop([target_name], axis=1), test.drop([target_name], axis=1)
    y_train, y_test = train[[target_name]], test[[target_name]]

    VAL = 0.5
    algo = 'Lightgbm'
    pred_train, pred_test = predict_by_lgbm(X_train=X_train, y_train=y_train, X_test=X_test)
    try:
        auc_l[algo]['Train'].append(roc_auc_score(y_train, pred_train))
    except:
        auc_l[algo]['Train'].append(VAL)
    
    try:
        auc_l[algo]['Test'].append(roc_auc_score(y_test, pred_test))
    except:
        auc_l[algo]['Test'].append(VAL)
    
    if y_pred_train is None:
        y_pred_train = np.hstack([pred_train, pred_test])
    else:
        y_pred_train = np.hstack([y_pred_train, pred_test])
        
        
#оптимальный порог для разбиения auc на 2 класса (нужно для рассчета других метрик)
THR = find_optimal_cutoff(df[target_name], y_pred_train)

tcsv = TimeSeriesSplit(n_splits=5)

for train_index, test_index in tqdm(tcsv.split(df), total=tcsv.n_splits):
    train, test = df.iloc[train_index], df.iloc[test_index]
    X_train, X_test = train.drop([target_name], axis=1), test.drop([target_name], axis=1)
    y_train, y_test = train[[target_name]], test[[target_name]]

    algo = 'Lightgbm'
    pred_train, pred_test = predict_by_lgbm(X_train=X_train, y_train=y_train, X_test=X_test)
    
    
    pred_train = np.where(pred_train > THR, 1, 0)
    pred_test = np.where(pred_test > THR, 1, 0)
    tmp = get_metrics(pred_train, y_train, metrics=METRICS)
    for k, v in tmp.items():
        metrics[algo][k.lower()]['Train'].append(v)
    tmp = get_metrics(pred_test, y_test, metrics=METRICS)
    for k, v in tmp.items():
        metrics[algo][k.lower()]['Test'].append(v)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




In [46]:
#аггрегируем метрику и записываем в датафрейм
tmp = defaultdict(dict)
for algo, metric_sample in metrics.items():
    for m, sample in metric_sample.items():
        tmp[algo][m] = {'Train': np.mean(sample['Train']), 'Test': np.mean(sample['Test'])}
metrics = tmp.copy()
for algo in ALGORITHMS: #just fo this case is okay
    for m in METRICS:
        stat_auc.loc[(target_name.split('_')[0], m), 'Train'] = metrics[algo][m]['Train']
        stat_auc.loc[(target_name.split('_')[0], m), 'Test'] = metrics[algo][m]['Test']

In [47]:
stat_auc.loc[(target_name.split('_')[0])]

Unnamed: 0,Train,Test
accuracy,0.960682,0.988636
precision,1.0,0.75
recall,0.794468,0.776471
f1,0.850426,0.758929


### PB

In [48]:
target_name = 'PB_anomalies'

#to take lag features
target_cols = [name for name in df.columns if name.startswith(target_name.split('_')[0])]
#take lag features and some other
train_columns = target_cols + all_targets + useful_columns
useless_cols = ([target_name]
#+ [f'{target_name.split("_")[0]}_weekday_average', 'is_weekend']
               )
train_columns = list(set(train_columns) - set(useless_cols))

#словарь для хранения результатов
metrics = defaultdict(dict)
for algo in ALGORITHMS:
    for metric in METRICS:
            metrics[algo][metric] = {'Train': [], 'Test': []}

In [49]:
tcsv = TimeSeriesSplit(n_splits=5)
#get best algorithms
auc_l = {algo : {'Train': [], 'Test': []} for algo in ALGORITHMS}

y_pred_train = None
for train_index, test_index in tqdm(tcsv.split(df), total=tcsv.n_splits):
    train, test = df.iloc[train_index], df.iloc[test_index]
    X_train, X_test = train.drop([target_name], axis=1), test.drop([target_name], axis=1)
    y_train, y_test = train[[target_name]], test[[target_name]]

    VAL = 0.5
    algo = 'Lightgbm'
    pred_train, pred_test = predict_by_lgbm(X_train=X_train, y_train=y_train, X_test=X_test)
    try:
        auc_l[algo]['Train'].append(roc_auc_score(y_train, pred_train))
    except:
        auc_l[algo]['Train'].append(VAL)
    
    try:
        auc_l[algo]['Test'].append(roc_auc_score(y_test, pred_test))
    except:
        auc_l[algo]['Test'].append(VAL)
    
    if y_pred_train is None:
        y_pred_train = np.hstack([pred_train, pred_test])
    else:
        y_pred_train = np.hstack([y_pred_train, pred_test])
        
        
#оптимальный порог для разбиения auc на 2 класса (нужно для рассчета других метрик)
THR = find_optimal_cutoff(df[target_name], y_pred_train)

tcsv = TimeSeriesSplit(n_splits=5)

for train_index, test_index in tqdm(tcsv.split(df), total=tcsv.n_splits):
    train, test = df.iloc[train_index], df.iloc[test_index]
    X_train, X_test = train.drop([target_name], axis=1), test.drop([target_name], axis=1)
    y_train, y_test = train[[target_name]], test[[target_name]]

    algo = 'Lightgbm'
    pred_train, pred_test = predict_by_lgbm(X_train=X_train, y_train=y_train, X_test=X_test)
    
    
    pred_train = np.where(pred_train > THR, 1, 0)
    pred_test = np.where(pred_test > THR, 1, 0)
    tmp = get_metrics(pred_train, y_train, metrics=METRICS)
    for k, v in tmp.items():
        metrics[algo][k.lower()]['Train'].append(v)
    tmp = get_metrics(pred_test, y_test, metrics=METRICS)
    for k, v in tmp.items():
        metrics[algo][k.lower()]['Test'].append(v)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




In [50]:
#аггрегируем метрику и записываем в датафрейм
tmp = defaultdict(dict)
for algo, metric_sample in metrics.items():
    for m, sample in metric_sample.items():
        tmp[algo][m] = {'Train': np.mean(sample['Train']), 'Test': np.mean(sample['Test'])}
metrics = tmp.copy()
for algo in ALGORITHMS: #just fo this case is okay
    for m in METRICS:
        stat_auc.loc[(target_name.split('_')[0], m), 'Train'] = metrics[algo][m]['Train']
        stat_auc.loc[(target_name.split('_')[0], m), 'Test'] = metrics[algo][m]['Test']

In [51]:
stat_auc.loc[(target_name.split('_')[0])]

Unnamed: 0,Train,Test
accuracy,0.969924,0.997727
precision,0.4,0.366667
recall,0.4,0.4
f1,0.4,0.381818


### CD

In [52]:
target_name = 'CD_anomalies'

#to take lag features
target_cols = [name for name in df.columns if name.startswith(target_name.split('_')[0])]
#take lag features and some other
train_columns = target_cols + all_targets + useful_columns
useless_cols = ([target_name]
#+ [f'{target_name.split("_")[0]}_weekday_average', 'is_weekend']
               )
train_columns = list(set(train_columns) - set(useless_cols))

#словарь для хранения результатов
metrics = defaultdict(dict)
for algo in ALGORITHMS:
    for metric in METRICS:
            metrics[algo][metric] = {'Train': [], 'Test': []}

In [53]:
tcsv = TimeSeriesSplit(n_splits=5)
#get best algorithms
auc_l = {algo : {'Train': [], 'Test': []} for algo in ALGORITHMS}

y_pred_train = None
for train_index, test_index in tqdm(tcsv.split(df), total=tcsv.n_splits):
    train, test = df.iloc[train_index], df.iloc[test_index]
    X_train, X_test = train.drop([target_name], axis=1), test.drop([target_name], axis=1)
    y_train, y_test = train[[target_name]], test[[target_name]]

    VAL = 0.5
    algo = 'Lightgbm'
    pred_train, pred_test = predict_by_lgbm(X_train=X_train, y_train=y_train, X_test=X_test)
    try:
        auc_l[algo]['Train'].append(roc_auc_score(y_train, pred_train))
    except:
        auc_l[algo]['Train'].append(VAL)
    
    try:
        auc_l[algo]['Test'].append(roc_auc_score(y_test, pred_test))
    except:
        auc_l[algo]['Test'].append(VAL)
    
    if y_pred_train is None:
        y_pred_train = np.hstack([pred_train, pred_test])
    else:
        y_pred_train = np.hstack([y_pred_train, pred_test])
        
        
#оптимальный порог для разбиения auc на 2 класса (нужно для рассчета других метрик)
THR = find_optimal_cutoff(df[target_name], y_pred_train)

tcsv = TimeSeriesSplit(n_splits=5)

for train_index, test_index in tqdm(tcsv.split(df), total=tcsv.n_splits):
    train, test = df.iloc[train_index], df.iloc[test_index]
    X_train, X_test = train.drop([target_name], axis=1), test.drop([target_name], axis=1)
    y_train, y_test = train[[target_name]], test[[target_name]]

    algo = 'Lightgbm'
    pred_train, pred_test = predict_by_lgbm(X_train=X_train, y_train=y_train, X_test=X_test)
    
    
    pred_train = np.where(pred_train > THR, 1, 0)
    pred_test = np.where(pred_test > THR, 1, 0)
    tmp = get_metrics(pred_train, y_train, metrics=METRICS)
    for k, v in tmp.items():
        metrics[algo][k.lower()]['Train'].append(v)
    tmp = get_metrics(pred_test, y_test, metrics=METRICS)
    for k, v in tmp.items():
        metrics[algo][k.lower()]['Test'].append(v)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




In [54]:
#аггрегируем метрику и записываем в датафрейм
tmp = defaultdict(dict)
for algo, metric_sample in metrics.items():
    for m, sample in metric_sample.items():
        tmp[algo][m] = {'Train': np.mean(sample['Train']), 'Test': np.mean(sample['Test'])}
metrics = tmp.copy()
for algo in ALGORITHMS: #just fo this case is okay
    for m in METRICS:
        stat_auc.loc[(target_name.split('_')[0], m), 'Train'] = metrics[algo][m]['Train']
        stat_auc.loc[(target_name.split('_')[0], m), 'Test'] = metrics[algo][m]['Test']

In [55]:
stat_auc.loc[(target_name.split('_')[0])]

Unnamed: 0,Train,Test
accuracy,0.989394,0.99697
precision,0.8,0.8
recall,0.782609,0.75
f1,0.790909,0.771429


### Результат

In [56]:
stat_auc

Unnamed: 0,Unnamed: 1,Train,Test
TSP,accuracy,0.982184,0.987121
TSP,precision,1.0,0.9
TSP,recall,0.713974,0.780214
TSP,f1,0.820438,0.795556
SO2,accuracy,0.960682,0.988636
SO2,precision,1.0,0.75
SO2,recall,0.794468,0.776471
SO2,f1,0.850426,0.758929
PB,accuracy,0.969924,0.997727
PB,precision,0.4,0.366667


In [57]:
stat_auc.T

Unnamed: 0_level_0,TSP,TSP,TSP,TSP,SO2,SO2,SO2,SO2,PB,PB,PB,PB,CD,CD,CD,CD
Unnamed: 0_level_1,accuracy,precision,recall,f1,accuracy,precision,recall,f1,accuracy,precision,recall,f1,accuracy,precision,recall,f1
Train,0.982184,1.0,0.713974,0.820438,0.960682,1.0,0.794468,0.850426,0.969924,0.4,0.4,0.4,0.989394,0.8,0.782609,0.790909
Test,0.987121,0.9,0.780214,0.795556,0.988636,0.75,0.776471,0.758929,0.997727,0.366667,0.4,0.381818,0.99697,0.8,0.75,0.771429
