**Задача:** по имеющимся значениям по конкретному веществу классифицировать объект: аномальный или нет

In [29]:
import pandas as pd
import numpy as np
import datetime

import matplotlib.pyplot as plt
import seaborn as sns 

import os
from pathlib import Path

#get my written functions
from userfuncs import prepare_dataframe, find_borders_nan_intervals, count_frequency, count_missing
from userfuncs import get_best_distribution, distribution_by_season, get_metrics

import warnings
warnings.filterwarnings("ignore")

#algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier

from sklearn.ensemble import IsolationForest

#scaler
from sklearn.preprocessing import StandardScaler

#split
from sklearn.model_selection import train_test_split, TimeSeriesSplit

#metrics
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score

#tqdm
from tqdm.notebook import tqdm

In [30]:
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 100)

In [32]:
from matplotlib import style
style.use('fivethirtyeight')

In [33]:
path_to_data = Path('../../data')
df = pd.read_csv(path_to_data / 'prepared_data_prioksk.csv')
ELEMENTS = df.columns
df.head(2)

Unnamed: 0,TSP,SO2,PB,CD
0,9.0,0.02,21.0,0.54
1,5.0,0.03,20.0,0.53


In [34]:
def code_mean(data, cat_feature, real_feature):
    """Simple target encoding"""
    return dict(data.groupby(cat_feature)[real_feature].mean())

def prepare_data(df, all_targets, lag_start=1, lag_end=15, test_size=0.2, is_split=True):
    
    start_date = pd.to_datetime('1987-10-01', format='%Y-%m-%d') #дата с которой идет ряд
    #так как данные примерно полные были, то заполним таким индексом для создания фичей дальше
    #полезно в первую очередь в обучающих целях
    df.index = pd.date_range(start=start_date, periods=df.shape[0], freq='D')

    test_index = int(len(df)*(1-test_size))
    df_lag = None
    for col in all_targets:
        ts = df[[col]]
        for i in range(lag_start, lag_end):
            ts[f"{col}_lag_{i}"] = ts[col].shift(i)
        if df_lag is None:
            df_lag = ts
        else:
            df_lag = df_lag.join(ts)
            
    extra_cols = list(set(df.columns) - set(df_lag.columns))
    df_lag = df_lag.join(df[extra_cols])
            
    df_lag["weekday"] = df_lag.index.weekday
    df_lag['is_weekend'] = df_lag.weekday.isin([5,6])*1
    
    for col in all_targets:
        df_lag[f'{col}_weekday_average'] = list(map(code_mean(df_lag[:test_index], 'weekday', col).get, df_lag.weekday))
        
    if 'weekday' in df_lag.columns:
        df_lag = df_lag.drop(["weekday"], axis=1)
    df_lag = df_lag.dropna()
    
    df_lag = df_lag.reset_index(drop=True)
    
    if is_split:
        train = df_lag.loc[:test_index]
        test = df_lag.loc[test_index:]
        return train, test
    else:
        return df_lag
    


In [35]:
ALGORITHMS = ['Random Forest', 'Logistic Regression', 'Lightgbm']
METRICS = ['accuracy', 'precision', 'recall', 'f1', 'auc']

In [36]:
#to keep results of auc for all elements for train and test parts
index = ALGORITHMS.copy()
multiindex = pd.MultiIndex.from_tuples([(elem, ix) for elem in ELEMENTS for ix in index])
stat_auc = pd.DataFrame(index=multiindex, columns=['Train', 'Test'])

In [37]:
useful_columns = ['is_weekend']
all_targets = list(df.columns)

In [38]:
##Isolation Forest##
iforest = IsolationForest(n_estimators=300, contamination=0.03, random_state=42, n_jobs=-1)
#1 - outliers, 0 - no outlier
for col in all_targets:
    df[f'{col}_anomalies'] = [1 if x == -1 else 0 for x in iforest.fit_predict(df[[col]])]

In [39]:
df = prepare_data(df, all_targets, lag_start=1, lag_end=14, test_size=0.2, is_split=False)

### Algorithms

In [40]:
def predict_by_random_forest(X_train, y_train, X_test, **kwargs):
    '''Return predicted values on train and test by Random Forest'''
    
    rf = RandomForestClassifier(n_estimators=600, n_jobs=-1, random_state=777, **kwargs
                               , max_features='sqrt'
                               #, min_samples_split=4
                              )
    rf.fit(X_train, y_train)
    yhat_train = rf.predict(X_train) 
    yhat_test = rf.predict(X_test)
    
    return yhat_train, yhat_test

def predict_by_logreg(X_train, y_train, X_test, **kwargs):
    '''Return predicted values on train and test by Elastic-Net'''
    
    scaler = StandardScaler()
    X_train_sc = scaler.fit_transform(X_train)
    X_test_sc = scaler.transform(X_test)
    
    lr = LogisticRegression(n_jobs=-1, random_state=777, **kwargs)
    lr.fit(X_train_sc, y_train)
    yhat_train = lr.predict(X_train_sc)
    yhat_test = lr.predict(X_test_sc)
    
    return yhat_train, yhat_test


def predict_by_lgbm(X_train, y_train, X_test, **kwargs):
    '''Return predicted values on train and test by lgbm'''
    
    y_train = y_train.values.reshape(1, -1).squeeze()
    max_scale_pos_weight = np.bincount(y_train)[0]/np.bincount(y_train)[1]
    
    params = {
        'n_estimators': 1000
        , 'learning_rate': 0.05
        , 'max_leaves': 31
        , 'max_depth': -1
        , 'subsample': 0.8
        , 'colsample_bytree': 0.9
        , 'scale_pos_weight': max_scale_pos_weight
        
        , 'n_jobs': -1
        , 'random_state': 777
    }
    
    model = LGBMClassifier(**params)
    model.fit(X_train, y_train, eval_metric=['auc'], verbose=False,
             eval_set=[(X_test, y_test)], early_stopping_rounds=100)
    
    
    pred_test = model.predict_proba(X_test)[:, 1]
    pred_train = model.predict_proba(X_train)[:, 1]
    
    return pred_train, pred_test

### TSP

In [41]:
target_name = 'TSP_anomalies'

#to take lag features
target_cols = [name for name in df.columns if name.startswith(target_name.split('_')[0])]
#take lag features and some other
train_columns = target_cols + all_targets + useful_columns
useless_cols = ([target_name]
#+ [f'{target_name.split("_")[0]}_weekday_average', 'is_weekend']
               )
train_columns = list(set(train_columns) - set(useless_cols))

#если захотеть просто так поделить выборку случайно, хотя так некорректно (временноя ряд же :) )
#train, test = train_test_split(df, test_size=0.2, random_state=54)
#X_train, X_test = train.drop([target_name], axis=1), test.drop([target_name], axis=1)
#y_train, y_test = train[target_name], test[target_name]

#X_train = X_train[train_columns]
#X_test = X_test[train_columns]

#pred_train_rf, pred_test_rf = predict_by_random_forest(X_train=X_train, y_train=y_train, X_test=X_test)
#pred_train_lr, pred_test_lr = predict_by_logreg(X_train=X_train, y_train=y_train, X_test=X_test)
#pred_train_lgb, pred_test_lgb = predict_by_lgbm(X_train=X_train, y_train=y_train, X_test=X_test)

In [42]:
tcsv = TimeSeriesSplit(n_splits=5)
#get best algorithms
auc_l = {algo : {'Train': [], 'Test': []} for algo in ALGORITHMS}

for train_index, test_index in tqdm(tcsv.split(df), total=tcsv.n_splits):
    train, test = df.iloc[train_index], df.iloc[test_index]
    X_train, X_test = train.drop([target_name], axis=1), test.drop([target_name], axis=1)
    y_train, y_test = train[[target_name]], test[[target_name]]
    
    algo = 'Random Forest'
    pred_train, pred_test = predict_by_random_forest(X_train=X_train, y_train=y_train, X_test=X_test)
    auc_l[algo]['Train'].append(roc_auc_score(y_train, pred_train))
    auc_l[algo]['Test'].append(roc_auc_score(y_test, pred_test))
    
    algo = 'Logistic Regression'
    pred_train, pred_test = predict_by_logreg(X_train=X_train, y_train=y_train, X_test=X_test)
    auc_l[algo]['Train'].append(roc_auc_score(y_train, pred_train))
    auc_l[algo]['Test'].append(roc_auc_score(y_test, pred_test))

    algo = 'Lightgbm'
    pred_train, pred_test = predict_by_lgbm(X_train=X_train, y_train=y_train, X_test=X_test)
    auc_l[algo]['Train'].append(roc_auc_score(y_train, pred_train))
    auc_l[algo]['Test'].append(roc_auc_score(y_test, pred_test))

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




In [43]:
#аггрегируем метрику и записываем в датафрейм
tmp = {}
for algo, sample in auc_l.items():
    tmp[algo] = {}
    for part, val in sample.items():
        tmp[algo][part] = np.mean(val)
auc_l = tmp.copy()
for algo in ALGORITHMS:
    stat_auc.loc[(target_name.split('_')[0], algo), 'Train'] = auc_l[algo]['Train']
    stat_auc.loc[(target_name.split('_')[0], algo), 'Test'] = auc_l[algo]['Test']

In [44]:
stat_auc.loc[(target_name.split('_')[0])]

Unnamed: 0,Train,Test
Random Forest,1.0,0.686007
Logistic Regression,0.946272,0.711946
Lightgbm,0.994864,0.95285


### SO2

In [45]:
target_name = 'SO2_anomalies'

#to take lag features
target_cols = [name for name in df.columns if name.startswith(target_name.split('_')[0])]
#take lag features and some other
train_columns = target_cols + all_targets + useful_columns
useless_cols = ([target_name]
#+ [f'{target_name.split("_")[0]}_weekday_average', 'is_weekend']
               )
train_columns = list(set(train_columns) - set(useless_cols))

In [46]:
tcsv = TimeSeriesSplit(n_splits=5)
#get best algorithms
auc_l = {algo : {'Train': [], 'Test': []} for algo in ALGORITHMS}

for train_index, test_index in tqdm(tcsv.split(df), total=tcsv.n_splits):
    train, test = df.iloc[train_index], df.iloc[test_index]
    X_train, X_test = train.drop([target_name], axis=1), test.drop([target_name], axis=1)
    y_train, y_test = train[[target_name]], test[[target_name]]
    
    algo = 'Random Forest'
    pred_train, pred_test = predict_by_random_forest(X_train=X_train, y_train=y_train, X_test=X_test)
    auc_l[algo]['Train'].append(roc_auc_score(y_train, pred_train))
    auc_l[algo]['Test'].append(roc_auc_score(y_test, pred_test))
    
    algo = 'Logistic Regression'
    pred_train, pred_test = predict_by_logreg(X_train=X_train, y_train=y_train, X_test=X_test)
    auc_l[algo]['Train'].append(roc_auc_score(y_train, pred_train))
    auc_l[algo]['Test'].append(roc_auc_score(y_test, pred_test))

    algo = 'Lightgbm'
    pred_train, pred_test = predict_by_lgbm(X_train=X_train, y_train=y_train, X_test=X_test)
    auc_l[algo]['Train'].append(roc_auc_score(y_train, pred_train))
    auc_l[algo]['Test'].append(roc_auc_score(y_test, pred_test))

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




In [47]:
#аггрегируем метрику и записываем в датафрейм
tmp = {}
for algo, sample in auc_l.items():
    tmp[algo] = {}
    for part, val in sample.items():
        tmp[algo][part] = np.mean(val)
auc_l = tmp.copy()
for algo in ALGORITHMS:
    stat_auc.loc[(target_name.split('_')[0], algo), 'Train'] = auc_l[algo]['Train']
    stat_auc.loc[(target_name.split('_')[0], algo), 'Test'] = auc_l[algo]['Test']

In [48]:
stat_auc.loc[(target_name.split('_')[0])]

Unnamed: 0,Train,Test
Random Forest,1.0,0.785
Logistic Regression,0.997674,0.808846
Lightgbm,0.998622,0.988542


### PB

In [49]:
target_name = 'PB_anomalies'

#to take lag features
target_cols = [name for name in df.columns if name.startswith(target_name.split('_')[0])]
#take lag features and some other
train_columns = target_cols + all_targets + useful_columns
useless_cols = ([target_name]
#+ [f'{target_name.split("_")[0]}_weekday_average', 'is_weekend']
               )
train_columns = list(set(train_columns) - set(useless_cols))

In [50]:
tcsv = TimeSeriesSplit(n_splits=5)
#get best algorithms
auc_l = {algo : {'Train': [], 'Test': []} for algo in ALGORITHMS}

for train_index, test_index in tqdm(tcsv.split(df), total=tcsv.n_splits):
    train, test = df.iloc[train_index], df.iloc[test_index]
    X_train, X_test = train.drop([target_name], axis=1), test.drop([target_name], axis=1)
    y_train, y_test = train[[target_name]], test[[target_name]]
    
    VAL = 0.5
    algo = 'Random Forest'
    pred_train, pred_test = predict_by_random_forest(X_train=X_train, y_train=y_train, X_test=X_test)
    try:
        auc_l[algo]['Train'].append(roc_auc_score(y_train, pred_train))
    except:
        auc_l[algo]['Train'].append(VAL)
    
    try:
        auc_l[algo]['Test'].append(roc_auc_score(y_test, pred_test))
    except:
        auc_l[algo]['Test'].append(VAL)
    
    algo = 'Logistic Regression'
    pred_train, pred_test = predict_by_logreg(X_train=X_train, y_train=y_train, X_test=X_test)
    
    try:
        auc_l[algo]['Train'].append(roc_auc_score(y_train, pred_train))
    except:
        auc_l[algo]['Train'].append(VAL)
    
    try:
        auc_l[algo]['Test'].append(roc_auc_score(y_test, pred_test))
    except:
        auc_l[algo]['Test'].append(VAL)

    algo = 'Lightgbm'
    pred_train, pred_test = predict_by_lgbm(X_train=X_train, y_train=y_train, X_test=X_test)
    try:
        auc_l[algo]['Train'].append(roc_auc_score(y_train, pred_train))
    except:
        auc_l[algo]['Train'].append(VAL)
    
    try:
        auc_l[algo]['Test'].append(roc_auc_score(y_test, pred_test))
    except:
        auc_l[algo]['Test'].append(VAL)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




In [51]:
#аггрегируем метрику и записываем в датафрейм
tmp = {}
for algo, sample in auc_l.items():
    tmp[algo] = {}
    for part, val in sample.items():
        tmp[algo][part] = np.mean(val)
auc_l = tmp.copy()
for algo in ALGORITHMS:
    stat_auc.loc[(target_name.split('_')[0], algo), 'Train'] = auc_l[algo]['Train']
    stat_auc.loc[(target_name.split('_')[0], algo), 'Test'] = auc_l[algo]['Test']

In [52]:
stat_auc.loc[(target_name.split('_')[0])]

Unnamed: 0,Train,Test
Random Forest,1.0,0.65
Logistic Regression,0.995652,0.533333
Lightgbm,0.999896,0.786176


### CD

In [53]:
target_name = 'CD_anomalies'

#to take lag features
target_cols = [name for name in df.columns if name.startswith(target_name.split('_')[0])]
#take lag features and some other
train_columns = target_cols + all_targets + useful_columns
useless_cols = ([target_name]
#+ [f'{target_name.split("_")[0]}_weekday_average', 'is_weekend']
               )
train_columns = list(set(train_columns) - set(useless_cols))

In [54]:
tcsv = TimeSeriesSplit(n_splits=10) ######
#get best algorithms
auc_l = {algo : {'Train': [], 'Test': []} for algo in ALGORITHMS}

for train_index, test_index in tqdm(tcsv.split(df), total=tcsv.n_splits):
    train, test = df.iloc[train_index], df.iloc[test_index]
    X_train, X_test = train.drop([target_name], axis=1), test.drop([target_name], axis=1)
    y_train, y_test = train[[target_name]], test[[target_name]]
    
    algo = 'Random Forest'
    pred_train, pred_test = predict_by_random_forest(X_train=X_train, y_train=y_train, X_test=X_test)
    auc_l[algo]['Train'].append(roc_auc_score(y_train, pred_train))
    auc_l[algo]['Test'].append(roc_auc_score(y_test, pred_test))
    
    algo = 'Logistic Regression'
    pred_train, pred_test = predict_by_logreg(X_train=X_train, y_train=y_train, X_test=X_test)
    auc_l[algo]['Train'].append(roc_auc_score(y_train, pred_train))
    auc_l[algo]['Test'].append(roc_auc_score(y_test, pred_test))

    algo = 'Lightgbm'
    pred_train, pred_test = predict_by_lgbm(X_train=X_train, y_train=y_train, X_test=X_test)
    auc_l[algo]['Train'].append(roc_auc_score(y_train, pred_train))
    auc_l[algo]['Test'].append(roc_auc_score(y_test, pred_test))

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




In [55]:
#аггрегируем метрику и записываем в датафрейм
tmp = {}
for algo, sample in auc_l.items():
    tmp[algo] = {}
    for part, val in sample.items():
        tmp[algo][part] = np.mean(val)
auc_l = tmp.copy()
for algo in ALGORITHMS:
    stat_auc.loc[(target_name.split('_')[0], algo), 'Train'] = auc_l[algo]['Train']
    stat_auc.loc[(target_name.split('_')[0], algo), 'Test'] = auc_l[algo]['Test']

In [56]:
stat_auc.loc[(target_name.split('_')[0])]

Unnamed: 0,Train,Test
Random Forest,1.0,0.875
Logistic Regression,1.0,0.781944
Lightgbm,0.99989,0.999764


### Результат

In [57]:
stat_auc

Unnamed: 0,Unnamed: 1,Train,Test
TSP,Random Forest,1.0,0.686007
TSP,Logistic Regression,0.946272,0.711946
TSP,Lightgbm,0.994864,0.95285
SO2,Random Forest,1.0,0.785
SO2,Logistic Regression,0.997674,0.808846
SO2,Lightgbm,0.998622,0.988542
PB,Random Forest,1.0,0.65
PB,Logistic Regression,0.995652,0.533333
PB,Lightgbm,0.999896,0.786176
CD,Random Forest,1.0,0.875


In [58]:
stat_auc.T

Unnamed: 0_level_0,TSP,TSP,TSP,SO2,SO2,SO2,PB,PB,PB,CD,CD,CD
Unnamed: 0_level_1,Random Forest,Logistic Regression,Lightgbm,Random Forest,Logistic Regression,Lightgbm,Random Forest,Logistic Regression,Lightgbm,Random Forest,Logistic Regression,Lightgbm
Train,1.0,0.946272,0.994864,1.0,0.997674,0.998622,1.0,0.995652,0.999896,1.0,1.0,0.99989
Test,0.686007,0.711946,0.95285,0.785,0.808846,0.988542,0.65,0.533333,0.786176,0.875,0.781944,0.999764
