In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings('ignore')

In [9]:
df = pd.read_csv("data_vad.csv")
data = df[(df["Valence"].notna())&(df["Arousal"].notna())].drop(["filename"], axis = 1)

### Количество шагов

In [10]:
def lag_count(df_new, batch_id = str):
    data_id = df_new[df_new['batch_id'] == batch_id]
    d = {'Corr': 1, 'Zyg': 2, 'Mas': 3}
    d_lag = {'Corr':int, 'Zyg':int, 'Mas':int}
    for name_column in d.keys():
        max_corr = corr = data_id.corr().loc[name_column, name_column]
        period = 1
        exp = np.exp(1)
        j = d[name_column]
        while corr > max_corr//exp and period < data_id.shape[0]:
            data_id[name_column + '_lag' + str(period)] = data_id.iloc[:,j:(j + 1)].shift(period)
            corr = data_id.corr().loc[name_column + '_lag' + str(period), name_column]
            period += 1
        d_lag[name_column] = period - 1
    return d_lag

In [11]:
arr_corr = []
arr_zyg = []
arr_mas = []

In [12]:
for batch_id in data["batch_id"].unique():
    d = lag_count(data, batch_id)
    arr_corr.append(d["Corr"])
    arr_zyg.append(d["Zyg"])
    arr_mas.append(d["Mas"])

In [13]:
print(f'Количество шагов для Corr: {round(np.array(arr_corr).mean(), 0)}')
print(f'Количество шагов для Zyg: {round(np.array(arr_zyg).mean(), 0)}')
print(f'Количество шагов для Mas: {round(np.array(arr_mas).mean(), 0)}')

Количество шагов для Corr: 6.0
Количество шагов для Zyg: 6.0
Количество шагов для Mas: 5.0


### Создадим DataFrame погружения

In [14]:
def delay_embedding(df_new, batch_id = str):
    data_id = df_new[df_new['batch_id'] == batch_id]
    d = {'Corr': 1, 'Zyg': 2, 'Mas': 3}
    d_lag = {'Corr': 6, 'Zyg': 6, 'Mas': 5}
    for name_column in d.keys():
        max_corr = corr = data_id.corr().loc[name_column, name_column]
        period = 1
        exp = np.exp(1)
        j = d[name_column]
        while period <= d_lag[name_column]:
            data_id[name_column + '_lag' + str(period)] = data_id.iloc[:,j:(j + 1)].shift(period)
            corr = data_id.corr().loc[name_column + '_lag' + str(period), name_column]
            period += 1
    return data_id.dropna()

In [15]:
arr_id = data["batch_id"].unique()
df_lagged = delay_embedding(data, batch_id = arr_id[0])
for batch_id in arr_id[1:]:
    df_lagged = pd.concat([df_lagged, delay_embedding(data, batch_id)], ignore_index=True)
df_lagged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2654 entries, 0 to 2653
Data columns (total 25 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   msec       2654 non-null   int64  
 1   Corr       2654 non-null   float64
 2   Zyg        2654 non-null   float64
 3   Mas        2654 non-null   float64
 4   Valence    2654 non-null   float64
 5   Arousal    2654 non-null   float64
 6   label      2654 non-null   int64  
 7   batch_id   2654 non-null   object 
 8   Corr_lag1  2654 non-null   float64
 9   Corr_lag2  2654 non-null   float64
 10  Corr_lag3  2654 non-null   float64
 11  Corr_lag4  2654 non-null   float64
 12  Corr_lag5  2654 non-null   float64
 13  Corr_lag6  2654 non-null   float64
 14  Zyg_lag1   2654 non-null   float64
 15  Zyg_lag2   2654 non-null   float64
 16  Zyg_lag3   2654 non-null   float64
 17  Zyg_lag4   2654 non-null   float64
 18  Zyg_lag5   2654 non-null   float64
 19  Zyg_lag6   2654 non-null   float64
 20  Mas_lag1

### Сохранение дата-сета в CSV файл

In [17]:
df_lagged.to_csv('data_lagged.csv')