In [13]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, GroupKFold
from sklearn import linear_model 
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [14]:
df = pd.read_csv("data_vad.csv")
data = df[(df["Valence"].notna())&(df["Arousal"].notna())].drop("filename", axis = 1)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3632 entries, 10 to 165128
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   msec      3632 non-null   int64  
 1   Corr      3632 non-null   float64
 2   Zyg       3632 non-null   float64
 3   Mas       3632 non-null   float64
 4   Valence   3632 non-null   float64
 5   Arousal   3632 non-null   float64
 6   label     3632 non-null   int64  
 7   batch_id  3632 non-null   object 
dtypes: float64(5), int64(2), object(1)
memory usage: 255.4+ KB


In [15]:
df.batch_id.astype('str')

0          3_0
1          3_0
2          3_0
3          3_0
4          3_0
          ... 
165160    1_53
165161    1_53
165162    1_53
165163    1_53
165164    1_53
Name: batch_id, Length: 165165, dtype: object

# Разделяем данные

In [36]:
X1 = data[data["label"] == 1]["batch_id"].unique()
X2 = data[data["label"] == 2]["batch_id"].unique()
X3 = data[data["label"] == 3]["batch_id"].unique()

In [37]:
X1_train, X1_test = train_test_split(X1,train_size = 0.7, random_state = 42)
X2_train, X2_test = train_test_split(X2,train_size = 0.7, random_state = 42)
X3_train, X3_test = train_test_split(X3,train_size = 0.7, random_state = 42)
X_train = [*X1_train, *X2_train, *X3_train]
X_test = [*X1_test, *X2_test, *X3_test]
data_train = data[data["batch_id"].apply(lambda x: x in X_train)]
data_test = data[data["batch_id"].apply(lambda x: x in X_test)]

In [18]:
X = data_train.iloc[:,1:4]
X_test = data_test.iloc[:,1:4]
y_valence = data_train.iloc[:,4:5]
y_arousal = data_train.iloc[:,5:6]
y_valence_test = data_test.iloc[:,4:5]
y_arousal_test = data_test.iloc[:,5:6]

# Модели до кросс-валидации

In [19]:
def models_test(X, y, X_test, y_test):
    model_lr = linear_model.LinearRegression()
    model_br = linear_model.BayesianRidge()
    model_en = linear_model.ElasticNet()
    model_svr = SVR()
    model_gbr = GradientBoostingRegressor()
    model_kne = KNeighborsRegressor()
    model_dtr = DecisionTreeRegressor()
    model_rfr = RandomForestRegressor()
    models = [model_lr, model_br, model_en, model_svr, model_gbr, model_kne, model_dtr, model_rfr]
    m = {"R2_train":{},"MAE_train":{},"MSE_train":{}, "R2_test":{},"MAE_test":{},"MSE_test":{}}
    for model in models:
        model.fit(X, y) 
        l = str(model)
        m["R2_train"][l[:l.index('(')]] = r2_score(y, model.predict(X))
        m["MAE_train"][l[:l.index('(')]] = mean_absolute_error(y, model.predict(X))
        m["MSE_train"][l[:l.index('(')]] = mean_squared_error(y, model.predict(X))
        m["R2_test"][l[:l.index('(')]] = r2_score(y_test, model.predict(X_test))
        m["MAE_test"][l[:l.index('(')]] = mean_absolute_error(y_test, model.predict(X_test))
        m["MSE_test"][l[:l.index('(')]] = mean_squared_error(y_test, model.predict(X_test))
    return pd.DataFrame(m)

In [9]:
TestModelsArousal = models_test(X, y_arousal, X_test, y_arousal_test)
TestModelsValence = models_test(X, y_valence, X_test, y_valence_test)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  model.fit(X, y)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  model.fit(X, y)


In [10]:
TestModelsArousal

Unnamed: 0,R2_train,MAE_train,MSE_train,R2_test,MAE_test,MSE_test
LinearRegression,0.109401,0.073219,0.009967,0.00163,0.078125,0.008595
BayesianRidge,0.109398,0.073229,0.009967,0.001609,0.07815,0.008595
ElasticNet,0.061871,0.076248,0.010499,-0.0187,0.080689,0.00877
SVR,0.202261,0.074166,0.008928,-0.079891,0.081039,0.009297
GradientBoostingRegressor,0.427715,0.058242,0.006405,-0.027972,0.074649,0.00885
KNeighborsRegressor,0.481324,0.052917,0.005805,-0.131288,0.076039,0.009739
DecisionTreeRegressor,1.0,0.0,0.0,-1.091646,0.095523,0.018007
RandomForestRegressor,0.899221,0.023119,0.001128,-0.177991,0.077915,0.010141


In [11]:
TestModelsValence

Unnamed: 0,R2_train,MAE_train,MSE_train,R2_test,MAE_test,MSE_test
LinearRegression,0.292631,0.245319,0.099994,0.031792,0.34496,0.177149
BayesianRidge,0.292628,0.245287,0.099994,0.032476,0.344764,0.177024
ElasticNet,0.280653,0.247715,0.101687,0.055822,0.337354,0.172752
SVR,0.339913,0.225432,0.09331,0.028515,0.333476,0.177749
GradientBoostingRegressor,0.528358,0.198331,0.066671,0.158708,0.3146,0.153928
KNeighborsRegressor,0.596347,0.176434,0.057061,0.057931,0.323869,0.172367
DecisionTreeRegressor,1.0,0.0,0.0,-0.369385,0.364716,0.250551
RandomForestRegressor,0.921361,0.07701,0.011116,0.112951,0.314501,0.1623


# Модели после кросс-валидации

In [14]:
def models_test_val(X, y, X_test, y_test):
    model_lr = linear_model.LinearRegression()
    model_br = linear_model.BayesianRidge()
    model_en = linear_model.ElasticNet()
    model_svr = SVR(kernel = 'linear')
    model_gbr = GradientBoostingRegressor(max_depth = 2, max_features = 2)
    model_kne = KNeighborsRegressor(n_neighbors = 101)
    model_dtr = DecisionTreeRegressor(max_depth = 4)
    model_rfr = RandomForestRegressor(max_depth = 4, n_estimators = 200, max_features = 7)
    models = [model_lr, model_br, model_en, model_svr, model_gbr, model_kne, model_dtr, model_rfr]
    m = {"R2_train":{},"MAE_train":{},"MSE_train":{}, "R2_test":{},"MAE_test":{},"MSE_test":{}}
    for model in models:
        model.fit(X, y) 
        l = str(model)
        m["R2_train"][l[:l.index('(')]] = r2_score(y, model.predict(X))
        m["MAE_train"][l[:l.index('(')]] = mean_absolute_error(y, model.predict(X))
        m["MSE_train"][l[:l.index('(')]] = mean_squared_error(y, model.predict(X))
        m["R2_test"][l[:l.index('(')]] = r2_score(y_test, model.predict(X_test))
        m["MAE_test"][l[:l.index('(')]] = mean_absolute_error(y_test, model.predict(X_test))
        m["MSE_test"][l[:l.index('(')]] = mean_squared_error(y_test, model.predict(X_test))
    return pd.DataFrame(m)

In [15]:
ModelsValenceCrossVal = models_test_val(X, y_valence, X_test, y_valence_test)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  model.fit(X, y)


In [20]:
def models_test_ar(X, y, X_test, y_test):
    model_lr = linear_model.LinearRegression()
    model_br = linear_model.BayesianRidge()
    model_en = linear_model.ElasticNet()
    model_svr = SVR(kernel = 'linear')
    model_gbr = GradientBoostingRegressor(max_features = 1, max_depth = 1, n_estimators = 95)
    model_kne = KNeighborsRegressor(n_neighbors = 61)
    model_dtr = DecisionTreeRegressor(max_depth = 2)
    model_rfr = RandomForestRegressor(max_depth = 2, n_estimators = 120, max_features = 1)
    models = [model_lr, model_br, model_en, model_svr, model_gbr, model_kne, model_dtr, model_rfr]
    m = {"R2_train":{},"MAE_train":{},"MSE_train":{}, "R2_test":{},"MAE_test":{},"MSE_test":{}}
    for model in models:
        model.fit(X, y) 
        l = str(model)
        m["R2_train"][l[:l.index('(')]] = r2_score(y, model.predict(X))
        m["MAE_train"][l[:l.index('(')]] = mean_absolute_error(y, model.predict(X))
        m["MSE_train"][l[:l.index('(')]] = mean_squared_error(y, model.predict(X))
        m["R2_test"][l[:l.index('(')]] = r2_score(y_test, model.predict(X_test))
        m["MAE_test"][l[:l.index('(')]] = mean_absolute_error(y_test, model.predict(X_test))
        m["MSE_test"][l[:l.index('(')]] = mean_squared_error(y_test, model.predict(X_test))
    return pd.DataFrame(m)

In [21]:
ModelsArousalCrossVal = models_test_ar(X, y_arousal, X_test, y_arousal_test)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  model.fit(X, y)


# Arousal до и после кросс-валидация

In [27]:
TestModelsArousal

Unnamed: 0,R2_train,MAE_train,MSE_train,R2_test,MAE_test,MSE_test
LinearRegression,0.109401,0.073219,0.009967,0.00163,0.078125,0.008595
BayesianRidge,0.109398,0.073229,0.009967,0.001609,0.07815,0.008595
ElasticNet,0.061871,0.076248,0.010499,-0.0187,0.080689,0.00877
SVR,0.202261,0.074166,0.008928,-0.079891,0.081039,0.009297
GradientBoostingRegressor,0.427715,0.058242,0.006405,-0.027972,0.074649,0.00885
KNeighborsRegressor,0.481324,0.052917,0.005805,-0.131288,0.076039,0.009739
DecisionTreeRegressor,1.0,0.0,0.0,-1.091646,0.095523,0.018007
RandomForestRegressor,0.899221,0.023119,0.001128,-0.177991,0.077915,0.010141


In [28]:
ModelsArousalCrossVal

Unnamed: 0,R2_train,MAE_train,MSE_train,R2_test,MAE_test,MSE_test
LinearRegression,0.109401,0.073219,0.009967,0.00163,0.078125,0.008595
BayesianRidge,0.109398,0.073229,0.009967,0.001609,0.07815,0.008595
ElasticNet,0.061871,0.076248,0.010499,-0.0187,0.080689,0.00877
SVR,0.092399,0.077732,0.010157,-0.059083,0.083974,0.009118
GradientBoostingRegressor,0.218773,0.069195,0.008743,0.031597,0.07496,0.008337
KNeighborsRegressor,0.26438,0.065172,0.008233,0.087044,0.071637,0.00786
DecisionTreeRegressor,0.154331,0.07226,0.009464,-0.031731,0.078158,0.008882
RandomForestRegressor,0.175578,0.072303,0.009227,0.005332,0.078679,0.008563


# Valence до и после кросс-валидация

In [29]:
TestModelsValence

Unnamed: 0,R2_train,MAE_train,MSE_train,R2_test,MAE_test,MSE_test
LinearRegression,0.292631,0.245319,0.099994,0.031792,0.34496,0.177149
BayesianRidge,0.292628,0.245287,0.099994,0.032476,0.344764,0.177024
ElasticNet,0.280653,0.247715,0.101687,0.055822,0.337354,0.172752
SVR,0.339913,0.225432,0.09331,0.028515,0.333476,0.177749
GradientBoostingRegressor,0.528358,0.198331,0.066671,0.158708,0.3146,0.153928
KNeighborsRegressor,0.596347,0.176434,0.057061,0.057931,0.323869,0.172367
DecisionTreeRegressor,1.0,0.0,0.0,-0.369385,0.364716,0.250551
RandomForestRegressor,0.921361,0.07701,0.011116,0.112951,0.314501,0.1623


In [31]:
ModelsValenceCrossVal

Unnamed: 0,R2_train,MAE_train,MSE_train,R2_test,MAE_test,MSE_test
LinearRegression,0.292631,0.245319,0.099994,0.031792,0.34496,0.177149
BayesianRidge,0.292628,0.245287,0.099994,0.032476,0.344764,0.177024
ElasticNet,0.280653,0.247715,0.101687,0.055822,0.337354,0.172752
SVR,0.282051,0.241916,0.101489,0.040378,0.336492,0.175578
GradientBoostingRegressor,0.449128,0.215514,0.077871,0.130426,0.321702,0.159102
KNeighborsRegressor,0.413427,0.225805,0.082918,0.097996,0.329928,0.165036
DecisionTreeRegressor,0.418043,0.221245,0.082266,0.182519,0.309654,0.149571
RandomForestRegressor,0.441378,0.217137,0.078967,0.197139,0.308319,0.146896


# Программа которой производился подбор параметров

#### Доля отложенной и обучающейся выборки на каждой итерации

In [24]:
def contents_fold(cv, data):
    X1 = data[data["label"] == 1]["batch_id"].unique()
    X2 = data[data["label"] == 2]["batch_id"].unique()
    X3 = data[data["label"] == 3]["batch_id"].unique()
    a1 = np.array_split(X1, cv)
    a2 = np.array_split(X2, cv)
    a3 = np.array_split(X3, cv)
    n = data.shape[0]
    d = {'train':[], 'test':[]}
    for i in range(cv):
        X_test = [*a1[i], *a2[i], *a3[i]] 
        X_train = np.concatenate((*a1[i + 1:], *a2[i + 1:], *a3[i + 1:], *a1[:i], *a2[:i], *a3[:i]), axis = 0)
        data_train = data[data["batch_id"].apply(lambda x: x in X_train)]
        data_test = data[data["batch_id"].apply(lambda x: x in X_test)]
        d['train'].append(data_train.shape[0]/n)
        d['test'].append(data_test.shape[0]/n)
    return d

In [32]:
def contents_fold_no_label(cv, data):
    X1 = np.array(data['batch_id'])
    a = np.array_split(X1, cv)
    n = data.shape[0]
    d = {'train':[], 'test':[]}
    for i in range(len(a)):
        X_test = a[i] 
        X_train = np.concatenate((*a[i + 1:], *a[:i]), axis = 0)
        data_train = data[data["batch_id"].apply(lambda x: x in X_train)]
        data_test = data[data["batch_id"].apply(lambda x: x in X_test)]
        d['train'].append(data_train.shape[0]/n)
        d['test'].append(data_test.shape[0]/n)
    return d

In [33]:
contents_fold_no_label(5, data)

{'train': [0.8047907488986784,
  0.8031387665198237,
  0.8061674008810573,
  0.8056167400881057,
  0.8056167400881057],
 'test': [0.20154185022026433,
  0.20952643171806168,
  0.2064977973568282,
  0.20704845814977973,
  0.200715859030837]}

In [23]:
contents_fold(5, data)

{'train': [0.79818281938326,
  0.7959801762114538,
  0.7954295154185022,
  0.8050660792951542,
  0.8053414096916299],
 'test': [0.2018171806167401,
  0.20401982378854625,
  0.2045704845814978,
  0.1949339207048458,
  0.19465859030837004]}

#### Доработанная под эту задачу KFold

In [8]:
def my_KFold(model, cv, data, name):
    k = 4
    if name == 'Arousal':
        k = 5
    arr_r2_train = []
    arr_r2_test = []
    arr_mae_train = []
    arr_mae_test = []
    arr_mse_train = []
    arr_mse_test = []
    #Разделение с учетом экспериментов
    X1 = data[data["label"] == 1]["batch_id"].unique()
    X2 = data[data["label"] == 2]["batch_id"].unique()
    X3 = data[data["label"] == 3]["batch_id"].unique()
    a1 = np.array_split(X1, cv)
    a2 = np.array_split(X2, cv)
    a3 = np.array_split(X3, cv)
    for i in range(cv):
        X_deffer = [*a1[i], *a2[i], *a3[i]] 
        X_train = np.concatenate((*a1[i + 1:], *a2[i + 1:], *a3[i + 1:], *a1[:i], *a2[:i], *a3[:i]), axis = 0)
        data_train = data[data["batch_id"].apply(lambda x: x in X_train)]
        data_test = data[data["batch_id"].apply(lambda x: x in X_deffer)]
        X = data_train.iloc[:,1:4]
        X_test = data_test.iloc[:,1:4]
        y = data_train.iloc[:,k:k + 1]
        y_test = data_test.iloc[:,k:k + 1]
        #Обучение фолда
        model.fit(X, y)
        #Оценка качества на тренировочной и отложенной выборках
        arr_r2_train.append(r2_score(y, model.predict(X)))
        arr_r2_test.append(r2_score(y_test, model.predict(X_test)))
        arr_mae_train.append(mean_absolute_error(y, model.predict(X)))
        arr_mae_test.append(mean_absolute_error(y_test, model.predict(X_test)))
        arr_mse_train.append(mean_squared_error(y, model.predict(X)))
        arr_mse_test.append(mean_squared_error(y_test, model.predict(X_test)))
    d = {'r2_train':np.mean(np.array(arr_r2_train)),
         'r2_test':np.mean(np.array(arr_r2_test)),
         'mae_train':np.mean(np.array(arr_mae_train)),
         'mae_test':np.mean(np.array(arr_mae_test)),
         'mse_train':np.mean(np.array(arr_mse_train)),
         'mse_test':np.mean(np.array(arr_mse_test))}
    return d

In [35]:
def my_KFold_2(model, cv, data, name):
    k = 4
    if name == 'Arousal':
        k = 5
    arr_r2_train = []
    arr_r2_test = []
    arr_mae_train = []
    arr_mae_test = []
    arr_mse_train = []
    arr_mse_test = []
    #Разделение с учетом экспериментов
    X1 = np.array(data["batch_id"])
    a = np.array_split(X1, cv)
    for i in range(cv):
        X_deffer = a[i] 
        X_train = np.concatenate((*a[i + 1:], *a[:i]), axis = 0)
        data_train = data[data["batch_id"].apply(lambda x: x in X_train)]
        data_test = data[data["batch_id"].apply(lambda x: x in X_deffer)]
        X = data_train.iloc[:,1:4]
        X_test = data_test.iloc[:,1:4]
        y = data_train.iloc[:,k:k + 1]
        y_test = data_test.iloc[:,k:k + 1]
        #Обучение фолда
        model.fit(X, y)
        #Оценка качества на тренировочной и отложенной выборках
        arr_r2_train.append(r2_score(y, model.predict(X)))
        arr_r2_test.append(r2_score(y_test, model.predict(X_test)))
        arr_mae_train.append(mean_absolute_error(y, model.predict(X)))
        arr_mae_test.append(mean_absolute_error(y_test, model.predict(X_test)))
        arr_mse_train.append(mean_squared_error(y, model.predict(X)))
        arr_mse_test.append(mean_squared_error(y_test, model.predict(X_test)))
    d = {'r2_train':np.mean(np.array(arr_r2_train)),
         'r2_test':np.mean(np.array(arr_r2_test)),
         'mae_train':np.mean(np.array(arr_mae_train)),
         'mae_test':np.mean(np.array(arr_mae_test)),
         'mse_train':np.mean(np.array(arr_mse_train)),
         'mse_test':np.mean(np.array(arr_mse_test))}
    return d