In [29]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, GroupKFold
from sklearn import linear_model 
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [30]:
df = pd.read_csv("data_vad.csv")
data = df[(df["Valence"].notna())&(df["Arousal"].notna())].drop("filename", axis = 1)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3632 entries, 10 to 165128
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   msec      3632 non-null   int64  
 1   Corr      3632 non-null   float64
 2   Zyg       3632 non-null   float64
 3   Mas       3632 non-null   float64
 4   Valence   3632 non-null   float64
 5   Arousal   3632 non-null   float64
 6   label     3632 non-null   int64  
 7   batch_id  3632 non-null   object 
dtypes: float64(5), int64(2), object(1)
memory usage: 255.4+ KB


# Разделяем данные

In [31]:
X1 = data[data["label"] == 1]["batch_id"].unique()
X2 = data[data["label"] == 2]["batch_id"].unique()
X3 = data[data["label"] == 3]["batch_id"].unique()

In [32]:
X1_train, X1_test = train_test_split(X1,train_size = 0.7, random_state = 42)
X2_train, X2_test = train_test_split(X2,train_size = 0.7, random_state = 42)
X3_train, X3_test = train_test_split(X3,train_size = 0.7, random_state = 42)
X_train = [*X1_train, *X2_train, *X3_train]
X_test = [*X1_test, *X2_test, *X3_test]
data_train = data[data["batch_id"].apply(lambda x: x in X_train)]
data_test = data[data["batch_id"].apply(lambda x: x in X_test)]

In [33]:
X = data_train.iloc[:,1:4]
X_test = data_test.iloc[:,1:4]
y_av = data_train.iloc[:,4:6]
y_valence = data_train.iloc[:,4:5]
y_arousal = data_train.iloc[:,5:6]
y_valence_test = data_test.iloc[:,4:5]
y_arousal_test = data_test.iloc[:,5:6]
y_av_test = data_test.iloc[:,4:6] 

# Модели

In [25]:
def models_test(X, y, X_test, y_test):
    model_lr = linear_model.LinearRegression()
    model_br = linear_model.BayesianRidge()
    model_en = linear_model.ElasticNet()
    model_svr = SVR()
    model_gbr = GradientBoostingRegressor()
    model_kne = KNeighborsRegressor()
    model_dtr = DecisionTreeRegressor()
    model_rfr = RandomForestRegressor()
    models = [model_lr, model_br, model_en, model_svr, model_gbr, model_kne, model_dtr, model_rfr]
    m = {"R2_train":{},"MAE_train":{},"MSE_train":{}, "R2_test":{},"MAE_test":{},"MSE_test":{}}
    for model in models:
        model.fit(X, y) 
        l = str(model)
        m["R2_train"][l[:l.index('(')]] = r2_score(y, model.predict(X))
        m["MAE_train"][l[:l.index('(')]] = mean_absolute_error(y, model.predict(X))
        m["MSE_train"][l[:l.index('(')]] = mean_squared_error(y, model.predict(X))
        m["R2_test"][l[:l.index('(')]] = r2_score(y_test, model.predict(X_test))
        m["MAE_test"][l[:l.index('(')]] = mean_absolute_error(y_test, model.predict(X_test))
        m["MSE_test"][l[:l.index('(')]] = mean_squared_error(y_test, model.predict(X_test))
    return pd.DataFrame(m)

In [18]:
TestModelsArousal = models_test(X, y_arousal, X_test, y_arousal_test)
TestModelsValence = models_test(X, y_valence, X_test, y_valence_test)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  model.fit(X, y)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  model.fit(X, y)


In [19]:
TestModelsArousal

Unnamed: 0,R2_train,MAE_train,MSE_train,R2_test,MAE_test,MSE_test
LinearRegression,0.109401,0.073219,0.009967,0.00163,0.078125,0.008595
BayesianRidge,0.109398,0.073229,0.009967,0.001609,0.07815,0.008595
ElasticNet,0.061871,0.076248,0.010499,-0.0187,0.080689,0.00877
SVR,0.202261,0.074166,0.008928,-0.079891,0.081039,0.009297
GradientBoostingRegressor,0.427715,0.058242,0.006405,-0.025449,0.074599,0.008828
KNeighborsRegressor,0.481324,0.052917,0.005805,-0.131288,0.076039,0.009739
DecisionTreeRegressor,1.0,0.0,0.0,-1.059136,0.094353,0.017727
RandomForestRegressor,0.897552,0.023239,0.001147,-0.176062,0.077894,0.010125


In [26]:
TestModelsValence

Unnamed: 0,R2_train,MAE_train,MSE_train,R2_test,MAE_test,MSE_test
LinearRegression,0.292631,0.245319,0.099994,0.031792,0.34496,0.177149
BayesianRidge,0.292628,0.245287,0.099994,0.032476,0.344764,0.177024
ElasticNet,0.280653,0.247715,0.101687,0.055822,0.337354,0.172752
SVR,0.339913,0.225432,0.09331,0.028515,0.333476,0.177749
GradientBoostingRegressor,0.528358,0.198331,0.066671,0.158405,0.31466,0.153983
KNeighborsRegressor,0.596347,0.176434,0.057061,0.057931,0.323869,0.172367
DecisionTreeRegressor,1.0,0.0,0.0,-0.373049,0.362907,0.251221
RandomForestRegressor,0.920841,0.077293,0.01119,0.10671,0.31541,0.163442


# Пробую написать собственный KFoldGroup для 4х фолдов(плохая версия)

In [9]:
def my_KFoldGroup(model, X_train, data):
    arr_r2_train = []
    arr_r2_test = []
    for j in range(3, len(X_train) + 1,4):
        #Выделяем подвыборку
        start = np.array(X_train[:j - 3])
        finish = np.array(X_train[j + 1:])
        meadle = X_train[j - 3:j+1]
        val_signal_train = np.concatenate((start, finish), axis = 0)
        val_signal_test = meadle
        data_train = data[data["batch_id"].apply(lambda x: x in val_signal_train)]
        data_test = data[data["batch_id"].apply(lambda x: x in val_signal_test)]
        X = data_train.iloc[:,1:4]
        X_test = data_test.iloc[:,1:4]
        y_valence = data_train.iloc[:,4:5]
        y_valence_test = data_test.iloc[:,4:5]
        #Обучаю подвыборку
        model.fit(X, y_valence)
        #Оценка качества на тренировочной и тестовой выборках
        arr_r2_train.append(r2_score(y_valence, model.predict(X)))
        arr_r2_test.append(r2_score(y_valence_test, model.predict(X_test)))
    return {'r2_train_valence':np.array(arr_r2_train), 'r2_test_valence':np.array(arr_r2_test)}

# DecisionTreeRegressor подбор гиперпараметрa max_depth

In [102]:
best_k = 0
best_r2 = 0
d_nw = {'r2_train_valence':[] , 'r2_test_valence': [], "params":[]}
for i in range(1,20, 2):
    d = my_KFoldGroup(DecisionTreeRegressor(max_depth= i), X_train, data)
    d_nw['r2_train_valence'].append(d['r2_train_valence'])
    d_nw['r2_test_valence'].append(d['r2_test_valence'])
    d_nw['params'].append(i)
params_data = pd.DataFrame(d_nw)

In [103]:
params_data

Unnamed: 0,r2_train_valence,r2_test_valence,params
0,0.226206,-1.114227,1
1,0.324154,-1.32687,3
2,0.462893,-1.569683,5
3,0.553197,-2.034895,7
4,0.653423,-2.458154,9
5,0.75759,-3.071619,11
6,0.848002,-3.637565,13
7,0.913363,-3.894417,15
8,0.956109,-4.261391,17
9,0.980699,-4.385461,19


# KNeighborsRegressor подбор гиперпараметрa n_neighbors

In [107]:
best_k = 0
best_r2 = 0
d_nw = {'r2_train_valence':[] , 'r2_test_valence': [], "params":[]}
for i in range(1,201, 10):
    d = my_KFoldGroup(KNeighborsRegressor(n_neighbors = i), X_train, data)
    d_nw['r2_train_valence'].append(d['r2_train_valence'])
    d_nw['r2_test_valence'].append(d['r2_test_valence'])
    d_nw['params'].append(i)
pd.DataFrame(d_nw)

Unnamed: 0,r2_train_valence,r2_test_valence,params
0,1.0,-4.68786,1
1,0.527678,-1.801758,11
2,0.487814,-1.577523,21
3,0.473459,-1.494886,31
4,0.461016,-1.446976,41
5,0.451768,-1.420079,51
6,0.442296,-1.379172,61
7,0.434387,-1.352712,71
8,0.428492,-1.333587,81
9,0.420931,-1.316908,91


# RandomForestRegressor 

In [109]:
best_k = 0
best_r2 = 0
d_nw = {'r2_train_valence':[] , 'r2_test_valence': [], "params":[]}
for i in range(1,52, 10):
    d = my_KFoldGroup(RandomForestRegressor(max_depth = i), X_train, data)
    d_nw['r2_train_valence'].append(d['r2_train_valence'])
    d_nw['r2_test_valence'].append(d['r2_test_valence'])
    d_nw['params'].append(i)
params_data = pd.DataFrame(d_nw)

  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)


  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)


  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)


  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)


  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)


In [110]:
params_data

Unnamed: 0,r2_train_valence,r2_test_valence,params
0,0.23625,-1.093981,1
1,0.780147,-1.715142,11
2,0.916704,-1.89258,21
3,0.920795,-1.934508,31
4,0.921042,-1.985425,41
5,0.921236,-1.999073,51


In [117]:
best_k = 0
best_r2 = 0
d_nw = {'r2_train_valence':[] , 'r2_test_valence': [], "params":[]}
for i in range(1, 11, 2):
    d = my_KFoldGroup(RandomForestRegressor(max_depth = i), X_train, data)
    d_nw['r2_train_valence'].append(d['r2_train_valence'])
    d_nw['r2_test_valence'].append(d['r2_test_valence'])
    d_nw['params'].append(i)
params_data2 = pd.DataFrame(d_nw)

  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)


  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)


  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)


  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)


  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)


In [118]:
params_data2

Unnamed: 0,r2_train_valence,r2_test_valence,params
0,0.236487,-1.099123,1
1,0.363546,-1.115405,3
2,0.492761,-1.363415,5
3,0.592781,-1.478322,7
4,0.693176,-1.567911,9


In [114]:
my_KFoldGroup(RandomForestRegressor(), X_train, data)

  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)


{'r2_train_valence': 0.9207981239446777, 'r2_test_valence': -1.934429337616865}

In [119]:
best_k = 0
best_r2 = 0
d_nw = {'r2_train_valence':[] , 'r2_test_valence': [], "params":[]}
for i in range(10, 1000, 100):
    d = my_KFoldGroup(RandomForestRegressor(max_depth = 4, n_estimators = i), X_train, data)
    d_nw['r2_train_valence'].append(d['r2_train_valence'])
    d_nw['r2_test_valence'].append(d['r2_test_valence'])
    d_nw['params'].append(i)
params_data2 = pd.DataFrame(d_nw)

  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)


  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)


  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)


  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)


  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)


  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)


  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)


  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)


  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)


In [120]:
params_data2

Unnamed: 0,r2_train_valence,r2_test_valence,params
0,0.438544,-1.191695,10
1,0.442344,-1.196653,110
2,0.442779,-1.201779,210
3,0.443159,-1.19704,310
4,0.442941,-1.202545,410
5,0.442723,-1.198587,510
6,0.443222,-1.196257,610
7,0.443034,-1.201288,710
8,0.44316,-1.199805,810
9,0.44293,-1.202426,910


In [123]:
best_k = 0
best_r2 = 0
d_nw = {'r2_train_valence':[] , 'r2_test_valence': [], "params":[]}
for i in range(605, 615, 2):
    d = my_KFoldGroup(RandomForestRegressor(max_depth = 4, n_estimators = i), X_train, data)
    d_nw['r2_train_valence'].append(d['r2_train_valence'])
    d_nw['r2_test_valence'].append(d['r2_test_valence'])
    d_nw['params'].append(i)
params_data2 = pd.DataFrame(d_nw)

  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)


  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)


  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)


  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)


  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)
  model.fit(X, y_valence)


In [124]:
params_data2

Unnamed: 0,r2_train_valence,r2_test_valence,params
0,0.442984,-1.199433,605
1,0.443211,-1.194987,607
2,0.443109,-1.191115,609
3,0.442917,-1.20138,611
4,0.443027,-1.20076,613


# Доля отложной и обучающейся выборки на каждой итерации

In [27]:
def contents_fold(cv, data):
    X1 = data[data["label"] == 1]["batch_id"].unique()
    X2 = data[data["label"] == 2]["batch_id"].unique()
    X3 = data[data["label"] == 3]["batch_id"].unique()
    a1 = np.array_split(X1, cv)
    a2 = np.array_split(X2, cv)
    a3 = np.array_split(X3, cv)
    n = data.shape[0]
    d = {'train':[], 'test':[]}
    for i in range(cv):
        X_test = [*a1[i], *a2[i], *a3[i]] 
        X_train = np.concatenate((*a1[i + 1:], *a2[i + 1:], *a3[i + 1:], *a1[:i], *a2[:i], *a3[:i]), axis = 0)
        data_train = data[data["batch_id"].apply(lambda x: x in X_train)]
        data_test = data[data["batch_id"].apply(lambda x: x in X_test)]
        d['train'].append(data_train.shape[0]/n)
        d['test'].append(data_test.shape[0]/n)
    return d

# Доработанная под эту задачу KFold

In [38]:
def my_KFold(model, cv, data):
    arr_r2_train = []
    arr_r2_test = []
    X1 = data[data["label"] == 1]["batch_id"].unique()
    X2 = data[data["label"] == 2]["batch_id"].unique()
    X3 = data[data["label"] == 3]["batch_id"].unique()
    a1 = np.array_split(X1, cv)
    a2 = np.array_split(X2, cv)
    a3 = np.array_split(X3, cv)
    n = data.shape[0]
    d = {'train':[], 'test':[]}
    for i in range(cv):
        X_deffer = [*a1[i], *a2[i], *a3[i]] 
        X_train = np.concatenate((*a1[i + 1:], *a2[i + 1:], *a3[i + 1:], *a1[:i], *a2[:i], *a3[:i]), axis = 0)
        data_train = data[data["batch_id"].apply(lambda x: x in X_train)]
        data_test = data[data["batch_id"].apply(lambda x: x in X_deffer)]
        X = data_train.iloc[:,1:4]
        X_test = data_test.iloc[:,1:4]
        y_valence = data_train.iloc[:,4:5]
        y_valence_test = data_test.iloc[:,4:5]
        #Обучение фолда
        model.fit(X, y_valence)
        #Оценка качества на тренировочной и отложенной выборках
        arr_r2_train.append(r2_score(y_valence, model.predict(X)))
        arr_r2_test.append(r2_score(y_valence_test, model.predict(X_test)))
    return {'r2_train_valence':np.mean(np.array(arr_r2_train)), 'r2_test_valence':np.mean(np.array(arr_r2_test))}

In [39]:
best_k = 0
best_r2 = 0
d_nw = {'r2_train_valence':[] , 'r2_test_valence': [], "params":[]}
for i in range(1,201, 10):
    d = my_KFold(KNeighborsRegressor(n_neighbors = i), 5, data_train)
    d_nw['r2_train_valence'].append(d['r2_train_valence'])
    d_nw['r2_test_valence'].append(d['r2_test_valence'])
    d_nw['params'].append(i)
pd.DataFrame(d_nw)

Unnamed: 0,r2_train_valence,r2_test_valence,params
0,1.0,-0.466612,1
1,0.532516,0.23662,11
2,0.492711,0.275895,21
3,0.477787,0.286896,31
4,0.464168,0.292095,41
5,0.451883,0.294979,51
6,0.441529,0.298343,61
7,0.433052,0.296542,71
8,0.42515,0.294755,81
9,0.418403,0.294299,91
