In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, GroupKFold
from sklearn import linear_model 
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [24]:
df = pd.read_csv("data_vad.csv")
data = df[(df["Valence"].notna())&(df["Arousal"].notna())].drop("filename", axis = 1)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3632 entries, 10 to 165128
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   msec      3632 non-null   int64  
 1   Corr      3632 non-null   float64
 2   Zyg       3632 non-null   float64
 3   Mas       3632 non-null   float64
 4   Valence   3632 non-null   float64
 5   Arousal   3632 non-null   float64
 6   label     3632 non-null   int64  
 7   batch_id  3632 non-null   object 
dtypes: float64(5), int64(2), object(1)
memory usage: 255.4+ KB


In [25]:
data.corr()

Unnamed: 0,msec,Corr,Zyg,Mas,Valence,Arousal,label
msec,1.0,-0.004698,-0.044648,-0.047304,0.022701,0.057306,0.010783
Corr,-0.004698,1.0,-0.213923,-0.098048,-0.156296,0.259311,0.077368
Zyg,-0.044648,-0.213923,1.0,0.299039,0.439574,-0.030578,-0.188074
Mas,-0.047304,-0.098048,0.299039,1.0,0.058302,-0.161369,-0.258221
Valence,0.022701,-0.156296,0.439574,0.058302,1.0,-0.056596,-0.387387
Arousal,0.057306,0.259311,-0.030578,-0.161369,-0.056596,1.0,0.202731
label,0.010783,0.077368,-0.188074,-0.258221,-0.387387,0.202731,1.0


# Разделяем данные

In [5]:
X1 = data[data["label"] == 1]["batch_id"].unique()
X2 = data[data["label"] == 2]["batch_id"].unique()
X3 = data[data["label"] == 3]["batch_id"].unique()

In [6]:
X1_train, X1_test = train_test_split(X1,train_size = 0.7, random_state = 42)
X2_train, X2_test = train_test_split(X2,train_size = 0.7, random_state = 42)
X3_train, X3_test = train_test_split(X3,train_size = 0.7, random_state = 42)
X_train = [*X1_train, *X2_train, *X3_train]
X_test = [*X1_test, *X2_test, *X3_test]
data_train = data[data["batch_id"].apply(lambda x: x in X_train)]
data_test = data[data["batch_id"].apply(lambda x: x in X_test)]

In [7]:
X_msec = data_train.iloc[:,:1]
X_test_msec = data_test.iloc[:,:1]
X_corr = data_train.iloc[:,1:2]
X_test_corr = data_test.iloc[:,1:2]
X_zyg = data_train.iloc[:,2:3]
X_test_zyg = data_test.iloc[:,2:3]
X_mas = data_train.iloc[:,3:4]
X_test_mas = data_test.iloc[:,3:4]
y_valence = data_train.iloc[:,4:5]
y_arousal = data_train.iloc[:,5:6]
y_valence_test = data_test.iloc[:,4:5]
y_arousal_test = data_test.iloc[:,5:6] 

# Подбор модели

In [14]:
def models_test(X, y, X_test, y_test):
    model_lr = linear_model.LinearRegression()
    model_br = linear_model.BayesianRidge()
    model_en = linear_model.ElasticNet()
    model_svr = SVR()
    model_gbr = GradientBoostingRegressor()
    model_kne = KNeighborsRegressor()
    model_dtr = DecisionTreeRegressor()
    model_rfr = RandomForestRegressor()
    models = [model_lr, model_br, model_en, model_svr, model_gbr, model_kne, model_dtr, model_rfr]
    m = {"R2_train":{},"MAE_train":{},"MSE_train":{}, "R2_test":{},"MAE_test":{},"MSE_test":{}}
    for model in models:
        model.fit(X, y) 
        l = str(model)
        m["R2_train"][l[:l.index('(')]] = r2_score(y, model.predict(X))
        m["MAE_train"][l[:l.index('(')]] = mean_absolute_error(y, model.predict(X))
        m["MSE_train"][l[:l.index('(')]] = mean_squared_error(y, model.predict(X))
        m["R2_test"][l[:l.index('(')]] = r2_score(y_test, model.predict(X_test))
        m["MAE_test"][l[:l.index('(')]] = mean_absolute_error(y_test, model.predict(X_test))
        m["MSE_test"][l[:l.index('(')]] = mean_squared_error(y_test, model.predict(X_test))
    return pd.DataFrame(m)

In [15]:
TestModelsArousal_msec = models_test(X_msec, y_arousal, X_test_msec, y_arousal_test)
TestModelsValence_msec = models_test(X_msec, y_valence, X_test_msec, y_valence_test)
TestModelsArousal_corr = models_test(X_corr, y_arousal, X_test_corr, y_arousal_test)
TestModelsValence_corr = models_test(X_corr, y_valence, X_test_corr, y_valence_test)
TestModelsArousal_zyg = models_test(X_zyg, y_arousal, X_test_zyg, y_arousal_test)
TestModelsValence_zyg = models_test(X_zyg, y_valence, X_test_zyg, y_valence_test)
TestModelsArousal_mas = models_test(X_mas, y_arousal, X_test_mas, y_arousal_test)
TestModelsValence_mas = models_test(X_mas, y_valence, X_test_mas, y_valence_test)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  model.fit(X, y)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  model.fit(X, y)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  model.fit(X, y)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  model.fit(X, y)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  model.fit(X, y)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  model.fit(X, y)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  model.fit(X, y)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  model.fit(X, y)


# Valence

In [16]:
TestModelsValence_msec

Unnamed: 0,R2_train,MAE_train,MSE_train,R2_test,MAE_test,MSE_test
LinearRegression,0.000248,0.279227,0.141325,-0.003661,0.322437,0.183636
BayesianRidge,0.000248,0.279227,0.141325,-0.003661,0.322437,0.183636
ElasticNet,0.000248,0.279227,0.141325,-0.00366,0.322437,0.183635
SVR,0.004053,0.274957,0.140787,0.011755,0.316553,0.180815
GradientBoostingRegressor,0.677442,0.167917,0.045597,-0.113893,0.351597,0.203804
KNeighborsRegressor,0.909142,0.039686,0.012844,-0.339231,0.372872,0.245034
DecisionTreeRegressor,0.999409,0.000494,8.4e-05,-0.375287,0.378027,0.251631
RandomForestRegressor,0.956722,0.02518,0.006118,-0.36301,0.376342,0.249384


In [17]:
TestModelsValence_corr

Unnamed: 0,R2_train,MAE_train,MSE_train,R2_test,MAE_test,MSE_test
LinearRegression,0.056021,0.274226,0.133441,-0.039198,0.332988,0.190138
BayesianRidge,0.056019,0.274219,0.133442,-0.038806,0.332865,0.190066
ElasticNet,0.053378,0.274124,0.133815,-0.025773,0.329184,0.187682
SVR,0.066533,0.26846,0.131955,-0.047893,0.335751,0.191729
GradientBoostingRegressor,0.211332,0.257494,0.111486,-0.082946,0.349923,0.198142
KNeighborsRegressor,0.283947,0.24644,0.101221,-0.213001,0.367455,0.221938
DecisionTreeRegressor,1.0,0.0,0.0,-0.682428,0.420162,0.307827
RandomForestRegressor,0.825231,0.121085,0.024705,-0.35741,0.386181,0.24836


In [18]:
TestModelsValence_zyg

Unnamed: 0,R2_train,MAE_train,MSE_train,R2_test,MAE_test,MSE_test
LinearRegression,0.271261,0.249944,0.103015,0.044106,0.340972,0.174896
BayesianRidge,0.271261,0.249935,0.103015,0.044321,0.340914,0.174857
ElasticNet,0.269652,0.249527,0.103242,0.058893,0.336724,0.17219
SVR,0.249625,0.241129,0.106073,0.056064,0.319367,0.172708
GradientBoostingRegressor,0.398127,0.227208,0.085081,0.092781,0.327452,0.16599
KNeighborsRegressor,0.45096,0.21383,0.077613,-0.022973,0.342713,0.187169
DecisionTreeRegressor,1.0,0.0,0.0,-0.499311,0.388127,0.274323
RandomForestRegressor,0.858693,0.106278,0.019975,-0.183152,0.35828,0.216477


In [19]:
TestModelsValence_mas

Unnamed: 0,R2_train,MAE_train,MSE_train,R2_test,MAE_test,MSE_test
LinearRegression,0.00661,0.281488,0.140426,-0.003772,0.324497,0.183656
BayesianRidge,0.006603,0.28137,0.140427,-0.00352,0.324355,0.18361
ElasticNet,0.001939,0.279471,0.141086,-0.000547,0.322336,0.183066
SVR,0.139861,0.265751,0.12159,-0.013525,0.346239,0.185441
GradientBoostingRegressor,0.249074,0.253599,0.106151,-0.079406,0.365909,0.197494
KNeighborsRegressor,0.342446,0.238243,0.092952,-0.174063,0.37371,0.214814
DecisionTreeRegressor,1.0,0.0,0.0,-0.587984,0.415836,0.290547
RandomForestRegressor,0.83461,0.118226,0.02338,-0.300897,0.385908,0.23802


# Arousal

In [20]:
TestModelsArousal_msec

Unnamed: 0,R2_train,MAE_train,MSE_train,R2_test,MAE_test,MSE_test
LinearRegression,7e-06,0.079081,0.011192,-0.032867,0.081175,0.008892
BayesianRidge,7e-06,0.079081,0.011192,-0.032867,0.081175,0.008892
ElasticNet,7e-06,0.07908,0.011192,-0.032881,0.081176,0.008892
SVR,-0.022595,0.084551,0.011444,-0.127717,0.087222,0.009708
GradientBoostingRegressor,0.75189,0.041185,0.002777,-0.777558,0.09548,0.015303
KNeighborsRegressor,0.948777,0.00773,0.000573,-1.984589,0.11615,0.025694
DecisionTreeRegressor,0.999696,7.9e-05,3e-06,-2.030569,0.116873,0.02609
RandomForestRegressor,0.979239,0.004757,0.000232,-2.01347,0.116536,0.025943


In [21]:
TestModelsArousal_corr

Unnamed: 0,R2_train,MAE_train,MSE_train,R2_test,MAE_test,MSE_test
LinearRegression,0.09511,0.074104,0.010127,-0.04669,0.080188,0.009011
BayesianRidge,0.09511,0.074107,0.010127,-0.046526,0.080189,0.009009
ElasticNet,0.061871,0.076248,0.010499,-0.0187,0.080689,0.00877
SVR,0.069507,0.079826,0.010414,-0.106961,0.085835,0.00953
GradientBoostingRegressor,0.204716,0.069687,0.0089,-0.071837,0.081075,0.009227
KNeighborsRegressor,0.271384,0.066882,0.008154,-0.311169,0.086364,0.011288
DecisionTreeRegressor,1.0,0.0,0.0,-1.063052,0.097936,0.017761
RandomForestRegressor,0.815832,0.032953,0.002061,-0.531928,0.089206,0.013188


In [22]:
TestModelsArousal_zyg

Unnamed: 0,R2_train,MAE_train,MSE_train,R2_test,MAE_test,MSE_test
LinearRegression,0.006132,0.079514,0.011123,-0.058987,0.083234,0.009117
BayesianRidge,0.006132,0.079512,0.011123,-0.05892,0.08323,0.009116
ElasticNet,0.0,0.079065,0.011192,-0.034154,0.081231,0.008903
SVR,0.018879,0.083353,0.01098,-0.140969,0.088023,0.009822
GradientBoostingRegressor,0.193374,0.069963,0.009027,-0.118813,0.082252,0.009632
KNeighborsRegressor,0.2401,0.067557,0.008504,-0.348099,0.087518,0.011606
DecisionTreeRegressor,1.0,0.0,0.0,-1.2756,0.103972,0.01959
RandomForestRegressor,0.798496,0.033797,0.002255,-0.605106,0.09182,0.013818


In [23]:
TestModelsArousal_mas

Unnamed: 0,R2_train,MAE_train,MSE_train,R2_test,MAE_test,MSE_test
LinearRegression,0.032254,0.077759,0.010831,-0.027287,0.08122,0.008844
BayesianRidge,0.032254,0.077755,0.010831,-0.027157,0.081212,0.008843
ElasticNet,0.0,0.079065,0.011192,-0.034154,0.081231,0.008903
SVR,0.105076,0.076852,0.010016,-0.066424,0.081871,0.009181
GradientBoostingRegressor,0.254699,0.067789,0.008341,-0.081915,0.078755,0.009314
KNeighborsRegressor,0.299917,0.065098,0.007835,-0.267875,0.083173,0.010915
DecisionTreeRegressor,1.0,0.0,0.0,-1.317295,0.103106,0.019949
RandomForestRegressor,0.817208,0.032607,0.002046,-0.661659,0.090907,0.014305
