In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GroupKFold, GridSearchCV
from sklearn import linear_model 
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.dummy import DummyRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
import warnings
import pickle as pkl

# Silence every warning category for the rest of the session. This also hides
# warnings emitted by the imported libraries, so problems they report will not
# be visible in the cell outputs below.
warnings.filterwarnings('ignore')

In [4]:
# Read the raw table from disk.
df = pd.read_csv("data_vad.csv")
# Keep only the rows in which both "Valence" and "Arousal" are present,
# then discard the "filename" column.
data = df.dropna(subset=["Valence", "Arousal"]).drop(columns=["filename"])

# Разделяем данные

In [5]:
# Split by batch rather than by row: the unique batch ids of each label value
# (1, 2, 3) are split 70/30 independently, then rows are assigned to the
# train/test frame according to the side their batch id landed on.
# NOTE(review): a batch_id that occurs under more than one label could end up
# in both X_train and X_test — confirm batch ids are label-specific.
X1 = data.loc[data["label"] == 1, "batch_id"].unique()
X2 = data.loc[data["label"] == 2, "batch_id"].unique()
X3 = data.loc[data["label"] == 3, "batch_id"].unique()
X1_train, X1_test = train_test_split(X1, train_size=0.7, random_state=42)
X2_train, X2_test = train_test_split(X2, train_size=0.7, random_state=42)
X3_train, X3_test = train_test_split(X3, train_size=0.7, random_state=42)
X_train = [*X1_train, *X2_train, *X3_train]
X_test = [*X1_test, *X2_test, *X3_test]
# Vectorised membership test; replaces a per-row Python `in` lookup against a
# list. Unlike the per-row lookup, `isin` also matches NaN batch ids.
data_train = data[data["batch_id"].isin(X_train)]
data_test = data[data["batch_id"].isin(X_test)]

In [6]:
# Positional column slices: 1:4 -> feature matrix, 4:5 -> valence target,
# 5:6 -> arousal target. The targets are kept as single-column DataFrames.
# NOTE(review): the positions are hard-coded — confirm they still match the
# column order of `data`.
X, y_valence, y_arousal = (
    data_train.iloc[:, cols] for cols in (slice(1, 4), slice(4, 5), slice(5, 6))
)
# X_test is rebound here: from this point on it is the test feature matrix.
X_test, y_valence_test, y_arousal_test = (
    data_test.iloc[:, cols] for cols in (slice(1, 4), slice(4, 5), slice(5, 6))
)

# Сохранение моделей с подобранными параметрами.

In [7]:
# Models for Valence.
# Every estimator below is fitted on the training features `X` against
# `y_valence` and pickled to "<name>_valence.pkl" in the working directory.
models = [DummyRegressor(),
          linear_model.LinearRegression(),
          linear_model.BayesianRidge(),
          linear_model.ElasticNet(),
          SVR(kernel = 'linear'),
          KNeighborsRegressor(n_neighbors = 59),
          DecisionTreeRegressor(max_depth = 4, random_state = 42),
          RandomForestRegressor(max_depth = 4, n_estimators = 100, random_state = 42),
          GradientBoostingRegressor(max_depth = 3, n_estimators = 100, random_state = 42),
          CatBoostRegressor(random_state = 42, verbose = False, train_dir = 'catboost_logging', iterations = 300, depth = 2),
          LGBMRegressor(max_depth = 2, n_estimators = 100, random_state = 42),
          XGBRegressor(n_estimators = 100, max_depth = 2, random_state = 42)]
for model in models:
    model.fit(X, y_valence)
    # Take the name from the class itself instead of parsing str(model): with
    # the parsing approach any estimator whose repr has no '(' was saved as
    # 'CatBoost_valence.pkl', so a second such estimator would overwrite it.
    # CatBoost keeps its short historical name so existing file names are
    # unchanged.
    if isinstance(model, CatBoostRegressor):
        model_name = 'CatBoost'
    else:
        model_name = type(model).__name__
    filename = model_name + "_valence.pkl"
    with open(filename, 'wb') as f:
        pkl.dump(model, f)

In [8]:
# Models for Arousal.
# Every estimator below is fitted on the training features `X` against
# `y_arousal` and pickled to "<name>_arousal.pkl" in the working directory.
models = [DummyRegressor(),
          linear_model.LinearRegression(),
          linear_model.BayesianRidge(),
          linear_model.ElasticNet(),
          SVR(kernel = 'linear'),
          KNeighborsRegressor(n_neighbors = 102),
          DecisionTreeRegressor(max_depth = 2, random_state = 42),
          RandomForestRegressor(max_depth = 2, n_estimators = 100, random_state = 42),
          GradientBoostingRegressor(max_depth = 1, n_estimators = 100, random_state = 42),
          CatBoostRegressor(random_state = 42, verbose = False, train_dir = 'catboost_logging', iterations = 100, depth = 1),
          LGBMRegressor(max_depth = 1, n_estimators = 100, random_state = 42),
          XGBRegressor(n_estimators = 100, max_depth = 1, random_state = 42)]
for model in models:
    model.fit(X, y_arousal)
    # Take the name from the class itself instead of parsing str(model): with
    # the parsing approach any estimator whose repr has no '(' was saved as
    # 'CatBoost_arousal.pkl', so a second such estimator would overwrite it.
    # CatBoost keeps its short historical name so existing file names are
    # unchanged.
    if isinstance(model, CatBoostRegressor):
        model_name = 'CatBoost'
    else:
        model_name = type(model).__name__
    filename = model_name + "_arousal.pkl"
    with open(filename, 'wb') as f:
        pkl.dump(model, f)