In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GroupKFold, GridSearchCV
from sklearn import linear_model 
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.dummy import DummyRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
import warnings

# Suppress every warning for the rest of the session (no category or module filter is given).
warnings.filterwarnings('ignore')

In [2]:
# Read the raw table, keep only rows where both Valence and Arousal are present,
# and discard the "filename" column.
df = pd.read_csv("data_vad.csv")
data = df.dropna(subset=["Valence", "Arousal"]).drop(columns=["filename"])

# Разделяем данные

In [3]:
# Split at the batch level: for each label (1, 2, 3) take the distinct batch_ids
# and send 70% of them to train and the rest to test (fixed seed for repeatability).
# NOTE(review): this keeps train and test batches disjoint only if a batch_id never
# appears under more than one label — confirm against the data.
X1 = data.loc[data["label"] == 1, "batch_id"].unique()
X2 = data.loc[data["label"] == 2, "batch_id"].unique()
X3 = data.loc[data["label"] == 3, "batch_id"].unique()
X1_train, X1_test = train_test_split(X1, train_size=0.7, random_state=42)
X2_train, X2_test = train_test_split(X2, train_size=0.7, random_state=42)
X3_train, X3_test = train_test_split(X3, train_size=0.7, random_state=42)

# Flat lists of the batch ids assigned to each side.
X_train = [*X1_train, *X2_train, *X3_train]
X_test = [*X1_test, *X2_test, *X3_test]

# Vectorised membership test instead of a per-row Python `x in list` scan.
data_train = data[data["batch_id"].isin(X_train)]
data_test = data[data["batch_id"].isin(X_test)]

In [7]:
# Columns are addressed by position: 1-3 are the features, 4 and 5 the two targets
# (Corr/Zyg/Mas, Valence, Arousal according to the data_train.info() listing).
# Targets are kept as single-column DataFrames, not Series.

# Training part.
X = data_train.iloc[:, [1, 2, 3]]
y_valence = data_train.iloc[:, [4]]
y_arousal = data_train.iloc[:, [5]]

# Held-out part. X_test is rebound here from the list of test batch ids to the feature frame.
X_test = data_test.iloc[:, [1, 2, 3]]
y_valence_test = data_test.iloc[:, [4]]
y_arousal_test = data_test.iloc[:, [5]]

In [8]:
# Print a summary of the training frame: row count, column order, dtypes and non-null counts.
data_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2499 entries, 10 to 165128
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   msec      2499 non-null   int64  
 1   Corr      2499 non-null   float64
 2   Zyg       2499 non-null   float64
 3   Mas       2499 non-null   float64
 4   Valence   2499 non-null   float64
 5   Arousal   2499 non-null   float64
 6   label     2499 non-null   int64  
 7   batch_id  2499 non-null   object 
dtypes: float64(5), int64(2), object(1)
memory usage: 175.7+ KB


## Подбор параметров с помощью GridSearchCV и кросс-валидации GroupKFold

In [5]:
# One batch_id per training row; passed as `groups` to GroupKFold.split below.
groups = data_train['batch_id'].to_numpy(copy=True)
idx = pd.Index(groups)

# Same positional slices as the training features/targets (columns 1-3, 4, 5),
# re-indexed by batch_id instead of the original row labels.
Xb = data_train.iloc[:, [1, 2, 3]].set_index(keys=idx)
yb_valence = data_train.iloc[:, [4]].set_index(keys=idx)
yb_arousal = data_train.iloc[:, [5]].set_index(keys=idx)

### SVR подбор гиперпараметра "kernel"

In [6]:
# Valence: choose the SVR kernel by R^2 under a 5-fold GroupKFold driven by `groups`.
params = dict(kernel=['linear', 'poly', 'rbf', 'sigmoid'])
GKF = GroupKFold(n_splits=5)
cv = GKF.split(Xb, yb_valence, groups)  # one-shot generator of (train, test) index pairs

grid = GridSearchCV(SVR(), params, scoring='r2', cv=cv)
svr_val = grid.fit(Xb, yb_valence)  # fit() returns the search object itself
svr_val.best_params_

{'kernel': 'linear'}

In [7]:
# Arousal: same SVR kernel search, scored by R^2 on grouped 5-fold splits.
params = dict(kernel=['linear', 'poly', 'rbf', 'sigmoid'])
GKF = GroupKFold(n_splits=5)
cv = GKF.split(Xb, yb_arousal, groups)  # one-shot generator of (train, test) index pairs

grid = GridSearchCV(SVR(), params, scoring='r2', cv=cv)
svr_ar = grid.fit(Xb, yb_arousal)
svr_ar.best_params_

{'kernel': 'linear'}

### KNeighborsRegressor подбор гиперпараметра n_neighbors

In [8]:
# Valence: scan n_neighbors = 1..199 for KNN regression, R^2 on grouped 5-fold splits.
params = dict(n_neighbors=np.arange(1, 200))
GKF = GroupKFold(n_splits=5)
cv = GKF.split(Xb, yb_valence, groups)

grid = GridSearchCV(KNeighborsRegressor(), params, scoring='r2', cv=cv)
knn_val = grid.fit(Xb, yb_valence)
knn_val.best_params_

{'n_neighbors': 59}

In [6]:
# Arousal: scan n_neighbors = 1..199 for KNN regression, R^2 on grouped 5-fold splits.
params = dict(n_neighbors=np.arange(1, 200))
GKF = GroupKFold(n_splits=5)
cv = GKF.split(Xb, yb_arousal, groups)

grid = GridSearchCV(KNeighborsRegressor(), params, scoring='r2', cv=cv)
knn_ar = grid.fit(Xb, yb_arousal)
knn_ar.best_params_

{'n_neighbors': 102}

### RandomForestRegressor подбор гиперпараметров max_depth, n_estimators.

In [9]:
# Valence: random forest grid over max_depth 1..6 and n_estimators 100..600 (step 100),
# scored by R^2 on grouped 5-fold splits.
params = dict(
    max_depth=np.arange(1, 7),
    n_estimators=np.arange(100, 700, 100),
)
GKF = GroupKFold(n_splits=5)
cv = GKF.split(Xb, yb_valence, groups)

grid = GridSearchCV(RandomForestRegressor(random_state=42), params, scoring='r2', cv=cv)
rfr_val = grid.fit(Xb, yb_valence)
rfr_val.best_params_

{'max_depth': 4, 'n_estimators': 100}

In [30]:
# Arousal: random forest grid over max_depth 2..5 and n_estimators 100..500 (step 100),
# scored by R^2 on grouped 5-fold splits. The ranges are narrower than in the Valence search.
params = dict(
    max_depth=np.arange(2, 6),
    n_estimators=np.arange(100, 600, 100),
)
GKF = GroupKFold(n_splits=5)
cv = GKF.split(Xb, yb_arousal, groups)

grid = GridSearchCV(RandomForestRegressor(random_state=42), params, scoring='r2', cv=cv)
rfr_ar = grid.fit(Xb, yb_arousal)
rfr_ar.best_params_

{'max_depth': 2, 'n_estimators': 100}

### GradientBoostingRegressor подбор гиперпараметров max_depth, n_estimators.

In [40]:
# Valence: gradient boosting grid over max_depth 1..4 and n_estimators 100..500 (step 100),
# scored by R^2 on grouped 5-fold splits.
params = dict(
    max_depth=np.arange(1, 5),
    n_estimators=np.arange(100, 600, 100),
)
GKF = GroupKFold(n_splits=5)
cv = GKF.split(Xb, yb_valence, groups)

grid = GridSearchCV(GradientBoostingRegressor(random_state=42), params, scoring='r2', cv=cv)
gbr_val = grid.fit(Xb, yb_valence)
gbr_val.best_params_

{'max_depth': 3, 'n_estimators': 100}

In [41]:
# Arousal: gradient boosting grid over max_depth 1..4 and n_estimators 100..500 (step 100),
# scored by R^2 on grouped 5-fold splits.
params = dict(
    max_depth=np.arange(1, 5),
    n_estimators=np.arange(100, 600, 100),
)
GKF = GroupKFold(n_splits=5)
cv = GKF.split(Xb, yb_arousal, groups)

grid = GridSearchCV(GradientBoostingRegressor(random_state=42), params, scoring='r2', cv=cv)
gbr_ar = grid.fit(Xb, yb_arousal)
gbr_ar.best_params_

{'max_depth': 1, 'n_estimators': 100}

### LGBMRegressor подбор гиперпараметров max_depth, n_estimators.

In [42]:
# Valence: LightGBM grid over max_depth 1..4 and n_estimators 100..500 (step 100),
# scored by R^2 on grouped 5-fold splits.
params = dict(
    max_depth=np.arange(1, 5),
    n_estimators=np.arange(100, 600, 100),
)
GKF = GroupKFold(n_splits=5)
cv = GKF.split(Xb, yb_valence, groups)

grid = GridSearchCV(LGBMRegressor(random_state=42), params, scoring='r2', cv=cv)
lgbmr_val = grid.fit(Xb, yb_valence)
lgbmr_val.best_params_

{'max_depth': 2, 'n_estimators': 100}

In [45]:
# Arousal: LightGBM grid over max_depth 1..4 and n_estimators 100..500 (step 100),
# scored by R^2 on grouped 5-fold splits.
params = dict(
    max_depth=np.arange(1, 5),
    n_estimators=np.arange(100, 600, 100),
)
GKF = GroupKFold(n_splits=5)
cv = GKF.split(Xb, yb_arousal, groups)

grid = GridSearchCV(LGBMRegressor(random_state=42), params, scoring='r2', cv=cv)
lgbmr_ar = grid.fit(Xb, yb_arousal)
lgbmr_ar.best_params_

{'max_depth': 1, 'n_estimators': 100}

### XGBRegressor подбор гиперпараметров max_depth, n_estimators.

In [46]:
# Valence: XGBoost grid over max_depth 1..4 and n_estimators 100..500 (step 100),
# scored by R^2 on grouped 5-fold splits.
params = dict(
    max_depth=np.arange(1, 5),
    n_estimators=np.arange(100, 600, 100),
)
GKF = GroupKFold(n_splits=5)
cv = GKF.split(Xb, yb_valence, groups)

grid = GridSearchCV(XGBRegressor(random_state=42), params, scoring='r2', cv=cv)
xgbr_val = grid.fit(Xb, yb_valence)
xgbr_val.best_params_

{'max_depth': 2, 'n_estimators': 100}

In [47]:
# Arousal: XGBoost grid over max_depth 1..4 and n_estimators 100..500 (step 100),
# scored by R^2 on grouped 5-fold splits.
params = dict(
    max_depth=np.arange(1, 5),
    n_estimators=np.arange(100, 600, 100),
)
GKF = GroupKFold(n_splits=5)
cv = GKF.split(Xb, yb_arousal, groups)

grid = GridSearchCV(XGBRegressor(random_state=42), params, scoring='r2', cv=cv)
xgbr_ar = grid.fit(Xb, yb_arousal)
xgbr_ar.best_params_

{'max_depth': 1, 'n_estimators': 100}

### CatBoostRegressor подбор гиперпараметров depth, iterations (аналоги max_depth, n_estimators).

In [72]:
# Valence: CatBoost grid over depth 1..4 and iterations 100..500 (step 100),
# scored by R^2 on grouped 5-fold splits. Training output is silenced and
# CatBoost's working files are directed to 'catboost_logging'.
params = dict(
    depth=np.arange(1, 5),
    iterations=np.arange(100, 600, 100),
)
GKF = GroupKFold(n_splits=5)
cv = GKF.split(Xb, yb_valence, groups)

grid = GridSearchCV(
    CatBoostRegressor(random_state=42, verbose=False, train_dir='catboost_logging'),
    params,
    scoring='r2',
    cv=cv,
)
gv = grid.fit(Xb, yb_valence)

In [73]:
# Arousal: CatBoost grid over depth 1..4 and iterations 100..500 (step 100),
# scored by R^2 on grouped 5-fold splits. Training output is silenced and
# CatBoost's working files are directed to 'catboost_logging'.
params = dict(
    depth=np.arange(1, 5),
    iterations=np.arange(100, 600, 100),
)
GKF = GroupKFold(n_splits=5)
cv = GKF.split(Xb, yb_arousal, groups)

grid = GridSearchCV(
    CatBoostRegressor(random_state=42, verbose=False, train_dir='catboost_logging'),
    params,
    scoring='r2',
    cv=cv,
)
ga = grid.fit(Xb, yb_arousal)

In [74]:
# Report the best CatBoost settings of the Valence (gv) and Arousal (ga) searches, one per line.
print(
    f'Параметры для Valence: {gv.best_params_}',
    f'Параметры для Arousal: {ga.best_params_}',
    sep='\n',
)

Параметры для Valence: {'depth': 2, 'iterations': 300}
Параметры для Arousal: {'depth': 1, 'iterations': 100}
