In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GroupKFold, GridSearchCV
from sklearn import linear_model 
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.dummy import DummyRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
import warnings

# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides any warnings libraries raise while fitting
# models below — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [2]:
# Load the dataset; later cells read its 'label', 'batch_id', 'msec',
# 'Valence' and 'Arousal' columns.
data = pd.read_csv("data_lagged.csv")

# Разделяем данные

In [3]:
# Split at the batch level, separately for each label (1, 2, 3): 70% of the
# unique batch_ids of every label go to train, the rest to test.
# NOTE(review): rows are assigned by batch_id only, so train and test are
# disjoint only if each batch_id carries a single label — confirm.
X1 = data.loc[data["label"] == 1, "batch_id"].unique()
X2 = data.loc[data["label"] == 2, "batch_id"].unique()
X3 = data.loc[data["label"] == 3, "batch_id"].unique()

X1_train, X1_test = train_test_split(X1, train_size=0.7, random_state=42)
X2_train, X2_test = train_test_split(X2, train_size=0.7, random_state=42)
X3_train, X3_test = train_test_split(X3, train_size=0.7, random_state=42)

X_train = [*X1_train, *X2_train, *X3_train]
X_test = [*X1_test, *X2_test, *X3_test]

# Vectorised membership test: the per-row `x in list` lambda rescanned the
# whole Python list for every row of the frame.
data_train = data[data["batch_id"].isin(X_train)]
data_test = data[data["batch_id"].isin(X_test)]

In [5]:
# Group label of every training row (its batch_id); reused as the row index
# of the frames below and as the grouping key for cross-validation.
groups = np.array(data_train['batch_id'])
idx = pd.Index(groups)

# Features: every column except msec, the two targets, label and batch_id.
Xb = (
    data_train
    .drop(columns=['msec', 'Valence', 'Arousal', 'label', 'batch_id'])
    .set_index(idx)
)

# Targets are picked by column position and kept as single-column frames.
# NOTE(review): assumes position 4 is Valence and position 5 is Arousal —
# confirm against the column order of the CSV.
yb_valence = data_train.iloc[:, 4:5].set_index(idx)
yb_arousal = data_train.iloc[:, 5:6].set_index(idx)

## Подбор параметров с помощью GridSearchCV и кросс-валидации GroupKFold

### SVR подбор параметра "kernel"

In [21]:
# Valence: choose the SVR kernel with grouped 5-fold CV, scored by R^2.
GKF = GroupKFold(n_splits=5)
params = {'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}

# Give GridSearchCV the splitter itself and pass `groups` to fit(), rather
# than a one-shot generator from GKF.split(): the folds are the same, but the
# search object can be fitted more than once.
grid = GridSearchCV(estimator=SVR(),
                    param_grid=params,
                    scoring='r2',
                    cv=GKF)

g = grid.fit(Xb, yb_valence, groups=groups)
g.best_params_

{'kernel': 'linear'}

In [22]:
# Arousal: choose the SVR kernel with grouped 5-fold CV, scored by R^2.
GKF = GroupKFold(n_splits=5)
params = {'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}

# Give GridSearchCV the splitter itself and pass `groups` to fit(), rather
# than a one-shot generator from GKF.split(): the folds are the same, but the
# search object can be fitted more than once.
grid = GridSearchCV(estimator=SVR(),
                    param_grid=params,
                    scoring='r2',
                    cv=GKF)

g = grid.fit(Xb, yb_arousal, groups=groups)
g.best_params_

{'kernel': 'linear'}

### KNeighborsRegressor: подбор гиперпараметра n_neighbors

In [15]:
# Valence: tune n_neighbors (1..199) with grouped 5-fold CV, scored by R^2.
GKF = GroupKFold(n_splits=5)
params = {'n_neighbors': np.arange(1, 200)}

# Give GridSearchCV the splitter itself and pass `groups` to fit(), rather
# than a one-shot generator from GKF.split(): the folds are the same, but the
# search object can be fitted more than once.
grid = GridSearchCV(estimator=KNeighborsRegressor(),
                    param_grid=params,
                    scoring='r2',
                    cv=GKF)

g = grid.fit(Xb, yb_valence, groups=groups)
g.best_params_

{'n_neighbors': 62}

In [24]:
# Arousal: tune n_neighbors (1..199) with grouped 5-fold CV, scored by R^2.
GKF = GroupKFold(n_splits=5)
params = {'n_neighbors': np.arange(1, 200)}

# Give GridSearchCV the splitter itself and pass `groups` to fit(), rather
# than a one-shot generator from GKF.split(): the folds are the same, but the
# search object can be fitted more than once.
grid = GridSearchCV(estimator=KNeighborsRegressor(),
                    param_grid=params,
                    scoring='r2',
                    cv=GKF)

g = grid.fit(Xb, yb_arousal, groups=groups)
g.best_params_

{'n_neighbors': 94}

### DecisionTreeRegressor: подбор гиперпараметра max_depth

In [25]:
# Valence: tune the tree's max_depth (1..19) with grouped 5-fold CV, scored by R^2.
GKF = GroupKFold(n_splits=5)
params = {'max_depth': np.arange(1, 20)}

# Give GridSearchCV the splitter itself and pass `groups` to fit(), rather
# than a one-shot generator from GKF.split(): the folds are the same, but the
# search object can be fitted more than once.
grid = GridSearchCV(estimator=DecisionTreeRegressor(random_state=42),
                    param_grid=params,
                    scoring='r2',
                    cv=GKF)

g = grid.fit(Xb, yb_valence, groups=groups)
g.best_params_

{'max_depth': 1}

In [26]:
# Arousal: tune the tree's max_depth (1..19) with grouped 5-fold CV, scored by R^2.
GKF = GroupKFold(n_splits=5)
params = {'max_depth': np.arange(1, 20)}

# Give GridSearchCV the splitter itself and pass `groups` to fit(), rather
# than a one-shot generator from GKF.split(): the folds are the same, but the
# search object can be fitted more than once.
grid = GridSearchCV(estimator=DecisionTreeRegressor(random_state=42),
                    param_grid=params,
                    scoring='r2',
                    cv=GKF)

g = grid.fit(Xb, yb_arousal, groups=groups)
g.best_params_

{'max_depth': 2}

### RandomForestRegressor подбор параметров max_depth, n_estimators.

In [6]:
# Valence: tune the forest's max_depth and n_estimators (100..500, step 100)
# with grouped 5-fold CV, scored by R^2.
GKF = GroupKFold(n_splits=5)
params = {
    'max_depth': [1, 2, 3, 4, 5, 6, 7, None],
    'n_estimators': np.arange(100, 600, 100),
}

# Give GridSearchCV the splitter itself and pass `groups` to fit(), rather
# than a one-shot generator from GKF.split(): the folds are the same, but the
# search object can be fitted more than once.
grid = GridSearchCV(estimator=RandomForestRegressor(random_state=42),
                    param_grid=params,
                    scoring='r2',
                    cv=GKF)

g = grid.fit(Xb, yb_valence, groups=groups)
g.best_params_

{'max_depth': 4, 'n_estimators': 400}

In [7]:
# Arousal: tune the forest's max_depth and n_estimators (100..500, step 100)
# with grouped 5-fold CV, scored by R^2.
# NOTE(review): this depth grid stops at 5, while the Valence search goes up
# to 7 — confirm the difference is intentional.
GKF = GroupKFold(n_splits=5)
params = {
    'max_depth': [None, 1, 2, 3, 4, 5],
    'n_estimators': np.arange(100, 600, 100),
}

# Give GridSearchCV the splitter itself and pass `groups` to fit(), rather
# than a one-shot generator from GKF.split(): the folds are the same, but the
# search object can be fitted more than once.
grid = GridSearchCV(estimator=RandomForestRegressor(random_state=42),
                    param_grid=params,
                    scoring='r2',
                    cv=GKF)

g = grid.fit(Xb, yb_arousal, groups=groups)
g.best_params_

{'max_depth': 2, 'n_estimators': 400}

### GradientBoostingRegressor подбор параметров max_depth, n_estimators.

In [44]:
# Valence: tune max_depth (1..4) and n_estimators (100..500, step 100) of the
# gradient-boosting regressor with grouped 5-fold CV, scored by R^2.
GKF = GroupKFold(n_splits=5)
params = {
    'max_depth': np.arange(1, 5),
    'n_estimators': np.arange(100, 600, 100),
}

# Give GridSearchCV the splitter itself and pass `groups` to fit(), rather
# than a one-shot generator from GKF.split(): the folds are the same, but the
# search object can be fitted more than once.
grid = GridSearchCV(estimator=GradientBoostingRegressor(random_state=42),
                    param_grid=params,
                    scoring='r2',
                    cv=GKF)

g = grid.fit(Xb, yb_valence, groups=groups)
g.best_params_

{'max_depth': 1, 'n_estimators': 100}

In [45]:
# Arousal: tune max_depth (1..4) and n_estimators (100..500, step 100) of the
# gradient-boosting regressor with grouped 5-fold CV, scored by R^2.
GKF = GroupKFold(n_splits=5)
params = {
    'max_depth': np.arange(1, 5),
    'n_estimators': np.arange(100, 600, 100),
}

# Give GridSearchCV the splitter itself and pass `groups` to fit(), rather
# than a one-shot generator from GKF.split(): the folds are the same, but the
# search object can be fitted more than once.
grid = GridSearchCV(estimator=GradientBoostingRegressor(random_state=42),
                    param_grid=params,
                    scoring='r2',
                    cv=GKF)

g = grid.fit(Xb, yb_arousal, groups=groups)
g.best_params_

{'max_depth': 1, 'n_estimators': 100}

### LGBMRegressor подбор параметров max_depth, n_estimators.

In [46]:
# Valence: tune max_depth (1..4) and n_estimators (100..500, step 100) of
# LGBMRegressor with grouped 5-fold CV, scored by R^2.
GKF = GroupKFold(n_splits=5)
params = {
    'max_depth': np.arange(1, 5),
    'n_estimators': np.arange(100, 600, 100),
}

# Give GridSearchCV the splitter itself and pass `groups` to fit(), rather
# than a one-shot generator from GKF.split(): the folds are the same, but the
# search object can be fitted more than once.
grid = GridSearchCV(estimator=LGBMRegressor(random_state=42),
                    param_grid=params,
                    scoring='r2',
                    cv=GKF)

g = grid.fit(Xb, yb_valence, groups=groups)
g.best_params_

{'max_depth': 1, 'n_estimators': 100}

In [51]:
# Arousal: tune max_depth (1..4) and n_estimators (100..500, step 100) of
# LGBMRegressor with grouped 5-fold CV, scored by R^2.
GKF = GroupKFold(n_splits=5)
params = {
    'max_depth': np.arange(1, 5),
    'n_estimators': np.arange(100, 600, 100),
}

# Give GridSearchCV the splitter itself and pass `groups` to fit(), rather
# than a one-shot generator from GKF.split(): the folds are the same, but the
# search object can be fitted more than once.
grid = GridSearchCV(estimator=LGBMRegressor(random_state=42),
                    param_grid=params,
                    scoring='r2',
                    cv=GKF)

g = grid.fit(Xb, yb_arousal, groups=groups)
g.best_params_

{'max_depth': 1, 'n_estimators': 300}

### XGBRegressor подбор параметров max_depth, n_estimators.

In [48]:
# Valence: tune max_depth (1..4) and n_estimators (100..500, step 100) of
# XGBRegressor with grouped 5-fold CV, scored by R^2.
GKF = GroupKFold(n_splits=5)
params = {
    'max_depth': np.arange(1, 5),
    'n_estimators': np.arange(100, 600, 100),
}

# Give GridSearchCV the splitter itself and pass `groups` to fit(), rather
# than a one-shot generator from GKF.split(): the folds are the same, but the
# search object can be fitted more than once.
grid = GridSearchCV(estimator=XGBRegressor(random_state=42),
                    param_grid=params,
                    scoring='r2',
                    cv=GKF)

g = grid.fit(Xb, yb_valence, groups=groups)
g.best_params_

{'max_depth': 1, 'n_estimators': 100}

In [49]:
# Arousal: tune max_depth (1..4) and n_estimators (100..500, step 100) of
# XGBRegressor with grouped 5-fold CV, scored by R^2.
GKF = GroupKFold(n_splits=5)
params = {
    'max_depth': np.arange(1, 5),
    'n_estimators': np.arange(100, 600, 100),
}

# Give GridSearchCV the splitter itself and pass `groups` to fit(), rather
# than a one-shot generator from GKF.split(): the folds are the same, but the
# search object can be fitted more than once.
grid = GridSearchCV(estimator=XGBRegressor(random_state=42),
                    param_grid=params,
                    scoring='r2',
                    cv=GKF)

g = grid.fit(Xb, yb_arousal, groups=groups)
g.best_params_

{'max_depth': 1, 'n_estimators': 100}

### CatBoostRegressor: подбор параметров depth, iterations.

In [37]:
# Valence: tune CatBoost's depth (1..4) and iterations (100..500, step 100)
# with grouped 5-fold CV, scored by R^2. The fitted search is kept in `gv`.
GKF = GroupKFold(n_splits=5)
params = {
    'depth': np.arange(1, 5),
    'iterations': np.arange(100, 600, 100),
}

# Give GridSearchCV the splitter itself and pass `groups` to fit(), rather
# than a one-shot generator from GKF.split(): the folds are the same, but the
# search object can be fitted more than once.
grid = GridSearchCV(estimator=CatBoostRegressor(random_state=42,
                                                verbose=False,
                                                train_dir='catboost_with_lagged_log'),
                    param_grid=params,
                    scoring='r2',
                    cv=GKF)

gv = grid.fit(Xb, yb_valence, groups=groups)

In [38]:
# Arousal: tune CatBoost's depth (1..4) and iterations (100..500, step 100)
# with grouped 5-fold CV, scored by R^2. The fitted search is kept in `ga`.
GKF = GroupKFold(n_splits=5)
params = {
    'depth': np.arange(1, 5),
    'iterations': np.arange(100, 600, 100),
}

# Give GridSearchCV the splitter itself and pass `groups` to fit(), rather
# than a one-shot generator from GKF.split(): the folds are the same, but the
# search object can be fitted more than once.
grid = GridSearchCV(estimator=CatBoostRegressor(random_state=42,
                                                verbose=False,
                                                train_dir='catboost_with_lagged_log'),
                    param_grid=params,
                    scoring='r2',
                    cv=GKF)

ga = grid.fit(Xb, yb_arousal, groups=groups)

In [39]:
# Report the best CatBoost settings found for each target, one per line.
print(
    f'Параметры для Valence: {gv.best_params_}',
    f'Параметры для Arousal: {ga.best_params_}',
    sep='\n',
)

Параметры для Valence: {'depth': 2, 'iterations': 200}
Параметры для Arousal: {'depth': 1, 'iterations': 400}
