In [1]:
from pathlib import Path

import pandas as pd

In [2]:
from xgboost import XGBClassifier, plot_importance
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import f1_score, make_scorer
from sklearn.decomposition import PCA

In [4]:
%matplotlib inline
from matplotlib import pyplot as plt

In [5]:
# Root directory of the raw input CSV files, relative to the notebook location.
path_data_input = Path('..') / 'data' / 'input'

In [6]:
def load_data(path_data):
    """Read every ``*.csv`` file directly inside ``path_data`` into a DataFrame.

    Each file is parsed without a header row and with its first three lines
    skipped. Progress is printed as ``<counter> <path>`` for every file.

    Parameters
    ----------
    path_data : str or pathlib.Path
        Directory to scan (non-recursively) for ``*.csv`` files.

    Returns
    -------
    dict
        Maps the file name without its final extension to the DataFrame read
        from that file. Empty when the directory contains no CSV files.
    """
    data_dct = {}

    # glob() yields entries in a filesystem-dependent order; sorting keeps the
    # dict order (and the row order of a later pd.concat) reproducible.
    for i, path_file in enumerate(sorted(Path(path_data).glob('*.csv')), 1):
        print(i, path_file)
        # .stem drops only the trailing suffix, unlike str.replace, which
        # would also remove any '.csv' occurring in the middle of the name.
        data_dct[path_file.stem] = pd.read_csv(path_file, header=None, skiprows=3)
    return data_dct

In [7]:
# gestures = load_data(path_data_input/'gestures')
# gestures = pd.concat(gestures)

# print(gestures.shape)
# gestures.head()

In [8]:
# Load one DataFrame per CSV file and stack them; the dict keys (file names)
# become the outer level of the resulting row index.
fingers_by_file = load_data(path_data_input / 'fingers')
fingers = pd.concat(fingers_by_file)

# Every column except the last is named x_1..x_n; the last column is named y
# and is used as the target further down.
n_features = fingers.shape[1] - 1
fingers.columns = [f'x_{idx}' for idx in range(1, n_features + 1)] + ['y']

print(fingers.shape)
fingers.head()

1 ../data/input/fingers/Gesture_11_finger.csv
2 ../data/input/fingers/Gesture_12_finger.csv
3 ../data/input/fingers/Gesture_13_finger.csv
4 ../data/input/fingers/Gesture_14_finger.csv
5 ../data/input/fingers/Gesture_15_finger.csv
(24818, 9)


Unnamed: 0,Unnamed: 1,x_1,x_2,x_3,x_4,x_5,x_6,x_7,x_8,y
Gesture_11_finger,0,501,570,428,196,367,278,214,189,0
Gesture_11_finger,1,500,569,429,196,368,279,215,189,0
Gesture_11_finger,2,500,568,426,196,368,278,214,189,0
Gesture_11_finger,3,499,568,428,195,367,278,213,188,0
Gesture_11_finger,4,500,570,427,194,366,277,213,188,0


In [9]:
# Map the class labels onto the values {0, 1, ..., 5} by keeping only the last digit.
fingers['y'] = fingers['y'].mod(10)

In [10]:
# Number of rows per class label, most frequent class first.
label_counts = fingers['y'].value_counts()
label_counts

0    12596
5     2520
2     2520
4     2394
3     2394
1     2394
Name: y, dtype: int64

In [11]:
# Target is the label column; the feature matrix is everything else.
y = fingers['y']
X = fingers.drop(columns='y')

In [12]:
# Hold out 20% of the rows for testing, stratified on the label so both parts
# keep the same class proportions; the fixed seed makes the split repeatable.
# NOTE(review): the head() preview shows neighbouring rows with almost equal
# values — a shuffled row-level split may put near-duplicates on both sides.
# Confirm whether a split by recording/segment is needed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y,
)

In [13]:
# Sanity check of the stratification: class shares in train and test, side by side.
class_shares = [labels.value_counts(normalize=True) for labels in (y_train, y_test)]
pd.concat(class_shares, axis=1)

Unnamed: 0,y,y.1
0,0.507555,0.507454
1,0.096454,0.096495
2,0.101541,0.101531
3,0.096454,0.096495
4,0.096454,0.096495
5,0.101541,0.101531


In [14]:
%%time
# Baseline model: only eval_metric is set explicitly, every other
# hyper-parameter keeps the library default. The fitted estimator is the last
# expression so that its full parameter repr is displayed under the cell.
xgb = XGBClassifier(eval_metric='mlogloss') # alternative not used here: objective='multi:softmax'
xgb.fit(X_train, y_train)



CPU times: user 10min 22s, sys: 33.1 s, total: 10min 55s
Wall time: 54.7 s


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='mlogloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=12,
              num_parallel_tree=1, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [15]:
# One F1 scorer per averaging mode. average=None yields one score per class
# (an array); the other three modes reduce to a single number.
f1_score_none, f1_score_micro, f1_score_macro, f1_score_weighted = (
    make_scorer(f1_score, average=averaging)
    for averaging in (None, 'micro', 'macro', 'weighted')
)

In [16]:
# Weighted F1 of the fitted model on the held-out test part.
y_test_pred = xgb.predict(X_test)
f1_score(y_test, y_test_pred, average='weighted')

0.9370809629009843

In [17]:
# Mean weighted F1 over a shuffled, stratified 5-fold cross-validation on the
# full data set; the fixed seed keeps the fold assignment repeatable.
cv_splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(xgb, X, y, scoring=f1_score_weighted, cv=cv_splitter)
cv_scores.mean()



0.9401387055407071