# import

In [1]:
import pandas as pd

# The notebook usually runs with the current working directory (CWD) set to
# `src`, so the parent directory (the project root) is added to sys.path to
# make the `utils` package importable.
import sys, os
from pathlib import Path

ROOT = Path.cwd().parent

# Try the parent first, then the grandparent in case the notebook lives one
# directory level deeper. The first directory containing `utils` wins.
for _candidate in (ROOT, ROOT.parent):
    if (_candidate / "utils").exists():
        _candidate_str = str(_candidate)
        # Only insert when the entry is not already first, so re-running this
        # cell does not keep stacking duplicate entries on sys.path.
        if not sys.path or sys.path[0] != _candidate_str:
            sys.path.insert(0, _candidate_str)
        break

from utils.mediapipe_util import get_landmark_data, LANDMARK_MODE

# 라벨 데이터

In [2]:
# Read the sign-code CSV into a DataFrame; the path is relative to the
# notebook's working directory.
sign_code_df = pd.read_csv("../../data/sign_code.csv")

In [3]:
sign_code_df

Unnamed: 0,label,sign_num,seq_Id,sign_text,seq_count
0,0,1,0,1,1
1,1,2,0,2,1
2,2,3,0,3,1
3,3,4,0,4,1
4,4,5,0,5,1
5,5,6,0,6,1
6,6,7,0,7,1
7,7,8,0,8,1
8,8,9,0,9,1
9,9,10,0,10,2


# 데이터 병합

## 수집한 데이터 합치기

In [4]:
# Flattened column widths: 63 and 33 match the right-hand and face column
# ranges sliced out of `x` later in this notebook.
# NOTE(review): presumably 21 hand landmarks and 11 pose/face landmarks with
# three coordinates each — confirm against the data collector.
HAND_COUNT = 21 * 3
POSE_COUNT = 11 * 3

## 헤더 추가
🚨 주의!!!! 이미 헤더가 있는 경우 또 추가될 수 있음!

In [5]:
# for i in range(7, 8):
#     data = pd.read_csv(f"../../data/sign_data/sign_data_{i}.csv")
    # data.insert(0, 'label', [i] * len(data))
    # columns = [i for i in range(len(data.columns) - 1)]
    # columns.insert(0, 'label')
    # data.columns = columns
    # data.to_csv(f"../../data/sign_data/sign_data_{i}.csv", index=False)

In [6]:
# i = 5
# data = pd.read_csv(f"../../data/sign_data/sign_data_{i}.csv", index_col=False)
# data.loc[:, 'label'] = i
# data.to_csv(f"../../data/sign_data/sign_data_{i}.csv", index=False)

## 병합

In [7]:
# Source-file labels to merge. A label's position in this list becomes the
# class index written to the `label` column (e.g. file 14 -> class 7).
SOURCE_LABELS = [0, 1, 2, 3, 4, 5, 6, 14, 15, 16, 24]
SAMPLES_PER_LABEL = 500
# Fixed seed so the same rows are drawn on every Restart-and-Run-All.
SAMPLE_SEED = 42

sampled_frames = []
for i, label in enumerate(SOURCE_LABELS):
    print(label)
    data = pd.read_csv(f"../../data/sign_data/sign_data_{label}.csv", index_col=False)
    # Replace the label stored in the file with the contiguous class index;
    # the re-added column ends up last, as before.
    data = data.drop(columns=["label"]).assign(label=i)
    # Sampling without replacement raises ValueError if a file holds fewer
    # than SAMPLES_PER_LABEL rows.
    sampled_frames.append(
        data.sample(n=SAMPLES_PER_LABEL, replace=False, random_state=SAMPLE_SEED)
    )

# Concatenate once instead of growing a DataFrame inside the loop, which
# re-copies all previous rows on every iteration.
merged_data = pd.concat(sampled_frames, ignore_index=True)

df = merged_data

0
1
2
3
4
5
6
14
15
16
24


In [8]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,150,151,152,153,154,155,156,157,158,label
0,0,0,0,0,0,0,0,0,0,0,...,0.415091,0.435806,-0.380730,0.505990,0.519000,-0.571500,0.460250,0.513547,-0.586901,0
1,0,0,0,0,0,0,0,0,0,0,...,0.428977,0.437294,-0.298019,0.530522,0.505926,-0.527120,0.478606,0.507628,-0.537512,0
2,0,0,0,0,0,0,0,0,0,0,...,0.417615,0.439402,-0.391175,0.522419,0.509165,-0.656677,0.467017,0.510635,-0.664220,0
3,0,0,0,0,0,0,0,0,0,0,...,0.408728,0.477404,-0.317197,0.512127,0.551097,-0.490731,0.455667,0.557498,-0.498746,0
4,0,0,0,0,0,0,0,0,0,0,...,0.415802,0.443756,-0.362995,0.515006,0.506650,-0.587958,0.463979,0.509138,-0.588319,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5495,0,0,0,0,0,0,0,0,0,0,...,0.368390,0.439761,-0.298937,0.482622,0.542663,-0.563170,0.419527,0.546160,-0.554129,10
5496,0,0,0,0,0,0,0,0,0,0,...,0.377199,0.459688,-0.372161,0.494812,0.547100,-0.645373,0.429071,0.547459,-0.640600,10
5497,0,0,0,0,0,0,0,0,0,0,...,0.365340,0.431392,-0.420475,0.483778,0.520700,-0.694358,0.418825,0.528974,-0.694986,10
5498,0,0,0,0,0,0,0,0,0,0,...,0.366521,0.420979,-0.312215,0.478119,0.510317,-0.575518,0.415252,0.512629,-0.569010,10


In [9]:
# merged_data.to_csv("../../data/merge_num_7_data.csv", index=False)

# 데이터 불러오기

In [10]:
# import pandas as pd
# df = pd.read_csv("../../data/merge_num_7_data.csv")

## 클래스 확인

In [11]:
df['label'].unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10], dtype=int64)

## x, y 데이터

In [12]:
# Separate the feature columns from the class-index target column.
x = df.drop(columns=["label"])
y = df["label"]

print(x.shape, y.shape)

(5500, 159) (5500,)


# 데이터 전처리

In [13]:
# Column layout of `x`: the first HAND_COUNT columns are skipped here, the
# next HAND_COUNT columns are taken as the right-hand values, and everything
# after that as the face values.
# NOTE(review): the skipped leading columns are presumably the left hand — confirm.
right_start = HAND_COUNT
face_start = 2 * HAND_COUNT

right_points_x = x.iloc[:, right_start:face_start]
face_points_x = x.iloc[:, face_start:]
# Only the first face column is kept here (a single Series, not a 3-value point).
nose_point_x = face_points_x.iloc[:, 0]
print(right_points_x.shape, face_points_x.shape, nose_point_x.shape)

# Fail fast if the input frame does not have the expected column layout.
assert face_points_x.shape[1] == POSE_COUNT, (
    f"expected {POSE_COUNT} face columns, got {face_points_x.shape[1]}"
)

(5500, 63) (5500, 33) (5500,)


In [14]:
# Convert both frames to plain Python lists once; pulling rows one at a time
# with `.iloc[i]` builds a new Series per row and is much slower.
hand_rows = right_points_x.to_numpy().tolist()
face_rows = face_points_x.to_numpy().tolist()

# One flattened feature vector per sample. The left hand is always passed
# empty, so only right-hand and face values feed the feature extractor.
result = [
    get_landmark_data(
        {"Left": [], "Right": hand_row, "Face": face_row},
        mode=LANDMARK_MODE.ANGLE_VECTOR_CURV_FACE_NOSE_WRIST,
    ).reshape(-1)
    for hand_row, face_row in zip(hand_rows, face_rows)
]

# Reuse the source index so the feature rows stay aligned with `y`.
result_x = pd.DataFrame(result, index=right_points_x.index)

In [15]:
# result_x.to_csv("../../data/angle_vector_num_7_data.csv", index=False)

## 데이터 분할하기

In [16]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples, stratified on the class label, with a fixed
# seed so the partition is the same on every run.
split_options = dict(test_size=0.2, random_state=42, shuffle=True, stratify=y)
x_train, x_test, y_train, y_test = train_test_split(result_x, y, **split_options)

## 데이터 스케일링

In [17]:
# from sklearn.preprocessing import MinMaxScaler
# mms = MinMaxScaler()
# x_train = mms.fit_transform(x_train)
# x_test = mms.transform(x_test)

## 모델 학습

In [18]:
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
import numpy as np

# XGBoost is the active classifier; the other estimators that were tried are
# kept below, commented out, for reference.
# model = lgb.LGBMClassifier(n_estimators=300, random_state=42)
# model = RandomForestClassifier(n_estimators=300, max_depth=5, random_state=42)
# model = LogisticRegression(solver='liblinear', max_iter=100)
xgb_params = dict(n_estimators=200, max_depth=3, learning_rate=0.01)
model = XGBClassifier(**xgb_params)
model.fit(x_train, y_train)

# Score on the held-out split, reported as a percentage rounded to two decimals.
score = model.score(x_test, y_test)
accuracy_pct = np.round(score * 100, 2)

print(f"Accuracy: {accuracy_pct}%")

Accuracy: 98.64%


In [19]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 on the held-out split.
y_pred = model.predict(x_test)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.98      0.99      0.99       100
           1       0.99      0.99      0.99       100
           2       0.98      0.97      0.97       100
           3       0.98      0.98      0.98       100
           4       1.00      0.99      0.99       100
           5       0.98      0.99      0.99       100
           6       0.99      0.98      0.98       100
           7       0.98      0.99      0.99       100
           8       0.98      0.99      0.99       100
           9       0.99      1.00      1.00       100
          10       1.00      0.98      0.99       100

    accuracy                           0.99      1100
   macro avg       0.99      0.99      0.99      1100
weighted avg       0.99      0.99      0.99      1100



## 모델 저장하기

In [20]:
import joblib
from pathlib import Path

MODEL_SAVE_PATH = "../../models/xgb_sample_model.pkl"

# joblib.dump does not create missing directories, so make sure the target
# folder exists (e.g. on a fresh checkout) before writing.
Path(MODEL_SAVE_PATH).parent.mkdir(parents=True, exist_ok=True)

# The original string path is passed unchanged so the returned file list
# displayed by this cell stays the same.
joblib.dump(model, MODEL_SAVE_PATH)

['../../models/xgb_sample_model.pkl']

# 모델 로드

In [None]:
# NOTE(review): this path differs from the file written by the save cell
# ("../../models/xgb_sample_model.pkl") — confirm that loading a different,
# previously saved model is intended. This also rebinds `model`, replacing
# the estimator trained above.
# joblib.load unpickles the file, which can execute arbitrary code; only load
# model files from a trusted source.
MODEL_PATH = "../../models/xgb_sample_angle_vector_model.pkl"
model = joblib.load(MODEL_PATH)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

In [22]:
# Show how many input features the current `model` expects and which class
# labels it predicts.
print(model.n_features_in_)
print(model.classes_)

141
[ 0  1  2  3  4  5  6  7  8  9 10]
