## 핸드폰 센서 데이터 비교실험 Train code

[4] Rahim, Mussadiq Abdul, et al. "Zero-to-stable driver identification: A non-intrusive and scalable driver identification scheme." IEEE transactions on vehicular technology 69.1 (2019): 163-171.

Import

In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import top_k_accuracy_score

Data load

In [10]:
df = pd.read_csv('./data.csv', encoding='cp949')
df = df.sample(frac=1, random_state=7777).reset_index(drop=True)

In [11]:
df

Unnamed: 0,t,o,s,慣,point,label,course,round
0,270.032470,3.410006,4.266776e+00,-2878.464902,0,6,2,4
1,62.439523,0.008015,6.470535e+00,-360.645781,0,2,2,3
2,175.959987,1.454038,1.059329e+01,-707.184939,0,2,3,3
3,26.872227,0.019227,9.898116e+00,38.036683,2,6,3,2
4,577.039315,4.999152,5.484925e+00,-1407.893548,7,1,1,3
...,...,...,...,...,...,...,...,...
17332,449.345610,3.802580,5.918946e+00,-1167.647386,1,9,1,2
17333,270.936105,3.401273,4.266776e+00,-2878.464902,0,6,2,4
17334,274.199130,0.020896,9.526170e+00,150.746123,6,6,1,3
17335,293.222668,2.084522,1.789522e+01,788.262174,4,7,3,1


코스별 교차검증

valid: A코스

In [12]:
df_train = df[df['course'] != 1]
df_valid = df[df['course'] == 1]

In [14]:
# 이후의 데이터 처리 코드
train_data = df_train.drop(['point','course', 'round'], axis=1)
valid_data = df_valid.drop(['point','course', 'round'], axis=1)

In [15]:
train_data.reset_index(inplace=True, drop=True)
valid_data.reset_index(inplace=True, drop=True)

In [16]:
valid_data

Unnamed: 0,t,o,s,慣,label
0,577.039315,4.999152,5.484925,-1407.893548,1
1,189.347973,0.553695,7.021633,238.273346,10
2,356.516255,3.203700,6.520950,-4511.420596,10
3,397.731524,0.954280,5.531168,540.933958,10
4,354.286424,0.471133,4.865901,-3844.728808,0
...,...,...,...,...,...
6024,370.659433,0.972817,3.341196,508.482948,10
6025,337.224241,0.453930,4.323000,-1911.145161,0
6026,237.814754,0.013026,12.176387,122.808182,6
6027,449.345610,3.802580,5.918946,-1167.647386,9


In [17]:
X_train = train_data.drop('label', axis=1)
y_train = train_data['label']

In [18]:
X_train

Unnamed: 0,t,o,s,慣
0,270.032470,3.410006,4.266776e+00,-2878.464902
1,62.439523,0.008015,6.470535e+00,-360.645781
2,175.959987,1.454038,1.059329e+01,-707.184939
3,26.872227,0.019227,9.898116e+00,38.036683
4,152.654190,0.246216,2.767122e-02,395.728020
...,...,...,...,...
11303,173.403635,0.086051,3.590693e+00,-3086.530600
11304,358.956762,0.688184,1.691951e-02,2907.628671
11305,270.936105,3.401273,4.266776e+00,-2878.464902
11306,293.222668,2.084522,1.789522e+01,788.262174


In [19]:
X_valid = valid_data.drop('label', axis=1)
y_valid = valid_data['label']

In [20]:
X_valid

Unnamed: 0,t,o,s,慣
0,577.039315,4.999152,5.484925,-1407.893548
1,189.347973,0.553695,7.021633,238.273346
2,356.516255,3.203700,6.520950,-4511.420596
3,397.731524,0.954280,5.531168,540.933958
4,354.286424,0.471133,4.865901,-3844.728808
...,...,...,...,...
6024,370.659433,0.972817,3.341196,508.482948
6025,337.224241,0.453930,4.323000,-1911.145161
6026,237.814754,0.013026,12.176387,122.808182
6027,449.345610,3.802580,5.918946,-1167.647386


In [21]:
# 결측치 drop
X_train = X_train.dropna()
y_train = y_train[X_train.index]

X_valid = X_valid.dropna()
y_valid = y_valid[X_valid.index]

Random Forest model training

In [22]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_recall_fscore_support, precision_recall_curve
from sklearn.preprocessing import label_binarize
import numpy as np

rf_model = RandomForestClassifier(n_estimators=100, max_depth=None, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_mlp = rf_model.predict(X_valid)

# 예측 확률 계산
y_pred_prob = rf_model.predict_proba(X_valid)

# 레이블에 따라 확률을 그룹화
grouped_prob = {label: [] for label in np.unique(y_train)}
for prob, label in zip(y_pred_prob, y_valid):
    grouped_prob[label].append(prob)

# TOP-3 정확도 계산 함수
def top_3_accuracy(probs, labels):
    correct = 0
    for prob, label in zip(probs, labels):
        top_3_labels = np.argsort(prob)[-3:]
        correct += label in top_3_labels
    return correct / len(labels)

# 다중 클래스 MAP 계산 함수
def mean_average_precision_multiclass(probs, y_true, n_classes):
    y_true_bin = label_binarize(y_true, classes=range(n_classes))
    average_precisions = []
    for i in range(n_classes):
        precision, recall, _ = precision_recall_curve(y_true_bin[:, i], probs[:, i])
        ap = np.trapz(recall, precision)
        average_precisions.append(ap)
    return np.mean(average_precisions)

# F1 스코어 계산 함수
def f1_score(y_true, y_pred):
    _, _, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='macro')
    return f1

# 예측을 집계하고 평가 지표를 계산하는 함수
def evaluate_performance(grouped_prob, vote_count, n_classes):
    y_true = []
    y_pred = []
    y_scores = []
    for label, probs in grouped_prob.items():
        for i in range(0, len(probs), vote_count):
            avg_prob = np.mean(probs[i:i + vote_count], axis=0)
            predicted_label = np.argmax(avg_prob)
            y_true.append(label)
            y_pred.append(predicted_label)
            y_scores.append(avg_prob)

    acc = np.mean(np.array(y_pred) == np.array(y_true))
    top_3_acc = top_3_accuracy(np.array(y_scores), np.array(y_true))
    map_score = mean_average_precision_multiclass(np.array(y_scores), np.array(y_true), n_classes)
    f1 = f1_score(np.array(y_true), np.array(y_pred))

    return acc, top_3_acc, map_score, f1

# 각 보팅 방식에 대한 평가 지표 계산
n_classes = len(np.unique(y_train))
for vote_count in [1, 3, 10, len(y_valid)]:
    acc, top_3_acc, map_score, f1 = evaluate_performance(grouped_prob, vote_count, n_classes)
    print(f"Voting {vote_count}: 정확도 = {acc}, TOP-3 정확도 = {top_3_acc}, MAP = {map_score}, F1 스코어 = {f1}")

Voting 1: 정확도 = 0.30502570907281473, TOP-3 정확도 = 0.5400563940952064, MAP = 0.20359618452824887, F1 스코어 = 0.22510218621499406
Voting 3: 정확도 = 0.26726279185295576, TOP-3 정확도 = 0.6299056135121709, MAP = 0.2353383358279764, F1 스코어 = 0.18671693639565878
Voting 10: 정확도 = 0.3021346469622332, TOP-3 정확도 = 0.6617405582922824, MAP = 0.2968342460627328, F1 스코어 = 0.19780654444137905
Voting 6029: 정확도 = 0.2727272727272727, TOP-3 정확도 = 0.5454545454545454, MAP = 0.3806818181818182, F1 스코어 = 0.19696969696969696


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


코스별 교차검증

valid: B코스

In [27]:
df = pd.read_csv('./data.csv', encoding='cp949')
df = df.sample(frac=1, random_state=7777).reset_index(drop=True)

df_train = df[df['course'] != 2]
df_valid = df[df['course'] == 2]

# 이후의 데이터 처리 코드
train_data = df_train.drop(['point','course', 'round'], axis=1)
valid_data = df_valid.drop(['point','course', 'round'], axis=1)

train_data.reset_index(inplace=True, drop=True)
valid_data.reset_index(inplace=True, drop=True)

X_train = train_data.drop('label', axis=1)
y_train = train_data['label']

X_valid = valid_data.drop('label', axis=1)
y_valid = valid_data['label']

# 결측치 drop
X_train = X_train.dropna()
y_train = y_train[X_train.index]

X_valid = X_valid.dropna()
y_valid = y_valid[X_valid.index]

In [28]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_recall_fscore_support, precision_recall_curve
from sklearn.preprocessing import label_binarize
import numpy as np

rf_model = RandomForestClassifier(n_estimators=100, max_depth=None, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_mlp = rf_model.predict(X_valid)

# 예측 확률 계산
y_pred_prob = rf_model.predict_proba(X_valid)

# 레이블에 따라 확률을 그룹화
grouped_prob = {label: [] for label in np.unique(y_train)}
for prob, label in zip(y_pred_prob, y_valid):
    grouped_prob[label].append(prob)

# TOP-3 정확도 계산 함수
def top_3_accuracy(probs, labels):
    correct = 0
    for prob, label in zip(probs, labels):
        top_3_labels = np.argsort(prob)[-3:]
        correct += label in top_3_labels
    return correct / len(labels)

# 다중 클래스 MAP 계산 함수
def mean_average_precision_multiclass(probs, y_true, n_classes):
    y_true_bin = label_binarize(y_true, classes=range(n_classes))
    average_precisions = []
    for i in range(n_classes):
        precision, recall, _ = precision_recall_curve(y_true_bin[:, i], probs[:, i])
        ap = np.trapz(recall, precision)
        average_precisions.append(ap)
    return np.mean(average_precisions)

# F1 스코어 계산 함수
def f1_score(y_true, y_pred):
    _, _, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='macro')
    return f1

# 예측을 집계하고 평가 지표를 계산하는 함수
def evaluate_performance(grouped_prob, vote_count, n_classes):
    y_true = []
    y_pred = []
    y_scores = []
    for label, probs in grouped_prob.items():
        for i in range(0, len(probs), vote_count):
            avg_prob = np.mean(probs[i:i + vote_count], axis=0)
            predicted_label = np.argmax(avg_prob)
            y_true.append(label)
            y_pred.append(predicted_label)
            y_scores.append(avg_prob)

    acc = np.mean(np.array(y_pred) == np.array(y_true))
    top_3_acc = top_3_accuracy(np.array(y_scores), np.array(y_true))
    map_score = mean_average_precision_multiclass(np.array(y_scores), np.array(y_true), n_classes)
    f1 = f1_score(np.array(y_true), np.array(y_pred))

    return acc, top_3_acc, map_score, f1

# 각 보팅 방식에 대한 평가 지표 계산
n_classes = len(np.unique(y_train))
for vote_count in [1, 3, 10, len(y_valid)]:
    acc, top_3_acc, map_score, f1 = evaluate_performance(grouped_prob, vote_count, n_classes)
    print(f"Voting {vote_count}: 정확도 = {acc}, TOP-3 정확도 = {top_3_acc}, MAP = {map_score}, F1 스코어 = {f1}")

Voting 1: 정확도 = 0.26363896029705797, TOP-3 정확도 = 0.5412739217366467, MAP = 0.18616597575378746, F1 스코어 = 0.14727474833420964
Voting 3: 정확도 = 0.3153846153846154, TOP-3 정확도 = 0.6358974358974359, MAP = 0.223480509746625, F1 스코어 = 0.16030035174363907
Voting 10: 정확도 = 0.36338028169014086, TOP-3 정확도 = 0.7098591549295775, MAP = 0.2510228415569838, F1 스코어 = 0.16957109146374894
Voting 3501: 정확도 = 0.2222222222222222, TOP-3 정확도 = 0.5555555555555556, MAP = 0.21212121212121213, F1 스코어 = 0.12121212121212123


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


코스별 교차검증

valid: C코스

In [29]:
df = pd.read_csv('./data.csv', encoding='cp949')
df = df.sample(frac=1, random_state=7777).reset_index(drop=True)

df_train = df[df['course'] != 3]
df_valid = df[df['course'] == 3]

# 이후의 데이터 처리 코드
train_data = df_train.drop(['point','course', 'round'], axis=1)
valid_data = df_valid.drop(['point','course', 'round'], axis=1)

train_data.reset_index(inplace=True, drop=True)
valid_data.reset_index(inplace=True, drop=True)

X_train = train_data.drop('label', axis=1)
y_train = train_data['label']

X_valid = valid_data.drop('label', axis=1)
y_valid = valid_data['label']

# 결측치 drop
X_train = X_train.dropna()
y_train = y_train[X_train.index]

X_valid = X_valid.dropna()
y_valid = y_valid[X_valid.index]

In [30]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_recall_fscore_support, precision_recall_curve
from sklearn.preprocessing import label_binarize
import numpy as np

rf_model = RandomForestClassifier(n_estimators=100, max_depth=None, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_mlp = rf_model.predict(X_valid)

# 예측 확률 계산
y_pred_prob = rf_model.predict_proba(X_valid)

# 레이블에 따라 확률을 그룹화
grouped_prob = {label: [] for label in np.unique(y_train)}
for prob, label in zip(y_pred_prob, y_valid):
    grouped_prob[label].append(prob)

# TOP-3 정확도 계산 함수
def top_3_accuracy(probs, labels):
    correct = 0
    for prob, label in zip(probs, labels):
        top_3_labels = np.argsort(prob)[-3:]
        correct += label in top_3_labels
    return correct / len(labels)

# 다중 클래스 MAP 계산 함수
def mean_average_precision_multiclass(probs, y_true, n_classes):
    y_true_bin = label_binarize(y_true, classes=range(n_classes))
    average_precisions = []
    for i in range(n_classes):
        precision, recall, _ = precision_recall_curve(y_true_bin[:, i], probs[:, i])
        ap = np.trapz(recall, precision)
        average_precisions.append(ap)
    return np.mean(average_precisions)

# F1 스코어 계산 함수
def f1_score(y_true, y_pred):
    _, _, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='macro')
    return f1

# 예측을 집계하고 평가 지표를 계산하는 함수
def evaluate_performance(grouped_prob, vote_count, n_classes):
    y_true = []
    y_pred = []
    y_scores = []
    for label, probs in grouped_prob.items():
        for i in range(0, len(probs), vote_count):
            avg_prob = np.mean(probs[i:i + vote_count], axis=0)
            predicted_label = np.argmax(avg_prob)
            y_true.append(label)
            y_pred.append(predicted_label)
            y_scores.append(avg_prob)

    acc = np.mean(np.array(y_pred) == np.array(y_true))
    top_3_acc = top_3_accuracy(np.array(y_scores), np.array(y_true))
    map_score = mean_average_precision_multiclass(np.array(y_scores), np.array(y_true), n_classes)
    f1 = f1_score(np.array(y_true), np.array(y_pred))

    return acc, top_3_acc, map_score, f1

# 각 보팅 방식에 대한 평가 지표 계산
n_classes = len(np.unique(y_train))
for vote_count in [1, 3, 10, len(y_valid)]:
    acc, top_3_acc, map_score, f1 = evaluate_performance(grouped_prob, vote_count, n_classes)
    print(f"Voting {vote_count}: 정확도 = {acc}, TOP-3 정확도 = {top_3_acc}, MAP = {map_score}, F1 스코어 = {f1}")

  _warn_prf(average, modifier, msg_start, len(result))


Voting 1: 정확도 = 0.18073523760727553, TOP-3 정확도 = 0.39848853592929423, MAP = 0.08376983559370667, F1 스코어 = 0.14680145051690122
Voting 3: 정확도 = 0.1925584963559647, TOP-3 정확도 = 0.4330648254698888, MAP = 0.12488395823346696, F1 스코어 = 0.11982919918122334
Voting 10: 정확도 = 0.23439490445859873, TOP-3 정확도 = 0.42420382165605097, MAP = 0.1800970693299511, F1 스코어 = 0.126020598315803
Voting 7807: 정확도 = 0.2727272727272727, TOP-3 정확도 = 0.36363636363636365, MAP = 0.19623179850452577, F1 스코어 = 0.17748917748917747


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


회차별 교차검증

valid: 1회차

In [31]:
df = pd.read_csv('./data.csv', encoding='cp949')
df = df.sample(frac=1, random_state=7777).reset_index(drop=True)

df_train = df[df['round'] != 1]
df_valid = df[df['round'] == 1]

# 이후의 데이터 처리 코드
train_data = df_train.drop(['point','course', 'round'], axis=1)
valid_data = df_valid.drop(['point','course', 'round'], axis=1)

train_data.reset_index(inplace=True, drop=True)
valid_data.reset_index(inplace=True, drop=True)

X_train = train_data.drop('label', axis=1)
y_train = train_data['label']

X_valid = valid_data.drop('label', axis=1)
y_valid = valid_data['label']

# 결측치 drop
X_train = X_train.dropna()
y_train = y_train[X_train.index]

X_valid = X_valid.dropna()
y_valid = y_valid[X_valid.index]

In [32]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_recall_fscore_support, precision_recall_curve
from sklearn.preprocessing import label_binarize
import numpy as np

rf_model = RandomForestClassifier(n_estimators=100, max_depth=None, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_mlp = rf_model.predict(X_valid)

# 예측 확률 계산
y_pred_prob = rf_model.predict_proba(X_valid)

# 레이블에 따라 확률을 그룹화
grouped_prob = {label: [] for label in np.unique(y_train)}
for prob, label in zip(y_pred_prob, y_valid):
    grouped_prob[label].append(prob)

# TOP-3 정확도 계산 함수
def top_3_accuracy(probs, labels):
    correct = 0
    for prob, label in zip(probs, labels):
        top_3_labels = np.argsort(prob)[-3:]
        correct += label in top_3_labels
    return correct / len(labels)

# 다중 클래스 MAP 계산 함수
def mean_average_precision_multiclass(probs, y_true, n_classes):
    y_true_bin = label_binarize(y_true, classes=range(n_classes))
    average_precisions = []
    for i in range(n_classes):
        precision, recall, _ = precision_recall_curve(y_true_bin[:, i], probs[:, i])
        ap = np.trapz(recall, precision)
        average_precisions.append(ap)
    return np.mean(average_precisions)

# F1 스코어 계산 함수
def f1_score(y_true, y_pred):
    _, _, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='macro')
    return f1

# 예측을 집계하고 평가 지표를 계산하는 함수
def evaluate_performance(grouped_prob, vote_count, n_classes):
    y_true = []
    y_pred = []
    y_scores = []
    for label, probs in grouped_prob.items():
        for i in range(0, len(probs), vote_count):
            avg_prob = np.mean(probs[i:i + vote_count], axis=0)
            predicted_label = np.argmax(avg_prob)
            y_true.append(label)
            y_pred.append(predicted_label)
            y_scores.append(avg_prob)

    acc = np.mean(np.array(y_pred) == np.array(y_true))
    top_3_acc = top_3_accuracy(np.array(y_scores), np.array(y_true))
    map_score = mean_average_precision_multiclass(np.array(y_scores), np.array(y_true), n_classes)
    f1 = f1_score(np.array(y_true), np.array(y_pred))

    return acc, top_3_acc, map_score, f1

# 각 보팅 방식에 대한 평가 지표 계산
n_classes = len(np.unique(y_train))
for vote_count in [1, 3, 10, len(y_valid)]:
    acc, top_3_acc, map_score, f1 = evaluate_performance(grouped_prob, vote_count, n_classes)
    print(f"Voting {vote_count}: 정확도 = {acc}, TOP-3 정확도 = {top_3_acc}, MAP = {map_score}, F1 스코어 = {f1}")

Voting 1: 정확도 = 0.1665024630541872, TOP-3 정확도 = 0.5152709359605911, MAP = 0.0851594687950567, F1 스코어 = 0.14221665428806254
Voting 3: 정확도 = 0.21296978629329402, TOP-3 정확도 = 0.5143699336772292, MAP = 0.12976657390106275, F1 스코어 = 0.16800991900433468
Voting 10: 정확도 = 0.25183374083129584, TOP-3 정확도 = 0.5452322738386308, MAP = 0.19694590979513613, F1 스코어 = 0.1899948888438043
Voting 4060: 정확도 = 0.36363636363636365, TOP-3 정확도 = 0.6363636363636364, MAP = 0.20153482880755605, F1 스코어 = 0.2484848484848485


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


회차별 교차검증

valid: 2회차

In [33]:
df = pd.read_csv('./data.csv', encoding='cp949')
df = df.sample(frac=1, random_state=7777).reset_index(drop=True)

df_train = df[df['round'] != 2]
df_valid = df[df['round'] == 2]

# 이후의 데이터 처리 코드
train_data = df_train.drop(['point','course', 'round'], axis=1)
valid_data = df_valid.drop(['point','course', 'round'], axis=1)

train_data.reset_index(inplace=True, drop=True)
valid_data.reset_index(inplace=True, drop=True)

X_train = train_data.drop('label', axis=1)
y_train = train_data['label']

X_valid = valid_data.drop('label', axis=1)
y_valid = valid_data['label']

# 결측치 drop
X_train = X_train.dropna()
y_train = y_train[X_train.index]

X_valid = X_valid.dropna()
y_valid = y_valid[X_valid.index]

In [34]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_recall_fscore_support, precision_recall_curve
from sklearn.preprocessing import label_binarize
import numpy as np

rf_model = RandomForestClassifier(n_estimators=100, max_depth=None, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_mlp = rf_model.predict(X_valid)

# 예측 확률 계산
y_pred_prob = rf_model.predict_proba(X_valid)

# 레이블에 따라 확률을 그룹화
grouped_prob = {label: [] for label in np.unique(y_train)}
for prob, label in zip(y_pred_prob, y_valid):
    grouped_prob[label].append(prob)

# TOP-3 정확도 계산 함수
def top_3_accuracy(probs, labels):
    correct = 0
    for prob, label in zip(probs, labels):
        top_3_labels = np.argsort(prob)[-3:]
        correct += label in top_3_labels
    return correct / len(labels)

# 다중 클래스 MAP 계산 함수
def mean_average_precision_multiclass(probs, y_true, n_classes):
    y_true_bin = label_binarize(y_true, classes=range(n_classes))
    average_precisions = []
    for i in range(n_classes):
        precision, recall, _ = precision_recall_curve(y_true_bin[:, i], probs[:, i])
        ap = np.trapz(recall, precision)
        average_precisions.append(ap)
    return np.mean(average_precisions)

# F1 스코어 계산 함수
def f1_score(y_true, y_pred):
    _, _, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='macro')
    return f1

# 예측을 집계하고 평가 지표를 계산하는 함수
def evaluate_performance(grouped_prob, vote_count, n_classes):
    y_true = []
    y_pred = []
    y_scores = []
    for label, probs in grouped_prob.items():
        for i in range(0, len(probs), vote_count):
            avg_prob = np.mean(probs[i:i + vote_count], axis=0)
            predicted_label = np.argmax(avg_prob)
            y_true.append(label)
            y_pred.append(predicted_label)
            y_scores.append(avg_prob)

    acc = np.mean(np.array(y_pred) == np.array(y_true))
    top_3_acc = top_3_accuracy(np.array(y_scores), np.array(y_true))
    map_score = mean_average_precision_multiclass(np.array(y_scores), np.array(y_true), n_classes)
    f1 = f1_score(np.array(y_true), np.array(y_pred))

    return acc, top_3_acc, map_score, f1

# 각 보팅 방식에 대한 평가 지표 계산
n_classes = len(np.unique(y_train))
for vote_count in [1, 3, 10, len(y_valid)]:
    acc, top_3_acc, map_score, f1 = evaluate_performance(grouped_prob, vote_count, n_classes)
    print(f"Voting {vote_count}: 정확도 = {acc}, TOP-3 정확도 = {top_3_acc}, MAP = {map_score}, F1 스코어 = {f1}")

Voting 1: 정확도 = 0.3647840531561462, TOP-3 정확도 = 0.6506644518272425, MAP = 0.19049325379791793, F1 스코어 = 0.25915511530417534
Voting 3: 정확도 = 0.4368159203980099, TOP-3 정확도 = 0.7402985074626866, MAP = 0.26930865407400073, F1 스코어 = 0.2987944491541421
Voting 10: 정확도 = 0.5313531353135313, TOP-3 정확도 = 0.7227722772277227, MAP = 0.34366459153552875, F1 스코어 = 0.3401602169742401
Voting 6020: 정확도 = 0.36363636363636365, TOP-3 정확도 = 0.6363636363636364, MAP = 0.3117621015348288, F1 스코어 = 0.303030303030303


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


회차별 교차검증

valid: 3회차

In [35]:
df = pd.read_csv('./data.csv', encoding='cp949')
df = df.sample(frac=1, random_state=7777).reset_index(drop=True)

df_train = df[df['round'] != 3]
df_valid = df[df['round'] == 3]

# 이후의 데이터 처리 코드
train_data = df_train.drop(['point','course', 'round'], axis=1)
valid_data = df_valid.drop(['point','course', 'round'], axis=1)

train_data.reset_index(inplace=True, drop=True)
valid_data.reset_index(inplace=True, drop=True)

X_train = train_data.drop('label', axis=1)
y_train = train_data['label']

X_valid = valid_data.drop('label', axis=1)
y_valid = valid_data['label']

# 결측치 drop
X_train = X_train.dropna()
y_train = y_train[X_train.index]

X_valid = X_valid.dropna()
y_valid = y_valid[X_valid.index]

In [36]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_recall_fscore_support, precision_recall_curve
from sklearn.preprocessing import label_binarize
import numpy as np

rf_model = RandomForestClassifier(n_estimators=100, max_depth=None, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_mlp = rf_model.predict(X_valid)

# 예측 확률 계산
y_pred_prob = rf_model.predict_proba(X_valid)

# 레이블에 따라 확률을 그룹화
grouped_prob = {label: [] for label in np.unique(y_train)}
for prob, label in zip(y_pred_prob, y_valid):
    grouped_prob[label].append(prob)

# TOP-3 정확도 계산 함수
def top_3_accuracy(probs, labels):
    correct = 0
    for prob, label in zip(probs, labels):
        top_3_labels = np.argsort(prob)[-3:]
        correct += label in top_3_labels
    return correct / len(labels)

# 다중 클래스 MAP 계산 함수
def mean_average_precision_multiclass(probs, y_true, n_classes):
    y_true_bin = label_binarize(y_true, classes=range(n_classes))
    average_precisions = []
    for i in range(n_classes):
        precision, recall, _ = precision_recall_curve(y_true_bin[:, i], probs[:, i])
        ap = np.trapz(recall, precision)
        average_precisions.append(ap)
    return np.mean(average_precisions)

# F1 스코어 계산 함수
def f1_score(y_true, y_pred):
    _, _, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='macro')
    return f1

# 예측을 집계하고 평가 지표를 계산하는 함수
def evaluate_performance(grouped_prob, vote_count, n_classes):
    y_true = []
    y_pred = []
    y_scores = []
    for label, probs in grouped_prob.items():
        for i in range(0, len(probs), vote_count):
            avg_prob = np.mean(probs[i:i + vote_count], axis=0)
            predicted_label = np.argmax(avg_prob)
            y_true.append(label)
            y_pred.append(predicted_label)
            y_scores.append(avg_prob)

    acc = np.mean(np.array(y_pred) == np.array(y_true))
    top_3_acc = top_3_accuracy(np.array(y_scores), np.array(y_true))
    map_score = mean_average_precision_multiclass(np.array(y_scores), np.array(y_true), n_classes)
    f1 = f1_score(np.array(y_true), np.array(y_pred))

    return acc, top_3_acc, map_score, f1

# 각 보팅 방식에 대한 평가 지표 계산
n_classes = len(np.unique(y_train))
for vote_count in [1, 3, 10, len(y_valid)]:
    acc, top_3_acc, map_score, f1 = evaluate_performance(grouped_prob, vote_count, n_classes)
    print(f"Voting {vote_count}: 정확도 = {acc}, TOP-3 정확도 = {top_3_acc}, MAP = {map_score}, F1 스코어 = {f1}")

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Voting 1: 정확도 = 0.30201342281879195, TOP-3 정확도 = 0.5234899328859061, MAP = 0.2451296058359561, F1 스코어 = 0.22253028989130721
Voting 3: 정확도 = 0.4146718146718147, TOP-3 정확도 = 0.6262548262548262, MAP = 0.35151457112663703, F1 스코어 = 0.29220181005895296
Voting 10: 정확도 = 0.4631043256997455, TOP-3 정확도 = 0.7099236641221374, MAP = 0.4621015150376536, F1 스코어 = 0.3716686310965007
Voting 3874: 정확도 = 0.4, TOP-3 정확도 = 0.7, MAP = 0.4, F1 스코어 = 0.3333333333333333


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


회차별 교차검증

valid: 4회차

In [38]:
df = pd.read_csv('./data.csv', encoding='cp949')
df = df.sample(frac=1, random_state=7777).reset_index(drop=True)

df_train = df[df['round'] != 4]
df_valid = df[df['round'] == 4]

# 이후의 데이터 처리 코드
train_data = df_train.drop(['point','course', 'round'], axis=1)
valid_data = df_valid.drop(['point','course', 'round'], axis=1)

train_data.reset_index(inplace=True, drop=True)
valid_data.reset_index(inplace=True, drop=True)

X_train = train_data.drop('label', axis=1)
y_train = train_data['label']

X_valid = valid_data.drop('label', axis=1)
y_valid = valid_data['label']

# 결측치 drop
X_train = X_train.dropna()
y_train = y_train[X_train.index]

X_valid = X_valid.dropna()
y_valid = y_valid[X_valid.index]

In [39]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_recall_fscore_support, precision_recall_curve
from sklearn.preprocessing import label_binarize
import numpy as np

rf_model = RandomForestClassifier(n_estimators=100, max_depth=None, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_mlp = rf_model.predict(X_valid)

# 예측 확률 계산
y_pred_prob = rf_model.predict_proba(X_valid)

# 레이블에 따라 확률을 그룹화
grouped_prob = {label: [] for label in np.unique(y_train)}
for prob, label in zip(y_pred_prob, y_valid):
    grouped_prob[label].append(prob)

# TOP-3 정확도 계산 함수
def top_3_accuracy(probs, labels):
    correct = 0
    for prob, label in zip(probs, labels):
        top_3_labels = np.argsort(prob)[-3:]
        correct += label in top_3_labels
    return correct / len(labels)

# 다중 클래스 MAP 계산 함수
def mean_average_precision_multiclass(probs, y_true, n_classes):
    y_true_bin = label_binarize(y_true, classes=range(n_classes))
    average_precisions = []
    for i in range(n_classes):
        precision, recall, _ = precision_recall_curve(y_true_bin[:, i], probs[:, i])
        ap = np.trapz(recall, precision)
        average_precisions.append(ap)
    return np.mean(average_precisions)

# F1 스코어 계산 함수
def f1_score(y_true, y_pred):
    _, _, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='macro')
    return f1

# 예측을 집계하고 평가 지표를 계산하는 함수
def evaluate_performance(grouped_prob, vote_count, n_classes):
    y_true = []
    y_pred = []
    y_scores = []
    for label, probs in grouped_prob.items():
        for i in range(0, len(probs), vote_count):
            avg_prob = np.mean(probs[i:i + vote_count], axis=0)
            predicted_label = np.argmax(avg_prob)
            y_true.append(label)
            y_pred.append(predicted_label)
            y_scores.append(avg_prob)

    acc = np.mean(np.array(y_pred) == np.array(y_true))
    top_3_acc = top_3_accuracy(np.array(y_scores), np.array(y_true))
    map_score = mean_average_precision_multiclass(np.array(y_scores), np.array(y_true), n_classes)
    f1 = f1_score(np.array(y_true), np.array(y_pred))

    return acc, top_3_acc, map_score, f1

# 각 보팅 방식에 대한 평가 지표 계산
n_classes = len(np.unique(y_train))
for vote_count in [1, 3, 10, len(y_valid)]:
    acc, top_3_acc, map_score, f1 = evaluate_performance(grouped_prob, vote_count, n_classes)
    print(f"Voting {vote_count}: 정확도 = {acc}, TOP-3 정확도 = {top_3_acc}, MAP = {map_score}, F1 스코어 = {f1}")

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Voting 1: 정확도 = 0.21814957138634347, TOP-3 정확도 = 0.4546260715341413, MAP = 0.16345526146912281, F1 스코어 = 0.20406069295026918
Voting 3: 정확도 = 0.20229681978798586, TOP-3 정확도 = 0.5079505300353356, MAP = 0.21867644088613758, F1 스코어 = 0.20150703908622872
Voting 10: 정확도 = 0.20760233918128654, TOP-3 정확도 = 0.5467836257309941, MAP = 0.2613266504428525, F1 스코어 = 0.2044308810999911
Voting 3383: 정확도 = 0.2, TOP-3 정확도 = 0.5, MAP = 0.3151515151515152, F1 스코어 = 0.1515151515151515


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
