In [None]:
!pip install -q numpy pandas scikit-learn

In [None]:
import json, io, math
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA

In [None]:
# Synthetic data used when no real data is available (distributions are shaped to feel TLX-like)

def make_dummy(n=300, seed=7):
    """Generate a reproducible synthetic table of six workload ratings.

    Every value is clipped to the range [0, 100]. Two overlapping response
    patterns are mixed into a moderate-load baseline:

    * "time pressure up, effort up"  -> raises ``temporal`` and ``effort``
      for a random third of the rows;
    * "frustration up, performance low" -> raises ``frustration`` and lowers
      ``performance`` for another (independently drawn) random third.

    Parameters
    ----------
    n : int
        Number of rows to generate.
    seed : int
        Seed for ``np.random.default_rng``; the same seed gives the same frame.

    Returns
    -------
    pd.DataFrame
        Shape ``(n, 6)`` with columns ``mental``, ``physical``, ``temporal``,
        ``performance``, ``effort``, ``frustration``.
    """
    rng = np.random.default_rng(seed)

    # Baseline: moderate load on every axis.
    base = rng.normal(55, 12, size=(n, 6)).clip(0, 100)

    # Pattern 1: a random third of the samples get higher time pressure and effort.
    idx = rng.choice(n, size=n//3, replace=False)
    base[idx, 2] = (base[idx, 2] + rng.normal(20, 10, size=len(idx))).clip(0,100)  # temporal up
    base[idx, 4] = (base[idx, 4] + rng.normal(15, 8, size=len(idx))).clip(0,100)   # effort up

    # Pattern 2: another random third get higher frustration ...
    idx2 = rng.choice(n, size=n//3, replace=False)
    base[idx2, 5] = (base[idx2, 5] + rng.normal(20, 10, size=len(idx2))).clip(0,100) # frustration up

    # Raw performance score is centred near 60 for everybody.
    base[:, 3] = rng.normal(60, 12, size=n).clip(0,100)

    # ... and lower performance (i.e. higher OutcomePressure = 100 - performance).
    # Previously this half of pattern 2 was never applied, leaving performance
    # independent of frustration. The extra draw is made last so that all
    # values generated above are unchanged for a given seed.
    base[idx2, 3] = (base[idx2, 3] - rng.normal(15, 8, size=len(idx2))).clip(0,100)  # performance down

    df = pd.DataFrame(base, columns=['mental','physical','temporal','performance','effort','frustration'])

    return df

# If the upload cell above was skipped, df_raw is undefined: fall back to synthetic data.
try:
    df_raw
except NameError:
    df_raw = make_dummy()
df_raw.head()

Unnamed: 0,mental,physical,temporal,performance,effort,frustration
0,55.014762,58.584946,92.703594,62.06127,78.248919,43.100241
1,55.721723,71.082583,74.38314,54.574113,92.785659,59.282644
2,56.264971,43.834383,75.887253,53.841804,56.603919,70.210691
3,32.185327,39.525547,51.358384,69.395699,47.179429,58.255172
4,56.881013,52.756829,24.798883,62.13705,54.417989,69.186528


In [None]:
# 6-D -> 8-D feature construction: [six per-axis deviations, tlx_mean, stress]

def build_8d_features(df: pd.DataFrame) -> pd.DataFrame:
    """Turn the six raw rating columns into the 8-column feature table.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns ``mental``, ``physical``, ``temporal``,
        ``performance``, ``effort`` and ``frustration``.

    Returns
    -------
    pd.DataFrame
        Same index as ``df`` with columns ``d_mental``, ``d_physical``,
        ``d_temporal``, ``d_outcome``, ``d_effort``, ``d_frustration``
        (each axis minus the row mean of the six axes), ``tlx_mean`` (that row
        mean) and ``stress`` (a fixed weighted sum of three raw columns).
    """
    # The six load axes, already labelled with their final column names.
    # Performance is replaced by OutcomePressure = 100 - performance.
    axes = pd.DataFrame({
        'd_mental': df['mental'],
        'd_physical': df['physical'],
        'd_temporal': df['temporal'],
        'd_outcome': 100.0 - df['performance'],
        'd_effort': df['effort'],
        'd_frustration': df['frustration'],
    })

    # Overall load summary: row-wise mean of the six axes.
    overall = axes.mean(axis=1)

    # Deviation of each axis from the row mean (captures which axis stands out).
    feats = axes.sub(overall, axis=0)
    feats['tlx_mean'] = overall

    # Stress is computed from the raw columns, not from the deviations.
    feats['stress'] = 0.5*df['frustration'] + 0.3*df['temporal'] + 0.2*df['effort']
    return feats

# Feature table, plus an independent float ndarray copy of it.
X8_df = build_8d_features(df_raw)
X8 = X8_df.to_numpy(dtype=float, copy=True)  # (N,8)
X8_df.head()

Unnamed: 0,d_mental,d_physical,d_temporal,d_outcome,d_effort,d_frustration,tlx_mean,stress
0,-5.917104,-2.346919,31.771729,-22.993136,17.317054,-17.831624,60.931865,65.010983
1,-10.725216,4.635644,7.936201,-21.021052,26.33872,-7.164295,66.446939,70.513396
2,-1.894931,-14.325519,17.727351,-12.001706,-1.555984,12.050789,58.159902,69.192305
3,-10.999366,-3.659146,8.17369,-12.580392,3.994736,15.070479,43.184693,53.970987
4,7.563648,3.439463,-24.518482,-11.454415,5.100623,19.869162,49.317365,52.916527


In [None]:
# Standardize the features and fit a 2-component PCA

# Per-feature mean and population standard deviation
mu = np.mean(X8, axis=0)                    # (8,)
sigma = np.std(X8, axis=0, ddof=0)          # (8,)
sigma = np.where(sigma == 0, 1.0, sigma)    # constant features: divide by 1 instead of 0


# Standardized matrix Z (each feature has mean ~0 and variance ~1)
Z = np.divide(X8 - mu, sigma)               # (N,8)


# Fit the PCA
# scikit-learn: components_.shape = (2, 8)  (rows = PCs, columns = features); each PC is a unit vector
pca = PCA(n_components=2, svd_solver='full', random_state=0)
scores = pca.fit_transform(Z)               # (N,2)  = Z dot components_.T


# e.g. [0.42, 0.23] -> the first two axes explain 65% of the total variance
print("explained_variance_ratio_:", pca.explained_variance_ratio_)


# Reference vectors used to fix the direction (sign) of each principal component
# - The sign of a PCA axis is arbitrary (+/-), so it is pinned for consistent interpretation
# - PC1: should correlate positively with tlx_mean
# - PC2: should correlate positively with the "affect minus time/effort" contrast
tlx_mean_vec = X8_df['tlx_mean'].to_numpy()

# Contrast: (frustration, outcome pressure) counted positively,
# (temporal, effort) counted negatively. Terms are combined left to right.
frustration_half = 0.5*df_raw['frustration']
outcome_half = 0.5*(100.0 - df_raw['performance'])
temporal_half = 0.5*df_raw['temporal']
effort_half = 0.5*df_raw['effort']
affect_vs_time = (frustration_half + outcome_half - temporal_half - effort_half).to_numpy()

def corr(a, b):
    """Return the Pearson correlation of two 1-D arrays as a Python float.

    Both inputs are mean-centred into new arrays (the arguments are not
    modified). If either centred vector has zero norm, 0.0 is returned instead
    of dividing by zero.
    """
    a_centered = a - a.mean()
    b_centered = b - b.mean()
    scale = np.linalg.norm(a_centered) * np.linalg.norm(b_centered)
    if scale == 0:
        return 0.0
    return float(np.dot(a_centered, b_centered) / scale)


# Sign alignment: flip a component (and its scores) whenever it correlates
# negatively with its reference vector.
# - PC1 (index 0) is aligned with tlx_mean
# - PC2 (index 1) is aligned with affect_vs_time
for pc_idx, reference in ((0, tlx_mean_vec), (1, affect_vs_time)):
    if corr(scores[:, pc_idx], reference) < 0:
        pca.components_[pc_idx, :] *= -1
        scores[:, pc_idx] *= -1

# components_: (2,8) -> rows = PCs, columns = features
W_rowwise = pca.components_          # (2,8)
W = W_rowwise.transpose()            # (8,2)  <- transposed to match the Java-side JSON format (columns = PC1, PC2)

explained_variance_ratio_: [0.34669571 0.19071913]


In [None]:
# Export to JSON (the PCA parameters the server loads)
# - mu, sigma: used for standardization
# - W (8x2): multiplied with the standardized vector z to obtain (u, v)

model = {
    "mu":    mu.tolist(),
    "sigma": sigma.tolist(),
    "W":     W.tolist()
}

with open("pca_model.json", "w", encoding="utf-8") as f:
    json.dump(model, f, ensure_ascii=False, indent=2)


# Trigger a browser download only when running inside Google Colab.
# The google.colab package does not exist elsewhere, so an unconditional import
# would abort the cell after the file has already been written.
try:
    from google.colab import files
except ImportError:
    print("pca_model.json written to the working directory (not running in Colab; download skipped)")
else:
    files.download("pca_model.json")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>