In [2]:
# ============================================================================
# 1. 라이브러리 및 설정
# ============================================================================

import pandas as pd
import numpy as np
import os
import json
import math
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
from tensorflow.keras.regularizers import l2
from tensorflow.keras.layers import BatchNormalization
from time import time
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import roc_auc_score, log_loss, ndcg_score
from sklearn.model_selection import KFold
from argparse import Namespace

# 경로 설정
DATA_PATH = './processed_data/'
MODEL_SAVE_PATH = './saved_models/autoint/'

In [3]:
# ============================================================================
# 2. 전처리 함수들
# ============================================================================

def safe_divide(numerator, denominator, default=0.0):
    """안전한 나눗셈 헬퍼 함수"""
    if denominator > 0:
        return numerator / denominator
    return default

class DataPreprocessor:
    def __init__(self, chair_path, person_path, output_dir):
        self.chair_path = chair_path
        self.person_path = person_path
        self.output_dir = output_dir
        
        # 사용할 Chair 피처 정의
        self.categorical_features = [
            '헤드레스트 유무', '팔걸이 유무', '요추지지대 유무', 
            '높이 조절 레버 유무', '틸팅 여부', '등받이 곧/꺾'
        ]
        
        self.numerical_features = [
            'h8_지면-좌석 높이_MIN', 'h8_지면-좌석 높이_MAX',
            'b3_좌석 가로 길이', 't4_좌석 세로 길이 일반',
            'b4_등받이 가로 길이', 'h7_등받이 세로 길이'
        ]
        
        self.person_features = [
            'human-height', 'A_Buttock-popliteal length',
            'B_Popliteal-height', 'C_Hip-breadth',
            'F_Sitting-height', 'G_Shoulder-breadth'
        ]
        
        os.makedirs(output_dir, exist_ok=True)

    def load_data(self):
        """데이터 로드 및 기본 전처리"""
        # Chair 데이터 로드
        self.chair_df = pd.read_excel(self.chair_path, engine='openpyxl')
        
        # Person 데이터 로드 (cm -> mm 변환)
        self.person_df = pd.read_csv(self.person_path, encoding='utf-8')
        for col in self.person_features:
            if col in self.person_df.columns:
                self.person_df[col] *= 10  # cm to mm
        
        # 범주형 피처 전처리
        for col in self.categorical_features:
            if col in self.chair_df.columns:
                if col == '등받이 곧/꺾':
                    self.chair_df[col] = self.chair_df[col].map({'곧': 0, '꺾': 1})
                else:
                    self.chair_df[col] = self.chair_df[col].map({'O': 1, 'X': 0})
        
        # 결측값 처리
        self.chair_df['h8_지면-좌석 높이_MAX'] = np.where(pd.isna(self.chair_df['h8_지면-좌석 높이_MAX']),
                                                        self.chair_df['h8_지면-좌석 높이_MIN'],
                                                        self.chair_df['h8_지면-좌석 높이_MAX'])
        self.chair_df[self.categorical_features] = self.chair_df[self.categorical_features].fillna(0)
        self.chair_df[self.numerical_features] = self.chair_df[self.numerical_features].fillna(self.chair_df[self.numerical_features].mean())

    def create_feature_mappings(self):
        """피처 인덱스 매핑 생성"""
        self.feature_idx_map = {}
        idx = 1
        
        # Person 피처 (6개)
        for feat in self.person_features:
            self.feature_idx_map[f'person_{feat}'] = idx
            idx += 1
        
        # Chair 수치형 피처 (6개)
        for feat in self.numerical_features:
            self.feature_idx_map[f'chair_{feat}'] = idx
            idx += 1
        
        # 상호작용 피처 (6개)
        interaction_features = [
            'height_match_score', 'width_margin_ratio', 'depth_margin_ratio',
            'backrest_height_ratio', 'shoulder_width_ratio', 'adjustable_range'
        ]
        for feat in interaction_features:
            self.feature_idx_map[feat] = idx
            idx += 1
        
        # 이진 범주형 피처 시작 오프셋
        self.binary_offset = idx
        
        # 이진 피처 인덱스
        for i, feat in enumerate(self.categorical_features):
            self.feature_idx_map[f'{feat}_0'] = self.binary_offset + i * 2
            self.feature_idx_map[f'{feat}_1'] = self.binary_offset + i * 2 + 1

    def calculate_interaction_features(self, person_row, chair_row):
        """Person과 Chair 간의 상호작용 피처 계산"""
        features = {}
        
        h8_mid = (chair_row['h8_지면-좌석 높이_MIN'] + chair_row['h8_지면-좌석 높이_MAX']) / 2
        h8_range = chair_row['h8_지면-좌석 높이_MAX'] - chair_row['h8_지면-좌석 높이_MIN']
        popliteal_height = person_row['B_Popliteal-height']
        
        if h8_range > 0:
            if chair_row['h8_지면-좌석 높이_MIN'] <= popliteal_height <= chair_row['h8_지면-좌석 높이_MAX']:
                features['height_match_score'] = 1.0
            else:
                if popliteal_height < chair_row['h8_지면-좌석 높이_MIN']:
                    dist = chair_row['h8_지면-좌석 높이_MIN'] - popliteal_height
                else:
                    dist = popliteal_height - chair_row['h8_지면-좌석 높이_MAX']
                features['height_match_score'] = max(0, 1 - dist / 100)
        else:
            features['height_match_score'] = max(0, 1 - abs(h8_mid - popliteal_height) / 50)
        
        features['width_margin_ratio'] = safe_divide(
            chair_row['b3_좌석 가로 길이'] - person_row['C_Hip-breadth'], 
            person_row['C_Hip-breadth']
        )
        features['depth_margin_ratio'] = safe_divide(
            person_row['A_Buttock-popliteal length'] - chair_row['t4_좌석 세로 길이 일반'], 
            person_row['A_Buttock-popliteal length']
        )
        features['backrest_height_ratio'] = safe_divide(
            chair_row['h7_등받이 세로 길이'], 
            person_row['F_Sitting-height']
        )
        features['shoulder_width_ratio'] = safe_divide(
            chair_row['b4_등받이 가로 길이'], 
            person_row['G_Shoulder-breadth']
        )
        features['adjustable_range'] = h8_range
        
        return features
    
    def check_matching_conditions(self, person_row, chair_row):
        """필수 매칭 조건 확인 및 레이블 생성"""
        conditions = {
            't4 < A': chair_row['t4_좌석 세로 길이 일반'] < person_row['A_Buttock-popliteal length'],
            'h8 ≈ B': (chair_row['h8_지면-좌석 높이_MIN'] <= person_row['B_Popliteal-height'] <= chair_row['h8_지면-좌석 높이_MAX']) 
                      if chair_row['h8_지면-좌석 높이_MAX'] > chair_row['h8_지면-좌석 높이_MIN']
                      else abs((chair_row['h8_지면-좌석 높이_MIN'] + chair_row['h8_지면-좌석 높이_MAX'])/2 - person_row['B_Popliteal-height']) < 50,
            'b3 > C': chair_row['b3_좌석 가로 길이'] > person_row['C_Hip-breadth'],
            'h7 < F': chair_row['h7_등받이 세로 길이'] < person_row['F_Sitting-height'],
            'b4 ≥ G': chair_row['b4_등받이 가로 길이'] >= person_row['G_Shoulder-breadth']
        }
        
        all_satisfied = all(conditions.values())
        soft_label = sum(conditions.values()) / len(conditions)
        
        return int(all_satisfied), soft_label, conditions
    
    def process_data(self):
        """전체 데이터 처리 및 파일 생성"""
        self.load_data()
        self.create_feature_mappings()
        
        f_train_value = open(os.path.join(self.output_dir, 'train_x.txt'), 'w')
        f_train_index = open(os.path.join(self.output_dir, 'train_i.txt'), 'w')
        f_train_label = open(os.path.join(self.output_dir, 'train_y.txt'), 'w')
        
        cnt = 0
        positive_cnt = 0
        
        for _, person in self.person_df.iterrows():
            for _, chair in self.chair_df.iterrows():
                cnt += 1
                
                values = []
                indices = []
                
                # Person 수치형 피처
                for feat in self.person_features:
                    if feat in person.index:
                        values.append(str(person[feat]))
                        indices.append(str(self.feature_idx_map[f'person_{feat}']))
                
                # Chair 수치형 피처
                for feat in self.numerical_features:
                    values.append(str(chair[feat]))
                    indices.append(str(self.feature_idx_map[f'chair_{feat}']))
                
                # 상호작용 피처
                interaction_feats = self.calculate_interaction_features(person, chair)
                for feat_name, feat_value in interaction_feats.items():
                    values.append(str(feat_value))
                    indices.append(str(self.feature_idx_map[feat_name]))
                
                # 이진 범주형 피처
                for feat in self.categorical_features:
                    values.append('1')
                    feat_value = int(chair[feat]) if not pd.isna(chair[feat]) else 0
                    idx_key = f'{feat}_{feat_value}'
                    indices.append(str(self.feature_idx_map[idx_key]))
                
                # 레이블 계산
                hard_label, soft_label, conditions = self.check_matching_conditions(person, chair)
                
                f_train_value.write(' '.join(values) + '\n')
                f_train_index.write(' '.join(indices) + '\n')
                f_train_label.write(f'{soft_label:.4f}\n')
                
                if hard_label == 1:
                    positive_cnt += 1
                
                if cnt % 1000 == 0:
                    print(f'Processed {cnt} combinations...')
        
        f_train_value.close()
        f_train_index.close()
        f_train_label.close()
        
        print(f"\nTotal combinations: {cnt}")
        print(f"Positive matches: {positive_cnt} ({positive_cnt/cnt*100:.2f}%)")
        
        # 메타데이터 저장
        metadata = {
            'feature_mappings': self.feature_idx_map,
            'total_features': len(self.feature_idx_map),
            'numerical_features': self.binary_offset - 1,
            'categorical_features': self.categorical_features,
            'person_features': self.person_features,
            'chair_numerical_features': self.numerical_features
        }
        
        with open(os.path.join(self.output_dir, 'metadata.json'), 'w') as f:
            json.dump(metadata, f, indent=2, ensure_ascii=False)

In [4]:
# ============================================================================
# 3. K-fold 분할
# ============================================================================

def _load_data(_nrows=None):
    """train_x.txt와 train_y.txt 파일을 읽어 데이터를 로드"""
    train_x = pd.read_csv(DATA_PATH + 'train_x.txt', header=None, sep=' ', nrows=_nrows, dtype=np.float64)
    train_y = pd.read_csv(DATA_PATH + 'train_y.txt', header=None, sep=' ', nrows=_nrows, dtype=np.float64)
    
    train_x = train_x.values
    train_y = train_y.values.reshape([-1])
    
    print('Data loading done!')
    print('Training data: %d' % train_y.shape[0])
    
    assert train_x.shape[0] == train_y.shape[0]
    return train_x, train_y

def save_x_y(fold_index, train_x, train_y):
    """10개의 fold로 분할된 데이터를 각각 저장"""
    _get = lambda x, l: [x[i] for i in l]

    for i in range(len(fold_index)):
        print("Now part %d" % (i+1))
        part_index = fold_index[i]
        
        Xv_train_, y_train_ = _get(train_x, part_index), _get(train_y, part_index)
        
        save_dir = DATA_PATH + "part" + str(i+1) + "/"
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        
        np.save(save_dir + 'train_x.npy', Xv_train_)
        np.save(save_dir + 'train_y.npy', y_train_)

def save_i(fold_index):
    """train_i.txt 파일을 읽어 범주형 특성 인덱스를 fold별로 저장"""
    _get = lambda x, l: [x[i] for i in l]
    
    train_i = pd.read_csv(DATA_PATH + 'train_i.txt', header=None, sep=' ', dtype=np.float64)
    train_i = train_i.values
    
    feature_size = train_i.max() + 1
    print("Feature size = %d" % feature_size)
    
    np.save(DATA_PATH + "feature_size.npy", np.array([feature_size]))
    
    for i in range(len(fold_index)):
        print("Now part %d" % (i+1))
        part_index = fold_index[i]
        Xi_train_ = _get(train_i, part_index)
        np.save(DATA_PATH + "part" + str(i+1) + '/train_i.npy', Xi_train_)


In [5]:
# ============================================================================
# 4. 스케일링
# ============================================================================

def scale(x):
    """개별 수치형 피처를 스케일링하는 함수"""
    if x > 2:
        x = int(math.log(float(x))**2)
    return x

def scale_each_fold():
    """10개의 fold에 대해 각각 스케일링을 수행"""
    for i in range(1, 11):
        print('Now part %d' % i)
        
        data = np.load(DATA_PATH + 'part'+str(i)+'/train_x.npy')
        part = data[:, 0:18]  # 처음 18개 수치형 피처만
        
        for j in range(part.shape[0]):
            if j % 100000 == 0:
                print(j)
            part[j] = list(map(scale, part[j]))
        
        np.save(DATA_PATH + 'part' + str(i) + '/train_x2.npy', data)

In [6]:
# ============================================================================
# 5. DRM (Differentiable Ranking Metrics) 구현
# ============================================================================

class DifferentiableRankingLoss:
    """TensorFlow implementation of differentiable ranking metrics for top-k"""
    
    @staticmethod
    def detNeuralSort(s, tau=1.0, k=5):
        """Deterministic neural sort for ranking"""
        batch_size = tf.shape(s)[0]
        n = tf.shape(s)[1]
        
        # Expand dimensions for broadcasting
        su = tf.expand_dims(s, axis=-1)  # [batch_size, n_items, 1]
        
        # Create matrices
        one = tf.ones([n, 1], dtype=tf.float32)
        one_k = tf.ones([1, k], dtype=tf.float32)
        
        # Compute A_s = |s_i - s_j|
        A_s = tf.abs(su - tf.transpose(su, [0, 2, 1]))  # [batch_size, n, n]
        
        # Compute B
        B = tf.matmul(A_s, tf.matmul(one, one_k))  # [batch_size, n, k]
        
        # Compute scaling
        scaling = tf.cast(n + 1 - 2 * (tf.range(n) + 1), tf.float32)
        scaling = tf.expand_dims(scaling, 0)  # [1, n]
        
        # Compute C
        C = tf.expand_dims(s * scaling, -1)[:, :, :k]  # [batch_size, n, k]
        
        # Compute P_max
        P_max = tf.transpose(C - B, [0, 2, 1])  # [batch_size, k, n]
        
        # Apply softmax
        P_hat = tf.nn.softmax(P_max / tau, axis=-1)
        
        return P_hat
    
    @staticmethod
    def neuNDCGLoss(scores, labels, k=5, tau=10.0):
        """Neural NDCG Loss for ranking"""
        batch_size = tf.shape(scores)[0]
        n_items = tf.shape(scores)[1]
        
        # Create discount matrix
        discounts = 1.0 / tf.math.log(tf.range(2, k + 2, dtype=tf.float32))  # log base is e, so we use range(2, k+2)
        diag = tf.linalg.diag(discounts)
        
        # Get top-k for efficiency (사실상 전체를 다 사용하지만 구조상 필요)
        k_actual = tf.minimum(k, n_items)
        
        # Neural sort 적용
        P_hat = DifferentiableRankingLoss.detNeuralSort(scores, tau=tau, k=k_actual)
        
        # IDCG 계산 (Ideal DCG)
        sorted_labels, _ = tf.nn.top_k(labels, k=k_actual)
        ideal_dcg = tf.reduce_sum(sorted_labels * discounts[:k_actual], axis=1)
        
        # DCG 계산
        # P_hat: [batch_size, k, n_items], labels: [batch_size, n_items]
        # 각 포지션에서의 기대 레이블 값 계산
        expected_labels_at_positions = tf.matmul(P_hat, tf.expand_dims(labels, -1))  # [batch_size, k, 1]
        expected_labels_at_positions = tf.squeeze(expected_labels_at_positions, -1)  # [batch_size, k]
        
        # DCG 계산
        dcg = tf.reduce_sum(expected_labels_at_positions * discounts[:k_actual], axis=1)
        
        # NDCG 계산 (0으로 나누기 방지)
        ndcg = dcg / (ideal_dcg + 1e-10)
        
        # Loss는 -NDCG (최대화하기 위해)
        loss = -tf.reduce_mean(ndcg)
        
        return loss
    
    @staticmethod
    def neuPrecisionLoss(scores, labels, k=5, tau=10.0):
        """Neural Precision@k Loss"""
        batch_size = tf.shape(scores)[0]
        n_items = tf.shape(scores)[1]
        k_actual = tf.minimum(k, n_items)
        
        # Neural sort
        P_hat = DifferentiableRankingLoss.detNeuralSort(scores, tau=tau, k=k_actual)
        
        # Precision@k 계산
        # P_hat: [batch_size, k, n_items]에서 각 포지션의 기대 레이블 값
        expected_labels = tf.matmul(P_hat, tf.expand_dims(labels, -1))  # [batch_size, k, 1]
        expected_labels = tf.squeeze(expected_labels, -1)  # [batch_size, k]
        
        # Precision = (top-k에서 relevant한 것들의 수) / k
        precision = tf.reduce_sum(expected_labels, axis=1) / tf.cast(k_actual, tf.float32)
        
        # Loss는 -Precision
        loss = -tf.reduce_mean(precision)
        
        return loss

In [7]:
# ============================================================================
# 6. AutoInt 모델 정의 (Ranking Loss 포함)
# ============================================================================

def normalize(inputs, epsilon=1e-8):
    """Layer Normalization"""
    inputs_shape = inputs.get_shape()
    params_shape = inputs_shape[-1:]

    mean, variance = tf.nn.moments(inputs, [-1], keepdims=True)
    beta = tf.Variable(tf.zeros(params_shape))
    gamma = tf.Variable(tf.ones(params_shape))
    normalized = (inputs - mean) / ((variance + epsilon) ** (.5))
    outputs = gamma * normalized + beta

    return outputs

def multihead_attention(queries, keys, values, num_units=None, num_heads=1,
                        dropout_keep_prob=1, is_training=True, has_residual=True):
    """Multi-head Self-Attention"""
    if num_units is None:
        num_units = queries.get_shape().as_list()[-1]

    # Linear projections
    Q = tf.keras.layers.Dense(num_units, activation=tf.nn.relu)(queries)
    K = tf.keras.layers.Dense(num_units, activation=tf.nn.relu)(keys)
    V = tf.keras.layers.Dense(num_units, activation=tf.nn.relu)(values)
    if has_residual:
        V_res = tf.keras.layers.Dense(num_units, activation=tf.nn.relu)(values)

    # Split and concat
    Q_ = tf.concat(tf.split(Q, num_heads, axis=2), axis=0)
    K_ = tf.concat(tf.split(K, num_heads, axis=2), axis=0)
    V_ = tf.concat(tf.split(V, num_heads, axis=2), axis=0)

    # Multiplication
    weights = tf.matmul(Q_, tf.transpose(K_, [0, 2, 1]))
    weights = weights / (K_.get_shape().as_list()[-1] ** 0.5)
    weights = tf.nn.softmax(weights)

    # Dropouts
    weights = tf.cond(is_training,
                      lambda: tf.nn.dropout(weights, keep_prob=dropout_keep_prob),
                      lambda: weights)

    # Weighted sum
    outputs = tf.matmul(weights, V_)
    outputs = tf.concat(tf.split(outputs, num_heads, axis=0), axis=2)

    # Residual connection
    if has_residual:
        outputs += V_res

    outputs = tf.nn.relu(outputs)
    outputs = normalize(outputs)
        
    return outputs

class AutoInt():
    def __init__(self, args, feature_size, run_cnt):
        """AutoInt 모델 초기화"""
        self.feature_size = int(feature_size)
        self.field_size = args.field_size
        self.run_cnt = run_cnt
        self.embedding_size = int(args.embedding_size)
        self.blocks = args.blocks
        self.heads = args.heads
        self.block_shape = args.block_shape
        self.output_size = args.block_shape[-1]
        self.has_residual = args.has_residual
        self.deep_layers = args.deep_layers

        # 학습 관련 하이퍼파라미터
        self.batch_norm = args.batch_norm
        self.batch_norm_decay = args.batch_norm_decay
        self.drop_keep_prob = args.dropout_keep_prob
        self.l2_reg = args.l2_reg
        self.epoch = args.epoch
        self.batch_size = args.batch_size
        self.learning_rate = args.learning_rate
        self.optimizer_type = args.optimizer_type

        # 모델 저장 경로
        self.save_path = args.save_path + str(run_cnt) + '/'
        self.is_save = args.is_save
        if (args.is_save == True and os.path.exists(self.save_path) == False):
            os.makedirs(self.save_path)	

        self.verbose = args.verbose
        self.random_seed = args.random_seed
        # Ranking loss 관련 파라미터
        self.loss_type = args.loss_type
        self.ranking_k = getattr(args, 'ranking_k', 5)
        self.ranking_tau = getattr(args, 'ranking_tau', 10.0)
        self.eval_metric = roc_auc_score
        self.best_loss = 1.0
        self.greater_is_better = args.greater_is_better
        self.train_result, self.valid_result = [], []
        self.train_loss, self.valid_loss = [], []
        
        self._init_graph()

    def _init_graph(self):
        """TensorFlow 계산 그래프 초기화"""
        self.graph = tf.Graph()
        with self.graph.as_default():
            tf.set_random_seed(self.random_seed)

            # 입력 placeholder 정의
            self.feat_index = tf.placeholder(tf.int32, shape=[None, None], name="feat_index")
            self.feat_value = tf.placeholder(tf.float32, shape=[None, None], name="feat_value")
            self.label = tf.placeholder(tf.float32, shape=[None, 1], name="label")
            self.dropout_keep_prob = tf.placeholder(tf.float32, shape=[None], name="dropout_keep_prob")
            self.train_phase = tf.placeholder(tf.bool, name="train_phase")

            self.weights = self._initialize_weights()

            # 1. Embedding layer
            self.embeddings = tf.nn.embedding_lookup(self.weights["feature_embeddings"], self.feat_index)
            feat_value = tf.reshape(self.feat_value, shape=[-1, tf.shape(self.feat_index)[1], 1])
            self.embeddings = tf.multiply(self.embeddings, feat_value)
            self.embeddings = tf.nn.dropout(self.embeddings, self.dropout_keep_prob[1])
            
            # 2. DNN 부분 (선택적)
            if self.deep_layers != None:
                self.y_dense = tf.reshape(self.embeddings, shape=[-1, self.field_size * self.embedding_size])
                
                for i in range(0, len(self.deep_layers)):
                    self.y_dense = tf.add(tf.matmul(self.y_dense, self.weights["layer_%d" %i]), self.weights["bias_%d"%i])
                    if self.batch_norm:
                        bn_layer = BatchNormalization(momentum=self.batch_norm_decay, epsilon=1e-5, center=True, scale=True, name="bn_%d" % i)
                        self.y_dense = bn_layer(self.y_dense, training=self.train_phase)
                    self.y_dense = tf.nn.relu(self.y_dense)
                    self.y_dense = tf.nn.dropout(self.y_dense, self.dropout_keep_prob[2])
                    
                self.y_dense = tf.add(tf.matmul(self.y_dense, self.weights["prediction_dense"]),
                                      self.weights["prediction_bias_dense"], name='logits_dense')
            
            # 3. AutoInt 핵심 부분: Multi-head Self-Attention
            self.y_deep = self.embeddings
            for i in range(self.blocks):   
                self.y_deep = multihead_attention(queries=self.y_deep, keys=self.y_deep, values=self.y_deep,
                                                  num_units=self.block_shape[i], num_heads=self.heads,
                                                  dropout_keep_prob=self.dropout_keep_prob[0],
                                                  is_training=self.train_phase, has_residual=self.has_residual)

            # Flatten
            self.flat = tf.reshape(self.y_deep, [tf.shape(self.y_deep)[0], -1])

            # 최종 예측
            self.y = tf.add(tf.matmul(self.flat, self.weights['prediction']), self.weights['prediction_bias'], name='logits')
            self.out = tf.nn.sigmoid(self.y)

            # DNN과 AutoInt 결합 (선택적)
            if self.deep_layers != None:
                self.out = tf.nn.sigmoid(self.y + self.y_dense)
            else:
                self.out = tf.nn.sigmoid(self.y)
        
            # ============ Ranking 손실 함수 정의 ============
            if self.loss_type == "ranking_ndcg":
                # 배치를 재구성: 각 사람당 여러 의자들을 하나의 랭킹 문제로 처리
                # 현재 구조에서는 배치 내의 모든 아이템을 하나의 랭킹으로 처리
                self.loss = DifferentiableRankingLoss.neuNDCGLoss(
                    scores=self.out, 
                    labels=self.label, 
                    k=self.ranking_k,
                    tau=self.ranking_tau
                )
            elif self.loss_type == "ranking_precision":
                self.loss = DifferentiableRankingLoss.neuPrecisionLoss(
                    scores=self.out,
                    labels=self.label,
                    k=self.ranking_k, 
                    tau=self.ranking_tau
                )
            elif self.loss_type == "logloss":
                # 기본 logloss (비교용)
                self.loss = tf.losses.log_loss(self.label, self.out)
            elif self.loss_type == "mse":
                self.loss = tf.nn.l2_loss(tf.subtract(self.label, self.out))
            else:
                # 기본값: soft ranking loss (weighted binary cross entropy)
                epsilon = 1e-7
                self.loss = tf.reduce_mean(
                    -self.label * tf.log(self.out + epsilon) - 
                    (1 - self.label) * tf.log(1 - self.out + epsilon)
                )

            # L2 정규화
            if self.l2_reg > 0:
                reg_weights = [weight for name, weight in self.weights.items() if 'bias' not in name]
                if reg_weights:
                    self.loss += tf.add_n([tf.nn.l2_loss(w) for w in reg_weights]) * self.l2_reg
           
            # Optimizer 설정
            self.global_step = tf.Variable(0, name="global_step", trainable=False)
            
            if self.optimizer_type == "adam":
                self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-8).minimize(self.loss, global_step=self.global_step)
            elif self.optimizer_type == "adagrad":
                self.optimizer = tf.train.AdagradOptimizer(learning_rate=self.learning_rate, initial_accumulator_value=1e-8).minimize(self.loss, global_step=self.global_step)
            elif self.optimizer_type == "gd":
                self.optimizer = tf.train.GradientDescentOptimizer(learning_rate=self.learning_rate).minimize(self.loss, global_step=self.global_step)

            # 초기화
            self.saver = tf.train.Saver(max_to_keep=5)
            init = tf.global_variables_initializer()
            self.sess = self._init_session()
            self.sess.run(init)
            self.count_param()

    def count_param(self):
        """모델의 전체 파라미터 개수 계산"""
        k = np.sum([np.prod(v.get_shape().as_list()) for v in tf.trainable_variables()])
        print("Total parameters: %d" % k) 
        print("Extra parameters: %d" % (k - self.feature_size * self.embedding_size))

    def _init_session(self):
        """TensorFlow 세션 초기화"""
        config = tf.ConfigProto(allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        return tf.Session(config=config)

    def _initialize_weights(self):
        """모델 가중치 초기화"""
        weights = dict()

        # 특성 임베딩 테이블
        weights["feature_embeddings"] = tf.Variable(
            tf.random_normal([self.feature_size, self.embedding_size], 0.0, 0.01),
            name="feature_embeddings")

        if self.blocks > 0:
            final_attention_size = self.block_shape[-1] * self.field_size
            input_size = final_attention_size
        else:
            input_size = self.embedding_size * self.field_size

        # DNN 레이어 가중치 (선택적)
        if self.deep_layers != None:
            num_layer = len(self.deep_layers)
            layer0_size = self.field_size * self.embedding_size
            glorot = np.sqrt(2.0 / (layer0_size + self.deep_layers[0]))
            weights["layer_0"] = tf.Variable(
                np.random.normal(loc=0, scale=glorot, size=(layer0_size, self.deep_layers[0])), dtype=np.float32)
            weights["bias_0"] = tf.Variable(np.random.normal(loc=0, scale=glorot, size=(1, self.deep_layers[0])), dtype=np.float32)
            
            for i in range(1, num_layer):
                glorot = np.sqrt(2.0 / (self.deep_layers[i-1] + self.deep_layers[i]))
                weights["layer_%d" % i] = tf.Variable(
                    np.random.normal(loc=0, scale=glorot, size=(self.deep_layers[i-1], self.deep_layers[i])), dtype=np.float32)
                weights["bias_%d" % i] = tf.Variable(
                    np.random.normal(loc=0, scale=glorot, size=(1, self.deep_layers[i])), dtype=np.float32)
            
            glorot = np.sqrt(2.0 / (self.deep_layers[-1] + 1))
            weights["prediction_dense"] = tf.Variable(
                                np.random.normal(loc=0, scale=glorot, size=(self.deep_layers[-1], 1)), dtype=np.float32, name="prediction_dense")
            weights["prediction_bias_dense"] = tf.Variable(np.random.normal(), dtype=np.float32, name="prediction_bias_dense")

        # AutoInt 최종 예측층 가중치
        glorot = np.sqrt(2.0 / (input_size + 1))
        weights["prediction"] = tf.Variable(
                            np.random.normal(loc=0, scale=glorot, size=(input_size, 1)), dtype=np.float32, name="prediction")
        weights["prediction_bias"] = tf.Variable(np.random.normal(), dtype=np.float32, name="prediction_bias")

        return weights

    def get_batch(self, Xi, Xv, y, batch_size, index):
        """배치 데이터 추출"""
        start = index * batch_size
        end = (index+1) * batch_size
        end = end if end < len(y) else len(y)
        return Xi[start:end], Xv[start:end], [[y_] for y_ in y[start:end]]

    def shuffle_in_unison_scary(self, a, b, c):
        """세 개의 배열을 동일한 순서로 셔플"""
        rng_state = np.random.get_state()
        np.random.shuffle(a)
        np.random.set_state(rng_state)
        np.random.shuffle(b)
        np.random.set_state(rng_state)
        np.random.shuffle(c)

    def fit_on_batch(self, Xi, Xv, y):
        """한 배치에 대한 학습 수행"""
        feed_dict = {self.feat_index: Xi, self.feat_value: Xv, self.label: y,
                     self.dropout_keep_prob: self.drop_keep_prob, self.train_phase: True}
        step, loss, opt = self.sess.run((self.global_step, self.loss, self.optimizer), feed_dict=feed_dict)
        return step, loss

    def fit_once(self, Xi_train, Xv_train, y_train, epoch, file_count, Xi_valid=None, Xv_valid=None, y_valid=None, early_stopping=False):
        """하나의 데이터 파일에 대한 전체 학습 수행"""
        has_valid = Xv_valid is not None
        last_step = 0
        t1 = time()
        
        self.shuffle_in_unison_scary(Xi_train, Xv_train, y_train)
        total_batch = int(len(y_train) / self.batch_size)
        
        for i in range(total_batch):
            Xi_batch, Xv_batch, y_batch = self.get_batch(Xi_train, Xv_train, y_train, self.batch_size, i)
            step, loss = self.fit_on_batch(Xi_batch, Xv_batch, y_batch)
            last_step = step

        # 학습 데이터 평가
        train_result, train_loss = self.evaluate(Xi_train, Xv_train, y_train)
        self.train_result.append(train_result)
        self.train_loss.append(train_loss)

        # 검증 데이터 평가
        if has_valid:
            valid_result, valid_loss = self.evaluate(Xi_valid, Xv_valid, y_valid)
            self.valid_result.append(valid_result)
            self.valid_loss.append(valid_loss)

            # 최적 모델 저장
            if valid_loss < self.best_loss and self.is_save == True:
                old_loss = self.best_loss
                self.best_loss = valid_loss
                self.saver.save(self.sess, self.save_path + 'model.ckpt', global_step=last_step)
                print("[%d-%d] Model saved! Valid loss improved from %.4f to %.4f" % (epoch, file_count, old_loss, self.best_loss))

        # 학습 결과 출력
        if self.verbose > 0 and ((epoch-1)*9 + file_count) % self.verbose == 0:
            if has_valid:
                print("[%d-%d] train-result=%.4f, train-loss=%.4f, valid-result=%.4f, valid-loss=%.4f [%.1f s]" % 
                      (epoch, file_count, train_result, train_loss, valid_result, valid_loss, time() - t1))
            else:
                print("[%d-%d] train-result=%.4f [%.1f s]" % (epoch, file_count, train_result, time() - t1))
                
        return True

    def predict(self, Xi, Xv):
        """예측 수행"""
        dummy_y = [1] * len(Xi)
        batch_index = 0
        Xi_batch, Xv_batch, y_batch = self.get_batch(Xi, Xv, dummy_y, self.batch_size, batch_index)
        y_pred = None

        while len(Xi_batch) > 0:
            num_batch = len(y_batch)
            feed_dict = {self.feat_index: Xi_batch, self.feat_value: Xv_batch, self.label: y_batch,
                         self.dropout_keep_prob: [1.0] * len(self.drop_keep_prob), self.train_phase: False}
            batch_out = self.sess.run(self.out, feed_dict=feed_dict)

            if batch_index == 0:
                y_pred = np.reshape(batch_out, (num_batch,))
            else:
                y_pred = np.concatenate((y_pred, np.reshape(batch_out, (num_batch,))))

            batch_index += 1
            Xi_batch, Xv_batch, y_batch = self.get_batch(Xi, Xv, dummy_y, self.batch_size, batch_index)

        return y_pred

    def evaluate(self, Xi, Xv, y):
        """모델 평가 - Top-k 지표 중심"""
        y_pred = self.predict(Xi, Xv)
        y_pred = np.clip(y_pred, 1e-6, 1-1e-6)

        # Top-k 평가 지표들
        results = {}
        
        # NDCG@5, NDCG@10 계산
        for k in [5, 10, 20]:
            try:
                if len(y) >= k:
                    ndcg_k = ndcg_score(y.reshape(1, -1), y_pred.reshape(1, -1), k=k)
                    results[f'ndcg@{k}'] = ndcg_k
                else:
                    results[f'ndcg@{k}'] = 0.0
            except:
                results[f'ndcg@{k}'] = 0.0
        
        # Precision@k 계산
        for k in [5, 10, 20]:
            if len(y) >= k:
                # Top-k 예측의 인덱스
                top_k_indices = np.argsort(y_pred)[-k:]
                # Top-k에서 relevant items 수 (soft label > 0.5)
                relevant_in_topk = np.sum(y[top_k_indices] > 0.5)
                precision_k = relevant_in_topk / k
                results[f'precision@{k}'] = precision_k
            else:
                results[f'precision@{k}'] = 0.0
        
        # Recall@k 계산
        total_relevant = np.sum(y > 0.5)
        for k in [5, 10, 20]:
            if len(y) >= k and total_relevant > 0:
                top_k_indices = np.argsort(y_pred)[-k:]
                relevant_in_topk = np.sum(y[top_k_indices] > 0.5)
                recall_k = relevant_in_topk / total_relevant
                results[f'recall@{k}'] = recall_k
            else:
                results[f'recall@{k}'] = 0.0
        
        # 메인 메트릭으로 NDCG@10 사용
        main_metric = results.get('ndcg@10', 0.0)
        
        # 손실 계산 (ranking loss 근사)
        epsilon = 1e-7
        y_pred_clipped = np.clip(y_pred, epsilon, 1 - epsilon)
        loss = -np.mean(y * np.log(y_pred_clipped) + (1 - y) * np.log(1 - y_pred_clipped))
        
        # 상세 결과 출력
        print(f"Evaluation Results:")
        for metric, value in results.items():
            print(f"  {metric}: {value:.4f}")
        
        return main_metric, loss

    def restore(self, save_path=None):
        """저장된 모델 복원"""
        if save_path == None:
            save_path = self.save_path
        ckpt = tf.train.get_checkpoint_state(save_path)  
        if ckpt and ckpt.model_checkpoint_path:  
            self.saver.restore(self.sess, ckpt.model_checkpoint_path) 
            if self.verbose > 0:
                print("Restored from %s" % save_path)


In [8]:
# ============================================================================
# 7. 실행 코드
# ============================================================================

# 설정 정의 - Ranking Loss 사용
args = Namespace(
    is_save=True,
    greater_is_better=True,
    has_residual=True,
    blocks=2,
    block_shape=[16, 16],
    heads=2,
    embedding_size=16,
    dropout_keep_prob=[0.8, 0.8, 0.5],  # 드롭아웃 약간 적용
    epoch=5,  # 조금 더 많은 에폭
    batch_size=512,  # 배치 크기 줄임 (ranking loss가 메모리를 더 사용)
    learning_rate=0.001,
    optimizer_type='adam',
    l2_reg=0.001,
    random_seed=42,
    save_path=MODEL_SAVE_PATH,
    field_size=24,
    
    # ⭐ Top-k Ranking Loss 설정
    loss_type='ranking_ndcg',      # 'ranking_ndcg' 또는 'ranking_precision'
    ranking_k=10,                  # Top-10으로 평가
    ranking_tau=5.0,               # Neural sort temperature (낮을수록 sharp)
    
    verbose=1,
    run_times=1,
    deep_layers=None,              # 또는 [128, 64] 등 DNN 추가 가능
    batch_norm=0,
    batch_norm_decay=0.995
)

print("=== Chair Recommendation AutoInt Model Training ===")
print("🎯 Top-k Ranking 최적화 모델 훈련")
print("이 노트북은 전체 파이프라인을 실행합니다:")
print("1. 데이터 전처리")
print("2. K-fold 분할") 
print("3. 스케일링")
print("4. AutoInt + DRM 모델 훈련 (Top-k 최적화)")
print("5. Top-k 지표 평가 (NDCG@k, Precision@k, Recall@k)")
print("6. 모델 저장")
print()
print(f"📊 현재 설정:")
print(f"   Loss Function: {args.loss_type}")
print(f"   Ranking K: {args.ranking_k}")
print(f"   Temperature: {args.ranking_tau}")
print(f"   Batch Size: {args.batch_size} (ranking loss 최적화)")
print()

# 단계별 실행
run_preprocessing = True  # 전처리 실행 여부
run_training = True       # 훈련 실행 여부

if run_preprocessing:
    print("=== Step 1: 데이터 전처리 ===")
    # 실제 파일 경로로 수정 필요
    preprocessor = DataPreprocessor(
        chair_path='chair_dummy.xlsx',  # 실제 의자 데이터 파일
        person_path='person.csv',       # 실제 사람 데이터 파일
        output_dir=DATA_PATH
    )
    preprocessor.process_data()
    print("전처리 완료!")
    print()

    print("=== Step 2: K-fold 분할 ===")
    train_x, train_y = _load_data()
    folds = list(KFold(n_splits=10, shuffle=True, random_state=args.random_seed).split(train_x))
    fold_index = [valid_id for train_id, valid_id in folds]
    
    np.save(DATA_PATH + "fold_index.npy", np.array(fold_index))
    save_x_y(fold_index, train_x, train_y)
    save_i(fold_index)
    print("K-fold 분할 완료!")
    print()

    print("=== Step 3: 스케일링 ===")
    scale_each_fold()
    print("스케일링 완료!")
    print()

if run_training:
    print("=== Step 4: AutoInt + DRM 모델 훈련 ===")
    print(f"🎯 Ranking Loss: {args.loss_type}")
    print(f"📊 Top-{args.ranking_k} 최적화")
    print()
    
    # 특성 크기 로드
    feature_size = np.load(DATA_PATH + '/feature_size.npy')[0]
    print(f"Feature size: {feature_size}")
    
    # 검증 데이터 로드
    Xi_valid = np.load(DATA_PATH + '/part2/train_i.npy')
    Xv_valid = np.load(DATA_PATH + '/part2/train_x2.npy')
    y_valid = np.load(DATA_PATH + '/part2/train_y.npy')
    
    # 실제 field_size 설정
    args.field_size = Xi_valid.shape[1]
    print(f"Field size: {args.field_size}")
    
    # 모델 생성
    model = AutoInt(args=args, feature_size=feature_size, run_cnt=1)
    
    # 훈련 시작
    for k in range(args.epoch):
        print(f"\n=== Epoch {k+1}/{args.epoch} ===")
        
        for j in range(3, 6):  # part3-5만 사용 (빠른 훈련을 위해)
            print(f"Training on part {j}...")
            
            Xi_train = np.load(DATA_PATH + f'/part{j}/train_i.npy')
            Xv_train = np.load(DATA_PATH + f'/part{j}/train_x2.npy')
            y_train = np.load(DATA_PATH + f'/part{j}/train_y.npy')
            
            model.fit_once(Xi_train, Xv_train, y_train, k+1, j-2,
                          Xi_valid, Xv_valid, y_valid, early_stopping=True)
    
    print("\n=== 🎉 훈련 완료! ===")
    print(f"🎯 Top-k 최적화 모델이 저장되었습니다: {MODEL_SAVE_PATH}1/")
    print(f"📊 사용된 지표: NDCG@{args.ranking_k}, Precision@{args.ranking_k}, Recall@{args.ranking_k}")
    print()
    
    # 최종 검증
    if os.path.exists(MODEL_SAVE_PATH + '1/checkpoint'):
        print("✅ 체크포인트 파일 확인 완료")
        print("🚀 웹 서비스에서 Top-k 추천이 가능합니다!")
        print()
        print("📈 기대 효과:")
        print("   - 상위 랭킹 정확도 향상")
        print("   - 사용자 맞춤 추천 품질 개선") 
        print("   - NDCG, Precision, Recall 지표 최적화")
    else:
        print("❌ 체크포인트 파일을 찾을 수 없습니다")

print("\n=== 🏁 완료 ===")
print("🎯 Top-k 최적화 AutoInt 모델 훈련이 완료되었습니다!")
print()
print("💡 이제 웹 서비스에서 다음 설정을 사용하세요:")
print(f"   model_path = '{MODEL_SAVE_PATH}1/'")
print(f"   ranking_optimized = True")
print(f"   top_k_performance = 'NDCG@{args.ranking_k} optimized'")
print()
print("🚀 추천 시스템 성능:")
print("   ✅ 상위 랭킹 정확도 극대화")
print("   ✅ 개인화 추천 품질 향상")
print("   ✅ Top-k 지표 최적화 완료")
print("   ✅ 실시간 웹 서비스 준비 완료")

=== Chair Recommendation AutoInt Model Training ===
🎯 Top-k Ranking 최적화 모델 훈련
이 노트북은 전체 파이프라인을 실행합니다:
1. 데이터 전처리
2. K-fold 분할
3. 스케일링
4. AutoInt + DRM 모델 훈련 (Top-k 최적화)
5. Top-k 지표 평가 (NDCG@k, Precision@k, Recall@k)
6. 모델 저장

📊 현재 설정:
   Loss Function: ranking_ndcg
   Ranking K: 10
   Temperature: 5.0
   Batch Size: 512 (ranking loss 최적화)

=== Step 1: 데이터 전처리 ===
Processed 1000 combinations...
Processed 2000 combinations...
Processed 3000 combinations...
Processed 4000 combinations...
Processed 5000 combinations...
Processed 6000 combinations...
Processed 7000 combinations...
Processed 8000 combinations...
Processed 9000 combinations...
Processed 10000 combinations...
Processed 11000 combinations...
Processed 12000 combinations...
Processed 13000 combinations...
Processed 14000 combinations...
Processed 15000 combinations...
Processed 16000 combinations...
Processed 17000 combinations...
Processed 18000 combinations...
Processed 19000 combinations...
Processed 20000 combinations...
Pro

I0000 00:00:1749524816.688886 17326308 mlir_graph_optimization_pass.cc:425] MLIR V1 optimization pass is not enabled


Total parameters: 3121
Extra parameters: 2625

=== Epoch 1/5 ===
Training on part 3...
Evaluation Results:
  ndcg@5: 0.5789
  ndcg@10: 0.5789
  ndcg@20: 0.5789
  precision@5: 0.8000
  precision@10: 0.7000
  precision@20: 0.8000
  recall@5: 0.0000
  recall@10: 0.0000
  recall@20: 0.0000
Evaluation Results:
  ndcg@5: 0.5790
  ndcg@10: 0.5790
  ndcg@20: 0.5790
  precision@5: 0.6000
  precision@10: 0.4000
  precision@20: 0.3000
  recall@5: 0.0000
  recall@10: 0.0000
  recall@20: 0.0000
[1-1] Model saved! Valid loss improved from 1.0000 to 0.7358
[1-1] train-result=0.5789, train-loss=0.7358, valid-result=0.5790, valid-loss=0.7358 [123.4 s]
Training on part 4...
Evaluation Results:
  ndcg@5: 0.5791
  ndcg@10: 0.5791
  ndcg@20: 0.5791
  precision@5: 0.8000
  precision@10: 0.9000
  precision@20: 0.7000
  recall@5: 0.0000
  recall@10: 0.0000
  recall@20: 0.0000
Evaluation Results:
  ndcg@5: 0.5790
  ndcg@10: 0.5790
  ndcg@20: 0.5790
  precision@5: 0.6000
  precision@10: 0.4000
  precision@20: 0