In [1]:
# !pip install pandas	
# !pip install numpy
# !pip install tqdm	
# !pip install matplotlib

In [1]:
# 라이브러리
%load_ext autoreload
%autoreload 2
    
import random
import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from numpy.random import randint, rand

import ga_util
from ga_core import GeneticAlgorithm


SEED_VALUE = 20250604

In [2]:
# 데이터 불러오기
machine_info = pd.read_csv("./data/machine_info.csv")
order_info = pd.read_csv("./data/order_info.csv")
order_info_nl = pd.read_csv("./data/order_info_nl.csv", header=None)
order_info_nl_eng = pd.read_csv("./data/order_info_nl_eng.csv")

In [3]:
order_info_nl.columns = ['영업납기', '중산도면', '단가', '수량', '선급', '자연어주문']

In [4]:
# 전처리 (1): column name 변경
order_info = order_info.rename(columns={
    '영업납기': 'time',
    '중산도면': 'item',
    '단가': 'cost',
    '수량': 'qty',
    '선급': 'urgent'
})

machine_info = machine_info.rename(columns={
    'JSDWG': 'item',
    'MCNO': 'machine',
    'AVG_CT': 'capacity'
})

order_info_nl = order_info_nl.rename(columns={
    '영업납기': 'time',
    '중산도면': 'item',
    '단가': 'cost',
    '수량': 'qty',
    '선급': 'urgent',
    '자연어주문': 'nl_order'
})

order_info_nl_eng.columns = ['nl_order']

In [5]:
# 머신인포 dropna, 4자리 숫자 문자열로 변환
machine_info.dropna(inplace=True)
machine_info['machine'] = machine_info['machine'].apply(lambda x: str(int(x)).zfill(4))

In [6]:
# 아이템 -> item01
# qty -> 그대로
# 날짜 -> 일수(날짜 - 2021-01-01) date001(1~365) / 자연어는 수정X
# 선급 -> 일단 자연어 그대로
# cost -> 일단 그대로
# machine -> machine01
# capacity -> 일단 그대로

In [7]:
# 아이템 -> item01
unique_items = sorted(order_info['item'].unique(), reverse=True)

items_mapping = {
        item: f"item{str(i+1).zfill(2)}"
        for i, item in enumerate(unique_items)
    }

def change_item_name(target):
    result = str(target)
    for k, v in items_mapping.items():
        result = result.replace(k, v)

    return result

change_item_name('057791')

'item87'

In [8]:
machine_info['item'] = machine_info['item'].apply(change_item_name)
cond = machine_info['item'].isin(items_mapping.values())
machine_info[cond].head()

Unnamed: 0,item,machine,capacity
897,item96,408,4.89
898,item96,409,5.92
899,item96,410,4.91
900,item96,412,3.5
901,item96,416,1.03


In [9]:
order_info['item'] = order_info['item'].apply(change_item_name)
order_info.head()

Unnamed: 0,time,item,cost,qty,urgent
0,2021-05-13,item15,25870,318,검사품
1,2021-05-24,item16,16229,383,검사품
2,2021-05-30,item96,8333,19,
3,2021-06-03,item91,36533,4,
4,2021-06-18,item87,45500,196,검사품


In [10]:
order_info_nl['item'] = order_info_nl['item'].apply(change_item_name)
order_info_nl['nl_order'] = order_info_nl['nl_order'].apply(change_item_name)
order_info_nl.head()

Unnamed: 0,time,item,cost,qty,urgent,nl_order
0,2021-05-13,item15,25870,318,검사품,item15 맞지? 318개 하는거? 21년 5월 13일까지 하면 되구 단가가 25...
1,2021-05-24,item16,16229,383,검사품,헐 방금 item16 도면 왔는데 21년 5월 24일까지래. 383개 해야되구 16...
2,2021-05-30,item96,8333,19,,"저기 item96 도면 19개 해야돼. 21년 5월 30일까지구 8,333원. 그리..."
3,2021-06-03,item91,36533,4,,어우 빨리 봐봐. item91 4개 21년 6월 3일까지 해야되는데 어떡하지? 단가...
4,2021-06-18,item87,45500,196,검사품,item87 맞지? 196개 하는거? 21년 6월 18일까지 하면 되구 단가가 45...


In [11]:
order_info_nl_eng['nl_order'] = order_info_nl_eng['nl_order'].apply(change_item_name)
order_info_nl_eng.head()

Unnamed: 0,nl_order
0,"item15, right? 318? You can do it by May 13th,..."
1,"Oh, I just got the item16 drawing, and it says..."
2,"There, I need to do 19 drawings for item96. It..."
3,"Hey, look quickly. item91 4 I have to do it by..."
4,"item87, right? Doing 196? You can do it until ..."


In [12]:
# 날짜 -> 일수(날짜 - 2021-01-01) (1~365)
def change_time_name(target):
    time = pd.to_datetime(target)
    base_date = pd.to_datetime('2021-01-01')

    return f'date{str((time-base_date).days+1).zfill(3)}'

change_time_name('2021-01-01'), change_time_name('2021-12-31')

('date001', 'date365')

In [13]:
order_info['time'] = order_info['time'].apply(change_time_name)
order_info.head()

Unnamed: 0,time,item,cost,qty,urgent
0,date133,item15,25870,318,검사품
1,date144,item16,16229,383,검사품
2,date150,item96,8333,19,
3,date154,item91,36533,4,
4,date169,item87,45500,196,검사품


In [14]:
order_info_nl['time'] = order_info_nl['time'].apply(change_time_name)
order_info_nl.head()

Unnamed: 0,time,item,cost,qty,urgent,nl_order
0,date133,item15,25870,318,검사품,item15 맞지? 318개 하는거? 21년 5월 13일까지 하면 되구 단가가 25...
1,date144,item16,16229,383,검사품,헐 방금 item16 도면 왔는데 21년 5월 24일까지래. 383개 해야되구 16...
2,date150,item96,8333,19,,"저기 item96 도면 19개 해야돼. 21년 5월 30일까지구 8,333원. 그리..."
3,date154,item91,36533,4,,어우 빨리 봐봐. item91 4개 21년 6월 3일까지 해야되는데 어떡하지? 단가...
4,date169,item87,45500,196,검사품,item87 맞지? 196개 하는거? 21년 6월 18일까지 하면 되구 단가가 45...


In [15]:
# machine -> machine01
unique_machines = sorted(machine_info['machine'].dropna().unique())

machines_mapping = {
        machine: f"machine{str(i+1).zfill(2)}"
        for i, machine in enumerate(unique_machines)
    }

def change_machine_name(target):
    result = str(target)
    for k, v in machines_mapping.items():
        result = result.replace(k, v)

    return result

change_machine_name('0421')

'machine19'

In [16]:
machine_info['machine'] = machine_info['machine'].apply(change_machine_name)
machine_info.head()

Unnamed: 0,item,machine,capacity
0,50060,machine28,1.08
1,50093,machine04,7.13
2,50093,machine08,4.67
3,50093,machine10,4.5
4,50093,machine14,3.92


In [17]:
# 전처리 - order
def preprocess_order(df):
    # 긴급 생산 요건이 있는 경우 1, 그렇지 않은 경우는 0
    df['urgent'] = df['urgent'].fillna(0)

    for i in range(len(df)):
        if df.loc[i, 'urgent'] != 0:
            df.loc[i, 'urgent'] = 1

    df.dropna(axis=0, inplace=True)

    # 같은 종류의 order 묶기
    df = df.groupby(['time','item','cost','urgent']).sum().reset_index()
    
    return df

# 전처리 - machine
def preprocess_machine(df):
    df.dropna(axis=0, inplace=True)

    return df

In [18]:
# 슬라이딩 윈도우 함수 정의
def create_shifted_dataset(df, window_size=5):
    datasets = []
    for i in range(len(df) - window_size + 1):
        window = df.iloc[i:i+window_size].reset_index(drop=True)
        datasets.append(window)
    return datasets

In [19]:
machine_info = preprocess_machine(machine_info)

def make_dataset(order_info_nl, machine_info, window_size=5):
    dataset = create_shifted_dataset(order_info_nl, window_size=window_size)
    dataset_for_ga = list()
    dataset_for_nl = list()
    dataset_for_order = list()

    for data in dataset:
        order = data.iloc[:, :5]
        nl = data.iloc[:, 5:]

        order = preprocess_order(order)
        dataset_for_order.append(order)
        order = pd.merge(order, machine_info, on='item', how='inner')

        dataset_for_ga.append(order)
        dataset_for_nl.append(nl)

    return dataset_for_ga, dataset_for_nl, dataset_for_order

# raw_order_info_nl_eng = order_info_nl_eng.copy()
order_info_nl_eng = pd.concat([order_info, order_info_nl_eng], axis=1)
dataset_for_ga, dataset_for_nl, dataset_for_order = make_dataset(order_info_nl_eng, machine_info)

In [56]:
# 노트북 셀 (dataset_for_ga가 생성된 이후)

if 'dataset_for_ga' in globals() and dataset_for_ga:
    rng = np.random.RandomState(seed=20250526) # 시드 고정 (결과 비교를 위해)
    # TODO: random_ls를 만들 때 dataset_for_ga의 실제 길이로 size를 제한해야 할 수 있음
    #       (만약 dataset_for_ga 길이가 10보다 작다면 에러 발생 가능)
    #       여기서는 dataset_for_ga 길이가 충분히 크다고 가정.
    #       또는, 테스트를 위해 실제 샘플 개수를 줄여도 좋음 (예: size=1 또는 2)
    num_samples_to_test = min(2, len(dataset_for_ga)) # 실제 샘플 수 제한
    random_ls = rng.choice(a=len(dataset_for_ga), size=num_samples_to_test, replace=False)
    temp0_test_samples = [dataset_for_ga[i] for i in random_ls]
    print(f"테스트용 데이터 샘플 {len(temp0_test_samples)}개 준비 완료 (temp0_test_samples).")
else:
    print("오류: dataset_for_ga가 준비되지 않았습니다. 이전 셀들을 실행해주세요.")
    temp0_test_samples = []

# 사용할 고정 하이퍼파라미터 (네가 지정한 값)
ga_fixed_hyperparams = {
    'n_iter': 500,
    'n_pop': 20,
    'r_cross': 0.4,
    'r_mut': 0.1,
    'max_machine_work_time': 600,
    'overproduction_penalty_factor': 10000000,
    'gene_swap_prob': 0.5 # 우리가 교차 연산자에 추가했던 확률
}
print(f"사용할 고정 하이퍼파라미터: {ga_fixed_hyperparams}")

테스트용 데이터 샘플 2개 준비 완료 (temp0_test_samples).
사용할 고정 하이퍼파라미터: {'n_iter': 500, 'n_pop': 20, 'r_cross': 0.4, 'r_mut': 0.1, 'max_machine_work_time': 600, 'overproduction_penalty_factor': 10000000, 'gene_swap_prob': 0.5}


In [57]:
rng = np.random.RandomState(seed=20250526)
random_ls = rng.choice(a=len(dataset_for_ga), size=(10, ))
random_ls

array([ 68,   6,  77,  91,  29,  10,  58, 109,   0, 111])

In [58]:
import pickle

hyper_parameters_ls_refac = [] # 이 변수는 원래 find_hyper_parameters 관련이었으므로, 여기서는 사용 안 함
log_ls_refac = []              # GA 실행 로그 (best_score 변화) 저장용
log_detail_ls_refac = []       # GA 상세 로그 (best_solution 변화 포함) 저장용
best_solutions_dict_ls_refac = [] # 각 샘플의 best_solution (딕셔너리) 저장용
best_scores_ls_refac = []      # 각 샘플의 best_score 저장용
gt_ls_refactored = []          # 최종 GT DataFrame 저장용


def generate_ground_truth_with_refactored_ga(dataset_for_ga_input, fixed_hyperparams):
    """
    리팩토링된 GeneticAlgorithm 클래스를 사용하여 전체 데이터셋에 대해
    GA를 실행하고 Ground Truth 생산 계획(DataFrame 리스트)을 생성합니다.
    """
    # 전역 리스트를 사용하지 않고 함수 내에서 결과를 만들어서 반환하는 것이 더 좋은 패턴이지만,
    # 오리지널 코드의 구조를 최대한 따르기 위해 외부 리스트에 append 하는 방식을 유지해볼 수 있음.
    # (하지만, 여기서는 함수가 결과를 반환하도록 수정하는 것을 추천)
    
    # 이 함수 내에서 사용할 임시 리스트들
    current_run_best_solutions = []
    current_run_best_scores = []
    current_run_logs = []
    current_run_log_details = []
    current_run_gt_dataframes = []

    data_len = len(dataset_for_ga_input)
    print(f"\n--- 리팩토링된 GA로 Ground Truth 생성 시작 (총 {data_len}개 샘플) ---")

    for i, data_sample_df in enumerate(dataset_for_ga_input):
        print(f"  샘플 {i + 1}/{data_len} 처리 중...")

        # 1. 데이터 전처리 (ga_util 사용)
        #    T_set, I_set, J_set 등은 GA 인스턴스 생성 시점에 전달됨.
        T_set, I_set, J_set, cit_data, pit_data, dit_data, mijt_data = \
            ga_util.preprocess_ga_inputs_dense(data_sample_df.copy()) # 또는 _sparse 버전

        # 2. GeneticAlgorithm 인스턴스 생성 (고정된 하이퍼파라미터 사용)
        #    xijt_keys, mijt_keys는 클래스 __init__에서 자동 생성되도록 None 전달
        ga_instance = GeneticAlgorithm(
            T_set=T_set, I_set=I_set, J_set=J_set,
            cit_data=cit_data, pit_data=pit_data, dit_data=dit_data, mijt_data=mijt_data,
            n_pop=fixed_hyperparams['n_pop'],
            n_iter=fixed_hyperparams['n_iter'],
            r_cross=fixed_hyperparams['r_cross'],
            r_mut=fixed_hyperparams['r_mut'],
            # __init__에 추가했던 다른 하이퍼파라미터들도 여기서 전달 가능
            max_machine_work_time=fixed_hyperparams.get('max_machine_work_time', 600),
            overproduction_penalty_factor=fixed_hyperparams.get('overproduction_penalty_factor', 10000000),
            gene_swap_prob=fixed_hyperparams.get('gene_swap_prob', 0.5),
            xijt_keys_list=None 
        )

        # 3. GA 실행
        #    solve() 메소드는 best_solution(dict), best_score, log, log_detail을 반환
        best_solution_dict, best_score, log, log_detail = ga_instance.solve()

        # 결과 저장 (함수 스코프 내의 리스트에)
        current_run_best_solutions.append(best_solution_dict)
        current_run_best_scores.append(int(best_score))
        current_run_logs.append(log)
        current_run_log_details.append(log_detail)

        # 로그 파일 저장 (오리지널 코드와 유사하게 매번 저장하거나, 루프 후 한 번에 저장)
        # 여기서는 루프 후 한 번에 저장하는 것을 가정하고, 이 부분은 주석 처리 또는 삭제
        # with open('log_n5_refactored.pkl', 'wb') as f:
        #     pickle.dump([current_run_best_solutions, current_run_best_scores, 
        #                  current_run_logs, current_run_log_details], f)
        # print(f"  샘플 {i + 1} 로그 임시 저장됨 (log_n5_refactored.pkl).")


        # 4. best_solution_dict (생산 비율 딕셔너리)를 GT DataFrame (실제 생산량)으로 변환
        #    이 로직은 이전 `run_refactored_ga_on_samples` 함수에서 사용했던 것과 동일
        if best_solution_dict:
            processed_gt_rows = []
            # dit_data는 현재 data_sample_df에 대한 전처리 결과 사용
            for (item_k, machine_k, time_k), ratio_val in best_solution_dict.items():
                # best_solution_dict의 값(ratio_val)은 생산 비율이라고 가정
                demand_for_item_time = dit_data.get((item_k, time_k), 0) 
                actual_qty = round(ratio_val * demand_for_item_time)
                if actual_qty > 0: # 실제 생산량이 0보다 큰 경우만 GT에 포함
                   processed_gt_rows.append({
                       'item': item_k, 
                       'machine': machine_k, 
                       'time': time_k, 
                       'qty': actual_qty
                   })
            
            if processed_gt_rows:
                gt_df = pd.DataFrame(processed_gt_rows)
                # 컬럼 순서 및 정렬 (오리지널 코드 참고 또는 필요에 맞게)
                gt_df = gt_df[['item', 'machine', 'time', 'qty']].sort_values(
                    by=['time', 'item', 'machine', 'qty'] # 정렬 기준은 원래 코드와 동일하게
                ).reset_index(drop=True)
                current_run_gt_dataframes.append(gt_df)
            else:
                current_run_gt_dataframes.append(pd.DataFrame(columns=['item', 'machine', 'time', 'qty']))
        else:
            # best_solution_dict가 None인 경우 (GA가 해를 못 찾았거나 하는 예외 상황)
            current_run_gt_dataframes.append(pd.DataFrame(columns=['item', 'machine', 'time', 'qty']))
        print(f"  샘플 {i + 1} 완료. Best Score: {best_score}")

    # 모든 샘플 처리 후 최종 결과 저장 (선택 사항, 오리지널 코드 방식)
    # 로그 저장
    # with open('log_n5_refactored_final.pkl', 'wb') as f:
    #     pickle.dump([current_run_best_solutions, current_run_best_scores, 
    #                  current_run_logs, current_run_log_details], f)
    # print("\n최종 로그 저장 완료 (log_n5_refactored_final.pkl).")

    # GT 리스트 저장
    with open('gt_n5_refactored_final.pkl', 'wb') as f: # 파일 이름 변경
        pickle.dump(current_run_gt_dataframes, f)
    print("최종 GT 리스트 저장 완료 (gt_n5_refactored_final.pkl).")

    # 이 함수가 결과를 직접 반환하도록 수정 (더 좋은 패턴)
    return current_run_best_scores, current_run_gt_dataframes, current_run_logs, current_run_log_details


In [59]:
if 'dataset_for_ga' in globals() and dataset_for_ga: # dataset_for_ga는 전체 데이터셋
    # dataset_for_ga 전체 또는 일부(temp0)를 사용
    # 예시: dataset_to_process_now = temp0_test_samples # 축소 실행 시
    dataset_to_process_now = dataset_for_ga # 전체 실행 시

    final_scores, final_gt_list, final_logs, _ = \
        generate_ground_truth_with_refactored_ga(dataset_to_process_now, ga_fixed_hyperparams)
    best_scores_ls_refac = final_scores
    print("\n--- 리팩토링된 GA 최종 실행 요약 ---")
    print(f"처리된 샘플 수: {len(final_scores)}")
    # print(f"평균 Best Score: {np.mean(final_scores) if final_scores else 'N/A'}")
    # print(f"첫 번째 샘플의 GT (상위 3개):")
    # if final_gt_list and not final_gt_list[0].empty:
    #     print(final_gt_list[0].head(3))

else:
    print("dataset_for_ga가 준비되지 않아 실행할 수 없습니다.")



--- 리팩토링된 GA로 Ground Truth 생성 시작 (총 123개 샘플) ---
  샘플 1/123 처리 중...
  [solve] 세대 0: 초기 모집단 생성됨 (크기: 20)
  [solve] 세대 0: 단일 염색체 크기: 625)
세대 225에서 조기 종료: 200 세대 동안 개선 없음.
  샘플 1 완료. Best Score: 1770326.0
  샘플 2/123 처리 중...
  [solve] 세대 0: 초기 모집단 생성됨 (크기: 20)
  [solve] 세대 0: 단일 염색체 크기: 540)
세대 475에서 조기 종료: 200 세대 동안 개선 없음.
  샘플 2 완료. Best Score: 2373997.0
  샘플 3/123 처리 중...
  [solve] 세대 0: 초기 모집단 생성됨 (크기: 20)
  [solve] 세대 0: 단일 염색체 크기: 500)
세대 233에서 조기 종료: 200 세대 동안 개선 없음.
  샘플 3 완료. Best Score: 2047500.0
  샘플 4/123 처리 중...
  [solve] 세대 0: 초기 모집단 생성됨 (크기: 20)
  [solve] 세대 0: 단일 염색체 크기: 420)
세대 349에서 조기 종료: 200 세대 동안 개선 없음.
  샘플 4 완료. Best Score: 1371128.0
  샘플 5/123 처리 중...
  [solve] 세대 0: 초기 모집단 생성됨 (크기: 20)
  [solve] 세대 0: 단일 염색체 크기: 240)
세대 213에서 조기 종료: 200 세대 동안 개선 없음.
  샘플 5 완료. Best Score: 2287141
  샘플 6/123 처리 중...
  [solve] 세대 0: 초기 모집단 생성됨 (크기: 20)
  [solve] 세대 0: 단일 염색체 크기: 210)
세대 389에서 조기 종료: 200 세대 동안 개선 없음.
  샘플 6 완료. Best Score: 53606130000000.0
  샘플 7/123 처리 중...
  [solve

In [60]:
with open('gt_n5_refactored_final.pkl', 'rb') as f:
    refactored_gt_ls = pickle.load(f)

refactored_gt_ls[0]

Unnamed: 0,item,machine,time,qty
0,item15,machine04,date133,19
1,item15,machine05,date133,65
2,item15,machine07,date133,201
3,item15,machine08,date133,223
4,item15,machine09,date133,199
...,...,...,...,...
56,item87,machine19,date169,17
57,item87,machine20,date169,70
58,item87,machine21,date169,110
59,item87,machine32,date169,80


In [61]:
# GT 결과값 데이테프레임화
# for idx, gt in enumerate(refactored_gt_ls):
#     gt[['item', 'machine', 'time']] = pd.DataFrame(gt['(item, machine, time)'].tolist(), index=gt.index)
#     gt = gt.drop(columns='(item, machine, time)')
#     gt = gt[['item', 'machine', 'time', 'qty']].sort_values(['time', 'item', 'machine', 'qty'])
#     refactored_gt_ls[idx] = gt

refactored_gt_ls[0]

Unnamed: 0,item,machine,time,qty
0,item15,machine04,date133,19
1,item15,machine05,date133,65
2,item15,machine07,date133,201
3,item15,machine08,date133,223
4,item15,machine09,date133,199
...,...,...,...,...
56,item87,machine19,date169,17
57,item87,machine20,date169,70
58,item87,machine21,date169,110
59,item87,machine32,date169,80


In [62]:
# order 정보 묶기
def combine_order_nl(data):
    combined_data = '\n'.join(f'• {text}' for text in data['nl_order'])

    return combined_data

combine_order_nl(dataset_for_nl[0])

"• item15, right? 318? You can do it by May 13th, 2021, and the unit price is 25,870 won. Oh, and the inspection product.\n• Oh, I just got the item16 drawing, and it says it's due on May 24th, 2021. I need to do 383 of them, and it costs 16,229 won, and it's an inspection item.\n• There, I need to do 19 drawings for item96. It's until May 30th, 2021. It's 8,333 won. And nothing special.\n• Hey, look quickly. item91 4 I have to do it by June 3rd, 2021. What should I do? The unit price is 36,533 won and there's nothing special about it.\n• item87, right? Doing 196? You can do it until June 18th, 2021, and the unit price is 45,500 won. Oh, and the inspection product"

In [63]:
order_nl_for_dataset = [combine_order_nl(data) for data in dataset_for_nl]
order_nl_for_dataset[0]

"• item15, right? 318? You can do it by May 13th, 2021, and the unit price is 25,870 won. Oh, and the inspection product.\n• Oh, I just got the item16 drawing, and it says it's due on May 24th, 2021. I need to do 383 of them, and it costs 16,229 won, and it's an inspection item.\n• There, I need to do 19 drawings for item96. It's until May 30th, 2021. It's 8,333 won. And nothing special.\n• Hey, look quickly. item91 4 I have to do it by June 3rd, 2021. What should I do? The unit price is 36,533 won and there's nothing special about it.\n• item87, right? Doing 196? You can do it until June 18th, 2021, and the unit price is 45,500 won. Oh, and the inspection product"

In [64]:
dataset_for_order[0]

Unnamed: 0,time,item,cost,urgent,qty
0,date133,item15,25870,1,318
1,date144,item16,16229,1,383
2,date150,item96,8333,0,19
3,date154,item91,36533,0,4
4,date169,item87,45500,1,196


In [65]:
dataset_for_ga[0]

Unnamed: 0,time,item,cost,urgent,qty,machine,capacity
0,date133,item15,25870,1,318,machine04,12.84
1,date133,item15,25870,1,318,machine05,10.42
2,date133,item15,25870,1,318,machine07,8.69
3,date133,item15,25870,1,318,machine08,9.12
4,date133,item15,25870,1,318,machine09,10.33
...,...,...,...,...,...,...,...
58,date169,item87,45500,1,196,machine19,10.00
59,date169,item87,45500,1,196,machine20,11.37
60,date169,item87,45500,1,196,machine21,7.85
61,date169,item87,45500,1,196,machine32,5.49


In [66]:
# order machine join 정보 묶기
def combine_order_machine_join(data):
    combined_data = list()
    for _, row in data.iterrows():
        combined_data.append(f"• {row['time']} {row['item']} {row['cost']} {row['urgent']} {row['qty']} {row['machine']} {row['capacity']}")

    combined_data = "\n".join(combined_data)
        
    return combined_data

combine_order_machine_join(dataset_for_ga[0])

'• date133 item15 25870 1 318 machine04 12.84\n• date133 item15 25870 1 318 machine05 10.42\n• date133 item15 25870 1 318 machine07 8.69\n• date133 item15 25870 1 318 machine08 9.12\n• date133 item15 25870 1 318 machine09 10.33\n• date133 item15 25870 1 318 machine10 8.99\n• date133 item15 25870 1 318 machine12 12.32\n• date133 item15 25870 1 318 machine14 11.23\n• date133 item15 25870 1 318 machine20 4.0\n• date133 item15 25870 1 318 machine21 9.87\n• date133 item15 25870 1 318 machine23 11.77\n• date133 item15 25870 1 318 machine28 9.54\n• date133 item15 25870 1 318 machine29 13.05\n• date133 item15 25870 1 318 machine30 9.0\n• date133 item15 25870 1 318 machine31 9.0\n• date133 item15 25870 1 318 machine33 4.4\n• date133 item15 25870 1 318 machine34 12.47\n• date133 item15 25870 1 318 machine35 9.4\n• date144 item16 16229 1 383 machine07 4.85\n• date144 item16 16229 1 383 machine08 3.83\n• date144 item16 16229 1 383 machine09 5.37\n• date144 item16 16229 1 383 machine10 3.09\n• date

In [67]:
order_machine_join = [combine_order_machine_join(data) for data in dataset_for_ga]
print(order_machine_join[0])

• date133 item15 25870 1 318 machine04 12.84
• date133 item15 25870 1 318 machine05 10.42
• date133 item15 25870 1 318 machine07 8.69
• date133 item15 25870 1 318 machine08 9.12
• date133 item15 25870 1 318 machine09 10.33
• date133 item15 25870 1 318 machine10 8.99
• date133 item15 25870 1 318 machine12 12.32
• date133 item15 25870 1 318 machine14 11.23
• date133 item15 25870 1 318 machine20 4.0
• date133 item15 25870 1 318 machine21 9.87
• date133 item15 25870 1 318 machine23 11.77
• date133 item15 25870 1 318 machine28 9.54
• date133 item15 25870 1 318 machine29 13.05
• date133 item15 25870 1 318 machine30 9.0
• date133 item15 25870 1 318 machine31 9.0
• date133 item15 25870 1 318 machine33 4.4
• date133 item15 25870 1 318 machine34 12.47
• date133 item15 25870 1 318 machine35 9.4
• date144 item16 16229 1 383 machine07 4.85
• date144 item16 16229 1 383 machine08 3.83
• date144 item16 16229 1 383 machine09 5.37
• date144 item16 16229 1 383 machine10 3.09
• date144 item16 16229 1 383 

In [68]:
def combine_order(data):
    combined_data = list()
    for _, row in data.iterrows():
        combined_data.append(f"• {row['time']} {row['item']} {row['cost']} {row['urgent']} {row['qty']}")

    combined_data = "\n".join(combined_data)
        
    return combined_data

print(combine_order(dataset_for_order[0]))

• date133 item15 25870 1 318
• date144 item16 16229 1 383
• date150 item96 8333 0 19
• date154 item91 36533 0 4
• date169 item87 45500 1 196


In [69]:
order_for_dataset = [combine_order(data) for data in dataset_for_ga]
order_for_dataset[0]

'• date133 item15 25870 1 318\n• date133 item15 25870 1 318\n• date133 item15 25870 1 318\n• date133 item15 25870 1 318\n• date133 item15 25870 1 318\n• date133 item15 25870 1 318\n• date133 item15 25870 1 318\n• date133 item15 25870 1 318\n• date133 item15 25870 1 318\n• date133 item15 25870 1 318\n• date133 item15 25870 1 318\n• date133 item15 25870 1 318\n• date133 item15 25870 1 318\n• date133 item15 25870 1 318\n• date133 item15 25870 1 318\n• date133 item15 25870 1 318\n• date133 item15 25870 1 318\n• date133 item15 25870 1 318\n• date144 item16 16229 1 383\n• date144 item16 16229 1 383\n• date144 item16 16229 1 383\n• date144 item16 16229 1 383\n• date144 item16 16229 1 383\n• date144 item16 16229 1 383\n• date144 item16 16229 1 383\n• date144 item16 16229 1 383\n• date144 item16 16229 1 383\n• date144 item16 16229 1 383\n• date144 item16 16229 1 383\n• date144 item16 16229 1 383\n• date144 item16 16229 1 383\n• date150 item96 8333 0 19\n• date150 item96 8333 0 19\n• date150 ite

In [70]:
def combine_machine(data):
    combined_data = list()
    for _, row in data.iterrows():
        combined_data.append(f"• {row['item']} {row['machine']} {row['capacity']}")

    combined_data = list(set(combined_data))
    combined_data = "\n".join(combined_data)
        
    return combined_data

print(combine_machine(dataset_for_ga[0]))

• item87 machine01 5.88
• item16 machine21 3.94
• item15 machine09 10.33
• item15 machine28 9.54
• item16 machine09 5.37
• item87 machine17 12.0
• item96 machine09 5.92
• item15 machine20 4.0
• item96 machine29 7.12
• item15 machine31 9.0
• item96 machine35 1.83
• item16 machine35 3.85
• item96 machine34 3.0
• item87 machine03 6.5
• item16 machine23 6.25
• item91 machine10 5.32
• item91 machine28 4.33
• item96 machine12 3.5
• item16 machine14 2.73
• item16 machine07 4.85
• item15 machine33 4.4
• item16 machine08 3.83
• item87 machine32 5.49
• item87 machine18 11.82
• item16 machine22 5.47
• item15 machine12 12.32
• item96 machine08 4.89
• item96 machine28 4.64
• item16 machine31 4.53
• item91 machine09 4.03
• item87 machine05 11.64
• item87 machine21 7.85
• item16 machine10 3.09
• item15 machine07 8.69
• item15 machine29 13.05
• item16 machine30 3.8
• item96 machine21 4.6
• item91 machine21 4.8
• item91 machine34 5.25
• item91 machine35 4.5
• item15 machine08 9.12
• item87 machine20 11

In [71]:
machine_for_dataset = [combine_machine(data) for data in dataset_for_ga]
machine_for_dataset[0]

'• item87 machine01 5.88\n• item16 machine21 3.94\n• item15 machine09 10.33\n• item15 machine28 9.54\n• item16 machine09 5.37\n• item87 machine17 12.0\n• item96 machine09 5.92\n• item15 machine20 4.0\n• item96 machine29 7.12\n• item15 machine31 9.0\n• item96 machine35 1.83\n• item16 machine35 3.85\n• item96 machine34 3.0\n• item87 machine03 6.5\n• item16 machine23 6.25\n• item91 machine10 5.32\n• item91 machine28 4.33\n• item96 machine12 3.5\n• item16 machine14 2.73\n• item16 machine07 4.85\n• item15 machine33 4.4\n• item16 machine08 3.83\n• item87 machine32 5.49\n• item87 machine18 11.82\n• item16 machine22 5.47\n• item15 machine12 12.32\n• item96 machine08 4.89\n• item96 machine28 4.64\n• item16 machine31 4.53\n• item91 machine09 4.03\n• item87 machine05 11.64\n• item87 machine21 7.85\n• item16 machine10 3.09\n• item15 machine07 8.69\n• item15 machine29 13.05\n• item16 machine30 3.8\n• item96 machine21 4.6\n• item91 machine21 4.8\n• item91 machine34 5.25\n• item91 machine35 4.5\n• it

In [72]:
# gt 정보 텍스트화
def combine_gt(data):
    combined_data = list()
    for _, row in data.iterrows():
        combined_data.append(f"• {row['item']} {row['machine']} {row['time']} {row['qty']}")

    combined_data = "\n".join(combined_data)
        
    return combined_data

print(combine_gt(refactored_gt_ls[0]))

• item15 machine04 date133 19
• item15 machine05 date133 65
• item15 machine07 date133 201
• item15 machine08 date133 223
• item15 machine09 date133 199
• item15 machine10 date133 147
• item15 machine12 date133 200
• item15 machine14 date133 221
• item15 machine20 date133 115
• item15 machine21 date133 3
• item15 machine23 date133 28
• item15 machine28 date133 92
• item15 machine29 date133 250
• item15 machine30 date133 149
• item15 machine31 date133 185
• item15 machine33 date133 128
• item15 machine34 date133 61
• item15 machine35 date133 266
• item16 machine07 date144 245
• item16 machine08 date144 163
• item16 machine09 date144 65
• item16 machine10 date144 92
• item16 machine14 date144 280
• item16 machine21 date144 166
• item16 machine22 date144 312
• item16 machine23 date144 319
• item16 machine28 date144 65
• item16 machine29 date144 117
• item16 machine30 date144 382
• item16 machine31 date144 140
• item16 machine35 date144 317
• item96 machine08 date150 10
• item96 machine09 

In [73]:
refactored_gt_for_dataset = [combine_gt(data) for data in refactored_gt_ls]
refactored_gt_for_dataset[0]

'• item15 machine04 date133 19\n• item15 machine05 date133 65\n• item15 machine07 date133 201\n• item15 machine08 date133 223\n• item15 machine09 date133 199\n• item15 machine10 date133 147\n• item15 machine12 date133 200\n• item15 machine14 date133 221\n• item15 machine20 date133 115\n• item15 machine21 date133 3\n• item15 machine23 date133 28\n• item15 machine28 date133 92\n• item15 machine29 date133 250\n• item15 machine30 date133 149\n• item15 machine31 date133 185\n• item15 machine33 date133 128\n• item15 machine34 date133 61\n• item15 machine35 date133 266\n• item16 machine07 date144 245\n• item16 machine08 date144 163\n• item16 machine09 date144 65\n• item16 machine10 date144 92\n• item16 machine14 date144 280\n• item16 machine21 date144 166\n• item16 machine22 date144 312\n• item16 machine23 date144 319\n• item16 machine28 date144 65\n• item16 machine29 date144 117\n• item16 machine30 date144 382\n• item16 machine31 date144 140\n• item16 machine35 date144 317\n• item96 machine0

In [74]:
refactored_dataset = pd.DataFrame(columns=['order', 'nl', 'machine', 'gt', 'objective'])
refactored_dataset

Unnamed: 0,order,nl,machine,gt,objective


In [75]:
print(f"best_scores_ls_refac의 길이: {len(best_scores_ls_refac)}")

best_scores_ls_refac의 길이: 123


In [76]:
refactored_dataset['order'] = order_for_dataset
refactored_dataset['nl'] = order_nl_for_dataset
refactored_dataset['machine'] = machine_for_dataset
refactored_dataset['gt'] = refactored_gt_for_dataset
refactored_dataset['objective'] = best_scores_ls_refac
refactored_dataset.head()

Unnamed: 0,order,nl,machine,gt,objective
0,• date133 item15 25870 1 318\n• date133 item15...,"• item15, right? 318? You can do it by May 13t...",• item87 machine01 5.88\n• item16 machine21 3....,• item15 machine04 date133 19\n• item15 machin...,1770326
1,• date144 item16 16229 1 383\n• date144 item16...,"• Oh, I just got the item16 drawing, and it sa...",• item87 machine01 5.88\n• item16 machine21 3....,• item16 machine07 date144 13\n• item16 machin...,2373997
2,• date041 item05 2990 0 500\n• date150 item96 ...,"• There, I need to do 19 drawings for item96. ...",• item87 machine01 5.88\n• item20 machine02 10...,• item05 machine10 date041 291\n• item96 machi...,2047500
3,• date041 item05 2990 0 500\n• date049 item95 ...,"• Hey, look quickly. item91 4 I have to do it ...",• item87 machine01 5.88\n• item20 machine02 10...,• item05 machine10 date041 433\n• item95 machi...,1371128
4,• date041 item05 2990 0 500\n• date049 item95 ...,"• item87, right? Doing 196? You can do it unti...",• item87 machine01 5.88\n• item20 machine02 10...,• item05 machine10 date041 133\n• item05 machi...,2287141


In [77]:
refactored_dataset['nl_machine'] = (
    "Order information\n" + refactored_dataset['nl'] + "\n\n" +
    "Machine information\n" + refactored_dataset['machine']
)

refactored_dataset['nl_machine_gt'] = (
    "Order information\n" + refactored_dataset['nl'] + "\n\n" +
    "Machine information\n" + refactored_dataset['machine'] + "\n\n" +
    "Schedule result\n" + refactored_dataset['gt']
)

refactored_dataset['order_machine'] = (
    "Order information\n" + refactored_dataset['order'] + "\n\n" +
    "Machine information\n" + refactored_dataset['machine']
)

refactored_dataset['order_machine_gt'] = (
    "Order information\n" + refactored_dataset['order'] + "\n\n" +
    "Machine information\n" + refactored_dataset['machine'] + "\n\n" +
    "Schedule result\n" + refactored_dataset['gt']
)

refactored_dataset['order_machine_join'] = order_machine_join

refactored_dataset['order_machine_join_gt'] = (
    "Order and machine information\n" + refactored_dataset['order_machine_join'] + "\n\n" +
    "Schedule result\n" + refactored_dataset['gt']
)

refactored_dataset.head()

Unnamed: 0,order,nl,machine,gt,objective,nl_machine,nl_machine_gt,order_machine,order_machine_gt,order_machine_join,order_machine_join_gt
0,• date133 item15 25870 1 318\n• date133 item15...,"• item15, right? 318? You can do it by May 13t...",• item87 machine01 5.88\n• item16 machine21 3....,• item15 machine04 date133 19\n• item15 machin...,1770326,"Order information\n• item15, right? 318? You c...","Order information\n• item15, right? 318? You c...",Order information\n• date133 item15 25870 1 31...,Order information\n• date133 item15 25870 1 31...,• date133 item15 25870 1 318 machine04 12.84\n...,Order and machine information\n• date133 item1...
1,• date144 item16 16229 1 383\n• date144 item16...,"• Oh, I just got the item16 drawing, and it sa...",• item87 machine01 5.88\n• item16 machine21 3....,• item16 machine07 date144 13\n• item16 machin...,2373997,"Order information\n• Oh, I just got the item16...","Order information\n• Oh, I just got the item16...",Order information\n• date144 item16 16229 1 38...,Order information\n• date144 item16 16229 1 38...,• date144 item16 16229 1 383 machine07 4.85\n•...,Order and machine information\n• date144 item1...
2,• date041 item05 2990 0 500\n• date150 item96 ...,"• There, I need to do 19 drawings for item96. ...",• item87 machine01 5.88\n• item20 machine02 10...,• item05 machine10 date041 291\n• item96 machi...,2047500,"Order information\n• There, I need to do 19 dr...","Order information\n• There, I need to do 19 dr...",Order information\n• date041 item05 2990 0 500...,Order information\n• date041 item05 2990 0 500...,• date041 item05 2990 0 500 machine10 4.6\n• d...,Order and machine information\n• date041 item0...
3,• date041 item05 2990 0 500\n• date049 item95 ...,"• Hey, look quickly. item91 4 I have to do it ...",• item87 machine01 5.88\n• item20 machine02 10...,• item05 machine10 date041 433\n• item95 machi...,1371128,"Order information\n• Hey, look quickly. item91...","Order information\n• Hey, look quickly. item91...",Order information\n• date041 item05 2990 0 500...,Order information\n• date041 item05 2990 0 500...,• date041 item05 2990 0 500 machine10 4.6\n• d...,Order and machine information\n• date041 item0...
4,• date041 item05 2990 0 500\n• date049 item95 ...,"• item87, right? Doing 196? You can do it unti...",• item87 machine01 5.88\n• item20 machine02 10...,• item05 machine10 date041 133\n• item05 machi...,2287141,"Order information\n• item87, right? Doing 196?...","Order information\n• item87, right? Doing 196?...",Order information\n• date041 item05 2990 0 500...,Order information\n• date041 item05 2990 0 500...,• date041 item05 2990 0 500 machine10 4.6\n• d...,Order and machine information\n• date041 item0...


In [78]:
print(refactored_dataset.iloc[0, -3])

Order information
• date133 item15 25870 1 318
• date133 item15 25870 1 318
• date133 item15 25870 1 318
• date133 item15 25870 1 318
• date133 item15 25870 1 318
• date133 item15 25870 1 318
• date133 item15 25870 1 318
• date133 item15 25870 1 318
• date133 item15 25870 1 318
• date133 item15 25870 1 318
• date133 item15 25870 1 318
• date133 item15 25870 1 318
• date133 item15 25870 1 318
• date133 item15 25870 1 318
• date133 item15 25870 1 318
• date133 item15 25870 1 318
• date133 item15 25870 1 318
• date133 item15 25870 1 318
• date144 item16 16229 1 383
• date144 item16 16229 1 383
• date144 item16 16229 1 383
• date144 item16 16229 1 383
• date144 item16 16229 1 383
• date144 item16 16229 1 383
• date144 item16 16229 1 383
• date144 item16 16229 1 383
• date144 item16 16229 1 383
• date144 item16 16229 1 383
• date144 item16 16229 1 383
• date144 item16 16229 1 383
• date144 item16 16229 1 383
• date150 item96 8333 0 19
• date150 item96 8333 0 19
• date150 item96 8333 0 19
• 

In [79]:
refactored_dataset.to_csv('./refactored_dataset_n5_20250526.csv')

In [80]:
import pandas as pd
import numpy as np # np.isclose 같은 함수를 사용할 수 있음

# 1. 데이터 로드
try:
    df_original = pd.read_csv("./dataset_n5_20250526.csv")
    df_refactored = pd.read_csv("./refactored_dataset_n5_20250526.csv")
    print("CSV 파일 로드 성공!")
except FileNotFoundError:
    print("오류: CSV 파일 중 하나 또는 모두를 찾을 수 없습니다. 파일 경로와 이름을 확인해주세요.")
    # 이 경우 아래 코드는 실행되지 않도록 처리 필요 (예: sys.exit() 또는 함수화)

# 2. 두 DataFrame의 길이 확인 (동일한 샘플 개수에 대한 결과인지 확인)
if len(df_original) != len(df_refactored):
    print("\n!! 경고: 두 CSV 파일의 행 개수가 다릅니다 !!")
    print(f"  오리지널 데이터셋 행 개수: {len(df_original)}")
    print(f"  리팩토링된 데이터셋 행 개수: {len(df_refactored)}")
    print("  결과 비교를 위해서는 동일한 수의 샘플에 대한 결과여야 합니다.")
else:
    print(f"\n두 데이터셋 모두 {len(df_original)}개의 샘플 결과를 포함하고 있습니다. 'objective' 값 비교를 시작합니다...")

    # 3. 'objective' 컬럼 비교를 위한 새로운 DataFrame 생성
    #    'gt' 컬럼도 참고용으로 포함 (문자열이 길 수 있으니 주의)
    comparison_df = pd.DataFrame({
        'objective_original': df_original['objective'],
        'objective_refactored': df_refactored['objective'],
        'gt_original_preview': df_original['gt'].str[:100] + '...', # GT 문자열이 너무 길면 일부만 표시
        'gt_refactored_preview': df_refactored['gt'].str[:100] + '...'
    })
    
    # 점수 차이 계산 (리팩토링 - 오리지널, 작을수록 리팩토링이 좋은 것)
    comparison_df['score_difference'] = comparison_df['objective_refactored'] - comparison_df['objective_original']
    
    print("\n--- 목적 함수 값 비교 (상위 10개 샘플) ---")
    print(comparison_df.head(10))
    
    print("\n--- 'objective' 값에 대한 기술 통계 ---")
    print("\n[오리지널 GA Objective]")
    print(df_original['objective'].describe())
    print("\n[리팩토링된 GA Objective]")
    print(df_refactored['objective'].describe())
    
    print("\n--- 점수 차이 (Refactored - Original)에 대한 기술 통계 ---")
    print(comparison_df['score_difference'].describe())
    
    # 4. 어떤 버전이 더 좋은 점수를 얻었는지 요약
    #    (부동소수점 비교 시 np.isclose 사용 고려, 여기서는 단순 비교)
    refactored_better_count = (comparison_df['objective_refactored'] < comparison_df['objective_original']).sum()
    original_better_count = (comparison_df['objective_original'] < comparison_df['objective_refactored']).sum()
    # 점수가 같은 경우 (부동소수점 오차 감안)
    scores_equal_count = np.isclose(comparison_df['objective_original'], comparison_df['objective_refactored']).sum()

    print("\n--- 최종 성능 비교 요약 ---")
    print(f"리팩토링된 GA가 더 좋은 (낮은) 점수를 찾은 횟수: {refactored_better_count} / {len(comparison_df)}")
    print(f"오리지널 GA가 더 좋은 (낮은) 점수를 찾은 횟수  : {original_better_count} / {len(comparison_df)}")
    print(f"두 GA의 점수가 동일(또는 매우 근접)한 횟수    : {scores_equal_count} / {len(comparison_df)}")

    # 5. (선택 사항) 시각화로 비교
    # import matplotlib.pyplot as plt
    # plt.figure(figsize=(12, 6))
    # plt.subplot(1, 2, 1)
    # comparison_df['objective_original'].plot(kind='hist', bins=30, alpha=0.7, label='Original')
    # comparison_df['objective_refactored'].plot(kind='hist', bins=30, alpha=0.7, label='Refactored')
    # plt.title('Objective Score Distribution')
    # plt.xlabel('Objective Score')
    # plt.legend()

    # plt.subplot(1, 2, 2)
    # plt.scatter(comparison_df['objective_original'], comparison_df['objective_refactored'], alpha=0.5)
    # plt.plot([comparison_df['objective_original'].min(), comparison_df['objective_original'].max()], 
    #          [comparison_df['objective_original'].min(), comparison_df['objective_original'].max()], 
    #          'r--', lw=2, label='y=x line') # 기준선
    # plt.title('Original vs Refactored Scores')
    # plt.xlabel('Original Objective Score')
    # plt.ylabel('Refactored Objective Score')
    # plt.grid(True)
    # plt.legend()
    # plt.tight_layout()
    # plt.show()

CSV 파일 로드 성공!

두 데이터셋 모두 123개의 샘플 결과를 포함하고 있습니다. 'objective' 값 비교를 시작합니다...

--- 목적 함수 값 비교 (상위 10개 샘플) ---
   objective_original  objective_refactored  \
0             1792473               1770326   
1             1532972               2373997   
2             1865500               2047500   
3             1231564               1371128   
4             1037423               2287141   
5      51300490000000        53606130000000   
6                   0                     0   
7                   0                     0   
8                   0                     0   
9                   0                     0   

                                 gt_original_preview  \
0  • item15 machine04 date133 35\n• item15 machin...   
1  • item16 machine07 date144 1\n• item16 machine...   
2  • item05 machine10 date041 102\n• item96 machi...   
3  • item95 machine19 date049 2\n• item91 machine...   
4  • item05 machine10 date041 36\n• item05 machin...   
5  • item05 machine10 date041 42\n• it