In [53]:
import pandas as pd
# Load the variable specification sheet; later cells read one row per variable
# (columns "변수명", "변수예시" and "종류") to decide how to synthesize each column.
code_info = pd.read_excel(io='./data/code_info.xlsx')

In [54]:
import re

import numpy as np
import pandas as pd

# Shared, explicitly seeded generator so a fresh Restart & Run All reproduces
# the same synthetic values.
RNG = np.random.default_rng(seed=2025)

def _is_blank_cell(cell) -> bool:
    """Return True when a spreadsheet cell holds no usable text (None / NaN / empty)."""
    return cell is None or str(cell).strip() in ("", "nan", "NaN", "<NA>")


def _random_digit_strings(generator, digits: int, n: int):
    """Return ``n`` strings of exactly ``digits`` decimal digits (no leading zero)."""
    # Generator.integers draws int64 by default; 10**18 is the largest power of ten
    # that still fits as an exclusive upper bound.
    if digits <= 18:
        return generator.integers(10 ** (digits - 1), 10 ** digits, size=n).astype(str)
    # Wider identifiers cannot be drawn as a single int64: assemble them digit by digit.
    lead = generator.integers(1, 10, size=(n, 1))
    rest = generator.integers(0, 10, size=(n, digits - 1))
    rows = np.hstack([lead, rest])
    return np.array(["".join(map(str, row)) for row in rows], dtype=str)


def gen_series_by_example(example, n: int, value_kinds=None, rng=None):
    """
    Generate ``n`` synthetic values for one variable from its textual specification.

    Parameters
    ----------
    example : str, NaN or None
        The '변수예시' text describing the variable. Keywords found in it select
        the generator; the rules are tried in the order listed below.
    n : int
        Number of values to generate.
    value_kinds : str, NaN or None, optional
        Comma-separated list of allowed category values. When it contains at least
        one non-empty token it takes priority over ``example``.
    rng : numpy.random.Generator, optional
        Generator to draw from. Defaults to the module-level ``RNG`` so existing
        callers keep their behaviour.

    Returns
    -------
    numpy.ndarray
        Array of length ``n``. Rule order:
        categories from ``value_kinds`` (str) -> "정수" int in [0, 100] ->
        "비율" float in [0, 1) rounded to 3 places -> "<N>자리" N-digit string ->
        "코드" 3-digit string -> "일수" int in [0, 365] -> "점수" int in [0, 100] ->
        "번호" 6-digit string -> "여부" / known yes-no labels 0 or 1 ->
        otherwise all NaN.
    """
    generator = RNG if rng is None else rng

    # Explicit category values win over any keyword in the example text.
    if not _is_blank_cell(value_kinds):
        cats = [token.strip() for token in str(value_kinds).split(",") if token.strip() != ""]
        # A cell holding only separators yields no categories; fall through to the
        # example-based rules instead of letting choice() raise on an empty list.
        if cats:
            # Kept as strings even when the tokens look numeric (categorical data).
            return generator.choice(cats, size=n).astype(str)

    # Otherwise decide from the example text.
    ex = str(example).strip() if example is not None else ""

    if "정수" in ex:
        return generator.integers(0, 101, size=n)
    if "비율" in ex:
        return generator.random(size=n).round(3)

    # "<N>자리": parse the full digit count. Plain substring tests are wrong here
    # because "13자리" contains "3자리" (and "15자리" contains "5자리"), which made
    # long identifiers come out as 3/5-digit values. This rule also has to run
    # before "코드"/"번호" so that e.g. "17자리 번호" keeps its stated width.
    width = re.search(r"(\d+)자리", ex)
    if width and int(width.group(1)) > 0:
        return _random_digit_strings(generator, int(width.group(1)), n)

    if "코드" in ex:
        return generator.integers(100, 1000, size=n).astype(str)  # arbitrary 3-digit code

    if "일수" in ex:
        return generator.integers(0, 366, size=n)
    if "점수" in ex:
        return generator.integers(0, 101, size=n)
    if "번호" in ex:
        return generator.integers(100000, 1000000, size=n).astype(str)  # arbitrary 6-digit number

    if "여부" in ex:
        return generator.choice([1, 0], size=n)

    # Exact yes/no style label sets are encoded as 1/0 flags.
    if ex in ["예, 아니요", "1(6개월 내 취업), 0(미취업)", "1(일치), 0(미일치)"]:
        return generator.choice([1, 0], size=n)

    # No rule matched: all-NaN column.
    return np.array([np.nan] * n)


In [55]:
N = 100  # number of synthetic rows to generate per variable

# One generated column per spec row, keyed by the stripped variable name.
# Rows are visited in sheet order, so the draws from the shared generator happen
# in the same sequence as a plain loop; a repeated name keeps its last column.
data_dict = {
    str(spec["변수명"]).strip(): gen_series_by_example(
        spec.get("변수예시", ""),
        N,
        value_kinds=spec.get("종류", None),  # new column
    )
    for _row_label, spec in code_info.iterrows()
}

example_df = pd.DataFrame(data_dict)

In [56]:
# Discard columns that contain nothing but NaN, then display the remaining table.
example_df_nonnull = example_df.dropna(axis="columns", how="all")
example_df_nonnull

Unnamed: 0,JHNT_MBN,JHNT_CTN,FRFT_AFTR_JHNT_REQR_DYCT,JHNT_RQUT_CHNL_SECD,INFO_OTPB_GRAD_CD,MDTN_HOPE_GRD_CD,IDIF_AOFR_YN,EMAIL_RCYN,DSPT_LABR_YN,COMM_WAGE_TYCD,...,AVG_HOPE_WAGE_SM_AMT,CRQF_CT,ACQ_180_YN,ACQ_DT,MATCH_L_YN,MATCH_M_YN,MATCH_S_YN,LAST_JSCD,SEEK_CUST_NO,CRSE_ID
0,502725,736876,2,고용24,0,필요,1,1,1,상용,...,6,27,0,29,0,1,1,525,243,64461569069418729
1,995012,581715,10,고용24,1,필요,1,0,1,일용,...,39,65,1,11,1,0,1,809,799,33091149255113564
2,993324,761073,13,온라인,1,필요,0,1,0,일용,...,7,71,1,25,1,0,1,388,412,16514797298539507
3,443808,614348,24,고용24,1,불필요,0,0,0,일용,...,11,5,1,38,0,0,1,815,440,30696939908864922
4,958229,867683,89,오프라인,0,필요,1,0,1,일용,...,49,24,0,36,1,1,0,471,884,63690232213469385
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,985174,782765,96,오프라인,1,불필요,0,1,1,상용,...,20,58,1,46,0,1,1,533,768,86535037931963840
96,226048,499694,6,오프라인,0,불필요,1,0,1,상용,...,82,20,0,46,1,0,0,925,749,78006105064715548
97,595683,521660,95,오프라인,1,필요,0,0,0,일용,...,64,99,0,99,1,1,1,839,413,82497571551931977
98,518228,235785,0,온라인,1,불필요,1,1,1,상용,...,87,3,0,76,1,1,1,899,415,33566983138938037


In [57]:
# Show the dtype pandas assigned to each remaining synthetic column.
example_df_nonnull.dtypes

JHNT_MBN                    object
JHNT_CTN                    object
FRFT_AFTR_JHNT_REQR_DYCT     int64
JHNT_RQUT_CHNL_SECD         object
INFO_OTPB_GRAD_CD            int32
                             ...  
MATCH_M_YN                   int32
MATCH_S_YN                   int32
LAST_JSCD                   object
SEEK_CUST_NO                object
CRSE_ID                     object
Length: 79, dtype: object

In [58]:
# Write the synthetic table to disk; the row index is left out of the file.
example_df_nonnull.to_csv(path_or_buf='./data/synthetic_data.csv', index=False)