In [1]:
import pandas as pd
# Load the variable specification sheet; later cells iterate over its rows
# to decide what kind of synthetic values to generate for each variable.
code_info = pd.read_excel('./data/code_info.xlsx')

In [2]:
# Quick sanity check of the loaded sheet: (number of rows, number of columns).
code_info.shape

(85, 3)

In [12]:
import numpy as np
import pandas as pd

RNG = np.random.default_rng(2025)  # fixed seed so a fresh run reproduces the same data

def gen_series_by_example(example, n: int, value_kinds=None):
    """
    Generate ``n`` synthetic values for one variable of the specification sheet.

    Parameters
    ----------
    example : str or NaN
        Free-text example/description of the variable. Keywords found in this
        text select the generator that is used.
    n : int
        Number of values to generate.
    value_kinds : str or NaN, optional
        Comma-separated list of allowed values. When it yields at least one
        non-empty entry it takes priority over ``example`` and the result is
        sampled from those entries as strings.

    Returns
    -------
    numpy.ndarray
        Array of length ``n``; strings for categorical/code-like variables,
        integers or floats otherwise.

    Notes
    -----
    Draws come from the module-level ``RNG``, so results depend on how many
    values were generated before this call.
    """
    # An explicit list of allowed values has the highest priority.
    if value_kinds is not None and str(value_kinds).strip() not in ["", "nan", "NaN"]:
        cats = [v.strip() for v in str(value_kinds).split(",") if v.strip() != ""]
        # Input such as "," leaves no usable category; sampling from an empty
        # list would raise, so fall through to the example-based rules instead.
        if cats:
            # Values that look numeric are still treated as categorical strings.
            return RNG.choice(cats, size=n).astype(str)

    # Otherwise decide from the example text.
    ex = str(example).strip() if example is not None else ""

    # Plain integers and ratios.
    if "정수" in ex:
        return RNG.integers(0, 101, size=n)
    if "비율" in ex:
        return RNG.random(size=n).round(3)

    # Fixed-length digit strings. The longer specs must be tested first:
    # "13자리" contains "3자리" as a substring, and a 13/17-digit spec may also
    # mention "코드" or "번호", which are matched by the generic rules below.
    if "17자리" in ex:
        return RNG.integers(10**16, 10**17, size=n).astype(str)
    if "13자리" in ex:
        return RNG.integers(10**12, 10**13, size=n).astype(str)
    if "5자리" in ex:
        return RNG.integers(10000, 100000, size=n).astype(str)  # 10000~99999
    if "4자리" in ex:
        return RNG.integers(1000, 10000, size=n).astype(str)
    if "3자리" in ex:
        return RNG.integers(100, 1000, size=n).astype(str)
    if "코드" in ex:
        return RNG.integers(100, 1000, size=n).astype(str)   # arbitrary 3-digit code

    if "일수" in ex:
        return RNG.integers(0, 366, size=n)
    if "점수" in ex:
        return RNG.integers(0, 101, size=n)
    if "번호" in ex:
        return RNG.integers(100000, 1000000, size=n).astype(str)  # arbitrary 6-digit number

    # Binary flags.
    if "여부" in ex:
        return RNG.choice([1, 0], size=n)
    if ex in ["예, 아니요", "1(6개월 내 취업), 0(미취업)", "1(일치), 0(미일치)"]:
        return RNG.choice([1, 0], size=n)

    # Anything else: arbitrary integers (assumes preprocessing is already done).
    return RNG.integers(0, 101, size=n)


In [13]:
N = 100

# Build one synthetic column per row of the specification sheet. Rows are
# visited in sheet order, so the sequence of random draws is unchanged.
data_dict = {
    str(spec["변수명"]).strip(): gen_series_by_example(
        spec.get("변수예시", ""),
        N,
        value_kinds=spec.get("종류", None),  # optional list of allowed values
    )
    for _, spec in code_info.iterrows()
}

example_df = pd.DataFrame(data_dict)

In [14]:
# Drop columns in which every value is missing, then display the result.
example_df_nonnull = example_df.dropna(axis=1, how='all')
example_df_nonnull

Unnamed: 0,CLOS_YM,JHNT_MBN,JHNT_CTN,JHCR_DE,JHNT_CLOS_DE,FRFT_AFTR_JHNT_REQR_DYCT,JHNT_RQUT_CHNL_SECD,INFO_OTPB_GRAD_CD,MDTN_HOPE_GRD_CD,IDIF_AOFR_YN,...,ETL_DT,MAKE_DT,ACQ_180_YN,ACQ_DT,MATCH_L_YN,MATCH_M_YN,MATCH_S_YN,LAST_JSCD,EMPN_DE,LAST_FRFT_DE
0,45,497459,121204,83,95,48,온라인,1,필요,1,...,11,47,0,16,1,0,1,736,57,71
1,100,302908,193204,68,42,49,오프라인,0,필요,0,...,35,79,0,78,1,0,1,581,43,31
2,100,529071,220399,46,6,39,고용24,1,불필요,0,...,49,32,0,35,0,1,1,761,28,30
3,38,910176,315822,81,32,94,고용24,0,불필요,0,...,44,80,0,38,1,1,0,614,75,27
4,96,322722,900252,9,91,20,온라인,0,필요,0,...,51,41,0,88,1,1,0,867,78,45
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,99,597329,962576,11,29,75,고용24,1,필요,1,...,18,48,1,75,0,1,0,782,88,16
96,14,542829,155280,5,94,54,오프라인,0,필요,1,...,54,92,1,72,1,1,0,499,60,92
97,55,212541,948248,13,6,11,고용24,0,불필요,0,...,46,83,0,35,1,0,1,521,65,65
98,46,472226,105986,56,40,57,온라인,1,필요,1,...,31,89,0,35,1,0,0,235,59,23


In [15]:
# Inspect the dtype of each generated column.
example_df_nonnull.dtypes

CLOS_YM          int64
JHNT_MBN        object
JHNT_CTN        object
JHCR_DE          int64
JHNT_CLOS_DE     int64
                 ...  
MATCH_M_YN       int32
MATCH_S_YN       int32
LAST_JSCD       object
EMPN_DE          int64
LAST_FRFT_DE     int64
Length: 85, dtype: object

In [None]:
# Write the synthetic data to CSV; index=False omits the row index column.
example_df_nonnull.to_csv('./data/synthetic_data.csv', index=False)