In [1]:
# scan_and_merge_dta_v2.py
import os, re, warnings
from pathlib import Path
from typing import List, Optional
import pandas as pd
warnings.simplefilter("ignore", FutureWarning)

# =========================
# 0) Paths / settings (editable)
# =========================
ROOT_DIR   = r"C:\Users\starw\OneDrive\바탕 화면\dta"         # path where the 11 top-level folders are gathered
OUTPUT_DIR = r"C:\Users\starw\OneDrive\바탕 화면\dta_output"  # path where results are saved
GROUPS_TO_MERGE = ["internet","earnings","hours","conflict","pubs","rebellion","violations","cluster"]

# Keywords (raw-variable candidates); matched as substrings of lower-cased variable names
KEYWORD_GROUPS = {
    "internet":   ["internet","fastinternet","broadband","speed","kbps","mbps","users","usage","penetration"],
    "earnings":   ["earnings","income","revenue","wage","wages","hourly_wage","pay","salary"],
    "hours":      ["hours","hrs","time_worked"],
    "conflict":   ["incident","incidents","conflict","sigacts","ied","direct_fire","casualty","killings","victims"],
    "pubs":       ["publication","publications","paper","papers","patent","patents","citation","citations"],
    "rebellion":  ["rebellion","rebellions","rebel","uprising"],
    "violations": ["violation","violations","penalty","penalties","fine","fines"],
    "cluster":    ["cluster_size","inventors","comets","field_size","peer_count"],
}
# Patterns for transformed variables (i.e. not raw); anchored at the start of the name
TRANSFORM_PATTERNS = [r"^log", r"^ln", r"^asinh", r"^ihs"]

# Candidate merge keys, tried in this order (adjust as needed)
POSSIBLE_KEYS = [
    ["city","country_year"], ["city","year"], ["city_id","year"],
    ["country","year"], ["geo_id","year"], ["id","year"],
    ["region","year"], ["state","year"]
]

# File-name hints for the analysis file (if none match, the largest .dta is chosen)
ANALYSIS_FILE_HINTS = ["paper1","analysis","table","final","main","reg","ready"]

# For documentation only (nothing is actually computed from these)
SUGGESTED_TRANSFORMS = {
    "internet":   "loginternet = log(1 + internet) OR asinh(internet)",
    "earnings":   "asinh(earnings) / asinh(wages)",
    "hours":      "asinh(hours)",
    "conflict":   "log(1 + incidents/killings)",
    "pubs":       "log(publications) / log(citations)",
    "rebellion":  "asinh(rebellions per capita)",
    "violations": "log(violations) / log(penalties)",
    "cluster":    "log(cluster size)",
}

# =========================
# 1) Utility functions
# =========================
def rglob_dta(folder: Path) -> List[Path]:
    """Return every ``*.dta`` path under *folder* (recursively), in sorted order."""
    found = list(folder.rglob("*.dta"))
    found.sort()
    return found

def looks_transformed(var_lower: str) -> bool:
    """Return True when *var_lower* matches any regex in ``TRANSFORM_PATTERNS``.

    ``re.match`` anchors at the start of the string, so only names that
    begin with one of the patterns count as transformed.
    """
    for pattern in TRANSFORM_PATTERNS:
        if re.match(pattern, var_lower):
            return True
    return False

def read_dta(p: Path) -> Optional[pd.DataFrame]:
    """Read the Stata file at *p* without converting categoricals.

    Any failure is reported on stdout as a ``[WARN]`` line and ``None`` is
    returned instead of raising, so callers can skip unreadable files.
    """
    try:
        frame = pd.read_stata(p, convert_categoricals=False)
    except Exception as e:
        print(f"[WARN] read fail: {p} | {e}")
        return None
    return frame

def detect_group(var_lower: str) -> Optional[str]:
    """Return the first group in ``KEYWORD_GROUPS`` with a keyword contained in *var_lower*.

    Matching is plain substring containment; groups are tried in dictionary
    order. Returns ``None`` when no keyword matches.
    """
    return next(
        (
            name
            for name, keywords in KEYWORD_GROUPS.items()
            if any(keyword in var_lower for keyword in keywords)
        ),
        None,
    )

def lower_cols(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* whose column names are lower-cased; *df* itself is not modified."""
    lowered = df.copy()
    lowered.columns = [str.lower(name) for name in lowered.columns]
    return lowered

def pick_analysis_file(dtas: List[Path]) -> Optional[Path]:
    """Choose the analysis file among *dtas*.

    Hints in ``ANALYSIS_FILE_HINTS`` are tried in order; for the first hint
    contained in any lower-cased file stem, the largest matching file (by
    size on disk) is returned. If no hint matches, the largest file overall
    is returned. Returns ``None`` for an empty list.
    """
    if not dtas:
        return None

    def size_on_disk(path):
        return path.stat().st_size

    for hint in ANALYSIS_FILE_HINTS:
        hinted = [p for p in dtas if hint in p.stem.lower()]
        if hinted:
            return max(hinted, key=size_on_disk)
    return max(dtas, key=size_on_disk)

def pick_join_keys(
    paper: pd.DataFrame,
    src: pd.DataFrame,
    possible_keys: Optional[List[List[str]]] = None,
) -> Optional[List[str]]:
    """Pick the columns used to join *src* onto *paper*.

    Args:
        paper: Analysis frame (column names are compared as-is).
        src: Source frame holding the raw variable.
        possible_keys: Ordered candidate key lists. Defaults to the
            module-level ``POSSIBLE_KEYS`` when omitted.

    Returns:
        The first candidate (lower-cased) whose columns all exist in both
        frames. Otherwise the first two shared columns, taken in *paper*
        column order. ``None`` when fewer than two columns are shared.
    """
    candidates = POSSIBLE_KEYS if possible_keys is None else possible_keys
    paper_cols = set(paper.columns)
    src_cols = set(src.columns)

    for keys in candidates:
        lowered = [key.lower() for key in keys]
        if all(key in paper_cols and key in src_cols for key in lowered):
            return lowered

    # Fallback: two shared columns. Walk the paper's columns in order (rather
    # than slicing a set intersection) so the choice is reproducible across runs.
    shared = [col for col in dict.fromkeys(paper.columns) if col in src_cols]
    if len(shared) >= 2:
        return shared[:2]
    return None

def aggregate_duplicates(df: pd.DataFrame, keys: List[str]) -> pd.DataFrame:
    """Collapse rows of *df* that share the same *keys*.

    When no key combination occurs more than once, *df* is returned
    unchanged (same object). Otherwise each non-key column is reduced per
    group: numeric columns to their mean, other columns to their first
    value. Missing key values form their own groups (``dropna=False``).
    """
    grouped = df.groupby(keys, dropna=False)
    largest_group = grouped.size().max()
    if largest_group <= 1:
        return df

    def agg(column):
        if pd.api.types.is_numeric_dtype(column):
            return column.mean()
        return column.iloc[0]

    collapsed = grouped.aggregate(agg)
    return collapsed.reset_index()

def select_best_raw(df_vars: pd.DataFrame, group: str) -> Optional[tuple]:
    """Pick the best raw (non-transformed) variable for *group*.

    Each candidate row is scored: +3 if the lower-cased name equals the
    group, +2 if it contains the group, +1 if the lower-cased relative path
    contains one of a fixed set of hint words, plus ``max(0, 20 - len(name))``
    so shorter names rank higher. Ties are broken by ``rel_path`` ascending.

    Returns:
        ``(rel_path, variable)`` of the top-scoring row, or ``None`` when the
        group has no raw candidate.
    """
    is_raw_in_group = (df_vars["group"] == group) & (~df_vars["looks_transformed"])
    pool = df_vars[is_raw_in_group].copy()
    if pool.empty:
        return None

    path_hints = ("panel", "city", "urban", "africa", "global", "main", "ready")

    def score(r):
        name = r["variable_lower"]
        path = r["rel_path"].lower()
        exact_bonus = 3 if name == group else 0
        contains_bonus = 2 if group in name else 0
        hint_bonus = 1 if any(h in path for h in path_hints) else 0
        brevity_bonus = max(0, 20 - len(name))
        return exact_bonus + contains_bonus + hint_bonus + brevity_bonus

    pool["score"] = pool.apply(score, axis=1)
    ranked = pool.sort_values(by=["score", "rel_path"], ascending=[False, True])
    best = ranked.iloc[0]
    return (best["rel_path"], best["variable"])  # relative path, variable name

# =========================
# 2) Per-folder scan + merge
# =========================
def scan_folder(root_sub: Path, out_dir: Path):
    """Scan every .dta file under *root_sub* and write variable inventories.

    Writes ``all_vars_scanned.csv`` (one row per file/variable) and
    ``variable_map.csv`` (raw candidates with a suggested transform) into
    *out_dir*, which is created if needed.

    Args:
        root_sub: Top-level folder to scan recursively.
        out_dir: Directory receiving the CSV outputs.

    Returns:
        ``(var_df, candidates, analysis)``: the full variable table, the
        raw-candidate subset, and the path chosen as the analysis file (or
        ``None``). Returns ``(None, None, None)`` when no variable was found.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    rows = []
    readable = []  # only files that could actually be read
    for dta in rglob_dta(root_sub):
        df = read_dta(dta)
        if df is None:
            continue
        readable.append(dta)
        rel = str(dta.relative_to(root_sub))
        for col in df.columns:
            clo = col.lower()
            rows.append({
                "folder": root_sub.name,
                "rel_path": rel,         # relative path, including sub-folders
                "file": dta.name,
                "variable": col,
                "variable_lower": clo,
                "group": detect_group(clo),
                "looks_transformed": looks_transformed(clo),
            })
    var_df = pd.DataFrame(rows)
    if var_df.empty:
        print(f"[WARN] no variables found in {root_sub}")
        return None, None, None

    # Save: full variable scan
    var_df.to_csv(out_dir / "all_vars_scanned.csv", index=False)

    # Raw candidates only, with the suggested transform attached
    candidates = var_df[(~var_df["looks_transformed"]) & var_df["group"].notna()].copy()
    candidates["suggested_transform"] = candidates["group"].map(SUGGESTED_TRANSFORMS)
    candidates.sort_values(["rel_path","group","variable"]).to_csv(out_dir / "variable_map.csv", index=False)

    # Choose the analysis file among readable files only; an unreadable file
    # (e.g. an unsupported Stata version) cannot serve as the merge base.
    analysis = pick_analysis_file(readable)
    return var_df, candidates, analysis

def merge_raw_into_analysis(root_sub: Path, analysis_path: Path, var_df: pd.DataFrame, out_dir: Path):
    """Left-join the best raw variable of each group onto the analysis file.

    For every group in ``GROUPS_TO_MERGE`` the best raw candidate is looked
    up in *var_df*, its source file is read, duplicates on the join keys are
    collapsed, and the variable is merged (``how="left"``, ``m:1``) onto the
    lower-cased analysis frame. A per-group log is written to
    ``merge_report.txt`` and the merged frame to ``<stem>_with_raw.dta``.

    Args:
        root_sub: Folder that ``rel_path`` values in *var_df* are relative to.
        analysis_path: Analysis .dta file, or ``None``.
        var_df: Variable table produced by the folder scan.
        out_dir: Output directory (created if needed).

    Returns:
        Path of the written .dta file, or ``None`` when there is no analysis
        file or it cannot be read.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    report_path = out_dir / "merge_report.txt"

    if analysis_path is None:
        report_path.write_text("No analysis .dta detected.", encoding="utf-8")
        return None

    raw_paper = read_dta(analysis_path)
    if raw_paper is None:
        # read_dta already printed the reason; record it instead of crashing.
        report_path.write_text(f"Analysis file could not be read: {analysis_path}", encoding="utf-8")
        return None

    merged = lower_cols(raw_paper)
    log_lines = [
        f"Analysis file: {analysis_path}",
        f"Rows(original): {len(merged)}",
    ]

    for group in GROUPS_TO_MERGE:
        sel = select_best_raw(var_df, group)
        if sel is None:
            log_lines.append(f"[{group}] raw candidate not found.")
            continue

        rel_src, varname = sel
        newcol = varname.lower()
        if newcol in merged.columns:
            # Merging again would only create _x/_y suffixed duplicates.
            log_lines.append(f"[{group}] var={varname} already present in analysis data. skipped.")
            continue

        src = read_dta(root_sub / rel_src)
        if src is None:
            log_lines.append(f"[{group}] cannot read source: {rel_src}")
            continue

        src_lo = lower_cols(src)
        keys = pick_join_keys(merged, src_lo)
        if not keys:
            log_lines.append(f"[{group}] join keys not found with {rel_src}. skipped.")
            continue

        use_cols = list(dict.fromkeys(keys + [newcol]))
        src_narrow = aggregate_duplicates(src_lo[use_cols].copy(), keys)

        try:
            merged = merged.merge(src_narrow, on=keys, how="left", validate="m:1")
        except ValueError as exc:
            # Covers pandas MergeError (a ValueError subclass) and key dtype mismatches.
            log_lines.append(f"[{group}] merge with {rel_src} failed: {exc}. skipped.")
            continue

        miss = int(merged[newcol].isna().sum())
        log_lines.append(f"[{group}] merged from {rel_src} var={varname} | keys={keys} | missing_after_join={miss}")

    # Write the report first so the log survives a failure in to_stata.
    report_path.write_text("\n".join(log_lines), encoding="utf-8")
    out_dta = out_dir / f"{analysis_path.stem}_with_raw.dta"
    merged.to_stata(out_dta, write_index=False, version=118)
    return out_dta

# =========================
# 3) Main loop
# =========================
def main():
    """Scan each sub-folder of ``ROOT_DIR``, merge raw variables, and write a summary.

    One output directory per sub-folder is created under ``OUTPUT_DIR``;
    a ``master_summary.csv`` with one row per sub-folder is written at the end.
    """
    root = Path(ROOT_DIR)
    out_root = Path(OUTPUT_DIR)
    out_root.mkdir(parents=True, exist_ok=True)

    master_rows = []

    subfolders = sorted(
        (p for p in root.iterdir() if p.is_dir()),
        key=lambda p: p.name.lower(),
    )
    for sub in subfolders:
        print(f"\n=== {sub.name} ===")
        out_dir = out_root / sub.name
        var_df, cand_df, analysis = scan_folder(sub, out_dir)
        if var_df is None:
            master_rows.append({"folder": sub.name, "status": "no_dta"})
            continue

        out_dta = None
        status = "ok" if analysis else "no_analysis_detected"
        if analysis:
            try:
                out_dta = merge_raw_into_analysis(sub, analysis, var_df, out_dir)
            except Exception as exc:
                # Batch boundary: one folder's failure must not abort the
                # remaining folders or lose the master summary.
                print(f"[WARN] merge failed for {sub.name}: {exc}")
                status = "merge_failed"

        master_rows.append({
            "folder": sub.name,
            # Count by relative path: files in different sub-folders may share a name.
            "scanned_files": var_df["rel_path"].nunique(),
            "raw_candidates": 0 if cand_df is None else len(cand_df),
            "analysis_file": analysis.name if analysis else "",
            "output_dta": "" if out_dta is None else str(out_dta),
            "status": status,
        })

    pd.DataFrame(master_rows).to_csv(out_root / "master_summary.csv", index=False)
    print("\n[DONE] See:", out_root)

if __name__ == "__main__":
    main()



=== Arora_et_al_2021_Data ===

=== Azoulay_et_al_2019_Data ===


One or more strings in the dta file could not be decoded using utf-8, and
so the fallback encoding of latin-1 is being used.  This can happen when a file
has been incorrectly encoded by Stata or some other software. You should verify
the string values returned are correct.
  return pd.read_stata(p, convert_categoricals=False)
One or more strings in the dta file could not be decoded using utf-8, and
so the fallback encoding of latin-1 is being used.  This can happen when a file
has been incorrectly encoded by Stata or some other software. You should verify
the string values returned are correct.
  return pd.read_stata(p, convert_categoricals=False)



=== Berkouwer_and_Dean_2022_Data ===

=== Cao_and_Chen_2022_Data ===

=== Carranza_et_al_2022_Data ===

=== Chen_and_Roth_2023_Data ===
[WARN] no variables found in C:\Users\starw\OneDrive\바탕 화면\dta\Chen_and_Roth_2023_Data

=== Fetzer_et_al_2021_Data ===
[WARN] read fail: C:\Users\starw\OneDrive\바탕 화면\dta\Fetzer_et_al_2021_Data\REPOSITORY\PROCESSING_FILES\DATA_INPUTS\DEPLOYMENT\ISAFSTRENGTH.dta | Version of given Stata file is 110. pandas supports importing versions 105, 108, 111 (Stata 7SE), 113 (Stata 8/9), 114 (Stata 10/11), 115 (Stata 12), 117 (Stata 13), 118 (Stata 14/15/16),and 119 (Stata 15/16, over 32,767 variables).
[WARN] read fail: C:\Users\starw\OneDrive\바탕 화면\dta\Fetzer_et_al_2021_Data\REPOSITORY\PROCESSING_FILES\DATA_INPUTS\DISTANCE_MEASURES\NEARROADS.dta | Version of given Stata file is 110. pandas supports importing versions 105, 108, 111 (Stata 7SE), 113 (Stata 8/9), 114 (Stata 10/11), 115 (Stata 12), 117 (Stata 13), 118 (Stata 14/15/16),and 119 (Stata 15/16, over 32,

One or more strings in the dta file could not be decoded using utf-8, and
so the fallback encoding of latin-1 is being used.  This can happen when a file
has been incorrectly encoded by Stata or some other software. You should verify
the string values returned are correct.
  return pd.read_stata(p, convert_categoricals=False)
One or more strings in the dta file could not be decoded using utf-8, and
so the fallback encoding of latin-1 is being used.  This can happen when a file
has been incorrectly encoded by Stata or some other software. You should verify
the string values returned are correct.
  return pd.read_stata(p, convert_categoricals=False)



[DONE] See: C:\Users\starw\OneDrive\바탕 화면\dta_output


In [6]:
import pandas as pd
from pathlib import Path
import warnings

warnings.simplefilter("ignore", FutureWarning)

# ======== User paths, set directly ========
ROOT_DIR = Path(r"C:\Users\starw\OneDrive\바탕 화면\dta")          # path containing the 11 top-level folders
OUTPUT_DIR = Path(r"C:\Users\starw\OneDrive\바탕 화면\dta_output")  # path where results are saved
# ====================================

# Keyword group definitions (matched as substrings of lower-cased variable names)
KEYWORD_GROUPS = {
    "internet": ["internet", "fastinternet", "broadband"],
    "earnings": ["earnings", "income", "wage", "salary"],
    "hours": ["hours", "hrs", "time_worked"],
    "conflict": ["incident", "conflict", "killings"],
    "pubs": ["publication", "citations", "patent"],
    "rebellion": ["rebellion", "rebel"],
    "violations": ["violation", "penalty", "fine"],
    "cluster": ["cluster_size"],
}

# Suggested transforms (descriptive text only)
SUGGESTED_TRANSFORMS = {
    "internet": "loginternet = log(1+internet) OR asinh(internet)",
    "earnings": "asinh(earnings)",
    "hours": "asinh(hours)",
    "conflict": "log(1+incidents/killings)",
    "pubs": "log(publications), log(citations)",
    "rebellion": "asinh(rebellions per capita)",
    "violations": "log(violations), log(penalties)",
    "cluster": "log(cluster size)",
}

def detect_group(varname: str):
    """Return the first group whose keyword occurs in the lower-cased *varname*, else ``None``."""
    lowered = varname.lower()
    for group_name, keywords in KEYWORD_GROUPS.items():
        for keyword in keywords:
            if keyword in lowered:
                return group_name
    return None

def scan_folder(folder: Path):
    """Search *folder* and all of its sub-folders for .dta files.

    Returns a DataFrame with one row per variable whose name matches a
    keyword group. Files that fail to read are reported on stdout and skipped.
    """
    records = []
    for f in folder.rglob("*.dta"):   # recurse through every sub-folder
        try:
            frame = pd.read_stata(f, convert_categoricals=False)
            for column in frame.columns:
                group = detect_group(column)
                if not group:
                    continue
                record = {
                    "Paper": folder.name,          # top-level folder name (one per paper)
                    "Subfolder": str(f.parent),   # directory that holds the file
                    "File": f.name,
                    "Original Variable(s)": column,
                    "Transformed Variable(s)": SUGGESTED_TRANSFORMS.get(group, ""),
                }
                records.append(record)
        except Exception as e:
            print(f"[WARN] {f} 읽기 실패: {e}")
    return pd.DataFrame(records)

def main():
    """Scan every top-level folder, write one CSV per paper and a combined Excel table."""
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    per_paper = []

    for folder in sorted(ROOT_DIR.iterdir()):
        if not folder.is_dir():
            continue
        print(f"▶ {folder.name} 스캔 중...")
        table = scan_folder(folder)
        if table.empty:
            continue
        table.to_csv(OUTPUT_DIR / f"{folder.name}_variable_map.csv", index=False)
        per_paper.append(table)

    if not per_paper:
        print("\n[INFO] 원시 변수 후보를 찾지 못했습니다.")
        return

    final = pd.concat(per_paper, ignore_index=True)
    final.to_excel(OUTPUT_DIR / "all_papers_variable_map.xlsx", index=False)
    print(f"\n[DONE] 최종 요약표 저장 완료 → {OUTPUT_DIR/'all_papers_variable_map.xlsx'}")

if __name__ == "__main__":
    main()


▶ Arora_et_al_2021_Data 스캔 중...
▶ Azoulay_et_al_2019_Data 스캔 중...


One or more strings in the dta file could not be decoded using utf-8, and
so the fallback encoding of latin-1 is being used.  This can happen when a file
has been incorrectly encoded by Stata or some other software. You should verify
the string values returned are correct.
  df = pd.read_stata(f, convert_categoricals=False)


▶ Berkouwer_and_Dean_2022_Data 스캔 중...
▶ Cao_and_Chen_2022_Data 스캔 중...
▶ Carranza_et_al_2022_Data 스캔 중...
▶ Chen_and_Roth_2023_Data 스캔 중...
▶ Fetzer_et_al_2021_Data 스캔 중...
[WARN] C:\Users\starw\OneDrive\바탕 화면\dta\Fetzer_et_al_2021_Data\REPOSITORY\PROCESSING_FILES\DATA_INPUTS\DEPLOYMENT\ISAFSTRENGTH.dta 읽기 실패: Version of given Stata file is 110. pandas supports importing versions 105, 108, 111 (Stata 7SE), 113 (Stata 8/9), 114 (Stata 10/11), 115 (Stata 12), 117 (Stata 13), 118 (Stata 14/15/16),and 119 (Stata 15/16, over 32,767 variables).
[WARN] C:\Users\starw\OneDrive\바탕 화면\dta\Fetzer_et_al_2021_Data\REPOSITORY\PROCESSING_FILES\DATA_INPUTS\DISTANCE_MEASURES\NEARROADS.dta 읽기 실패: Version of given Stata file is 110. pandas supports importing versions 105, 108, 111 (Stata 7SE), 113 (Stata 8/9), 114 (Stata 10/11), 115 (Stata 12), 117 (Stata 13), 118 (Stata 14/15/16),and 119 (Stata 15/16, over 32,767 variables).
[WARN] C:\Users\starw\OneDrive\바탕 화면\dta\Fetzer_et_al_2021_Data\REPOSITORY\PRO

One or more strings in the dta file could not be decoded using utf-8, and
so the fallback encoding of latin-1 is being used.  This can happen when a file
has been incorrectly encoded by Stata or some other software. You should verify
the string values returned are correct.
  df = pd.read_stata(f, convert_categoricals=False)



[DONE] 최종 요약표 저장 완료 → C:\Users\starw\OneDrive\바탕 화면\dta_output\all_papers_variable_map.xlsx


In [7]:
pip install pyreadstat

Collecting pyreadstat
  Downloading pyreadstat-1.3.1.tar.gz (610 kB)
     ---------------------------------------- 0.0/610.8 kB ? eta -:--:--
     -------------------------------------- 610.8/610.8 kB 7.4 MB/s eta 0:00:00
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting narwhals>=2.0 (from pyreadstat)
  Downloading narwhals-2.5.0-py3-none-any.whl.metadata (11 kB)
Downloading narwhals-2.5.0-py3-none-any.whl (407 kB)
Building wheels for collected packages: pyreadstat
  Building wheel for pyreadstat (pyproject.toml): started
  Building wheel for pyreadstat (pyproject.toml): finished with status 'error'
Failed to build pyreadstat
Note: you may need to restart the kernel to use updated packages.


  error: subprocess-exited-with-error
  
  × Building wheel for pyreadstat (pyproject.toml) did not run successfully.
  │ exit code: 1
  ╰─> [4 lines of output]
      [1/3] Cythonizing pyreadstat/_readstat_parser.pyx
      [2/3] Cythonizing pyreadstat/_readstat_writer.pyx
      [3/3] Cythonizing pyreadstat/pyreadstat.pyx
      error: Microsoft Visual C++ 14.0 or greater is required. Get it with "Microsoft C++ Build Tools": https://visualstudio.microsoft.com/visual-cpp-build-tools/
      [end of output]
  
  note: This error originates from a subprocess, and is likely not a problem with pip.
  ERROR: Failed building wheel for pyreadstat

[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip
ERROR: Failed to build installable wheels for some pyproject.toml based projects (pyreadstat)


In [8]:
import pandas as pd
from pathlib import Path
import warnings, traceback, datetime

warnings.simplefilter("ignore", FutureWarning)

# ======== User paths ========
ROOT_DIR = Path(r"C:\Users\starw\OneDrive\바탕 화면\dta")          # path containing the 11 top-level folders (including their sub-folders)
OUTPUT_DIR = Path(r"C:\Users\starw\OneDrive\바탕 화면\dta_output")  # path where results are saved
# ===========================

# Check whether pyreadstat is available; any import failure just disables it
try:
    import pyreadstat
    HAS_PYREADSTAT = True
except Exception:
    HAS_PYREADSTAT = False

# Keyword groups (for detecting raw variables)
KEYWORD_GROUPS = {
    "internet": ["internet", "fastinternet", "broadband", "speed", "kbps", "mbps", "users", "usage", "penetration"],
    "earnings": ["earnings", "income", "revenue", "wage", "wages", "salary", "pay"],
    "hours": ["hours", "hrs", "time_worked"],
    "conflict": ["incident", "incidents", "conflict", "sigacts", "killings", "victims", "ied", "direct_fire", "casualty"],
    "pubs": ["publication", "publications", "paper", "papers", "patent", "patents", "citation", "citations"],
    "rebellion": ["rebellion", "rebellions", "rebel", "uprising"],
    "violations": ["violation", "violations", "penalty", "penalties", "fine", "fines"],
    "cluster": ["cluster_size", "field_size", "peer_count", "inventors"],
}

# Suggested transforms (for reporting only; nothing is actually computed)
SUGGESTED_TRANSFORMS = {
    "internet": "loginternet = log(1+internet) OR asinh(internet)",
    "earnings": "asinh(earnings) / asinh(wages)",
    "hours": "asinh(hours)",
    "conflict": "log(1+incidents/killings)",
    "pubs": "log(publications) / log(citations)",
    "rebellion": "asinh(rebellions per capita)",
    "violations": "log(violations) / log(penalties)",
    "cluster": "log(cluster size)",
}

def log_error(log_path: Path, file_path: Path, err: Exception):
    """Append a timestamped entry for *err* (raised while handling *file_path*) to *log_path*.

    The log's parent directory is created if missing. Each entry is a
    ``[timestamp] path`` line, the exception summary, and a blank line.
    """
    log_path.parent.mkdir(parents=True, exist_ok=True)
    with open(log_path, "a", encoding="utf-8") as handle:
        stamp = datetime.datetime.now()
        summary = "".join(traceback.format_exception_only(type(err), err))
        handle.write(f"[{stamp}] {file_path}\n" + summary + "\n")

def detect_group(varname: str):
    """Map *varname* to the first keyword group it matches (case-insensitive substring), or ``None``."""
    needle_space = varname.lower()
    matches = (
        group_name
        for group_name, keywords in KEYWORD_GROUPS.items()
        if any(keyword in needle_space for keyword in keywords)
    )
    return next(matches, None)

def get_columns_from_dta(file_path: Path, error_log: Path):
    """Get the column names of a .dta file as safely as possible.

    Strategies are tried in order:
      1) pyreadstat (when importable) with ``row_limit=1``; column names are
         taken from ``meta.column_names`` when available.
      2) ``pandas.read_stata(chunksize=1)``, reading only the first chunk.
      3) ``pandas.read_stata`` on the whole file, as a last attempt.

    Every failed attempt is appended to *error_log*, including attempts that
    a later strategy recovers from.

    Returns:
        The list of column names, or ``None`` when every strategy failed.
    """
    # 1) pyreadstat, if available
    if HAS_PYREADSTAT:
        try:
            # Keep memory minimal: read a single row
            df, meta = pyreadstat.read_dta(str(file_path), row_limit=1, apply_value_formats=False)
            cols = meta.column_names if meta and meta.column_names else list(df.columns)
            return cols
        except Exception as e:
            log_error(error_log, file_path, e)
            # fall through and retry with the next strategy

    # 2) pandas: lightweight read with chunksize=1
    try:
        # Use the reader as a context manager so the file handle is closed
        # even though only the first chunk is consumed.
        with pd.read_stata(file_path, convert_categoricals=False, chunksize=1) as reader:
            first = next(iter(reader))
        return list(first.columns)
    except Exception as e:
        log_error(error_log, file_path, e)

    # 3) pandas: full read (last attempt)
    try:
        df = pd.read_stata(file_path, convert_categoricals=False)
        return list(df.columns)
    except Exception as e:
        log_error(error_log, file_path, e)
        return None

def scan_one_top_folder(top_folder: Path, out_dir: Path, error_log: Path) -> pd.DataFrame:
    """Collect keyword-matching .dta columns under one top-level (per-paper) folder.

    Walks every sub-folder with ``rglob`` and returns a DataFrame with the
    columns
    Paper | Subfolder | File | Original Variable(s) | Transformed Variable(s).
    Files whose columns cannot be obtained are reported on stdout and skipped.
    *out_dir* is accepted but not used here.
    """
    records = []
    for f in top_folder.rglob("*.dta"):
        column_names = get_columns_from_dta(f, error_log)
        if not column_names:
            print(f"[WARN] 못 읽음: {f}")
            continue
        location = str(f.parent)  # directory that actually holds the file
        for column in column_names:
            group = detect_group(column)
            if not group:
                continue
            records.append({
                "Paper": top_folder.name,       # top-level folder (the paper)
                "Subfolder": location,
                "File": f.name,
                "Original Variable(s)": column,
                "Transformed Variable(s)": SUGGESTED_TRANSFORMS.get(group, ""),
            })
    return pd.DataFrame(records)

def main():
    """Scan every top-level folder, write per-paper CSVs and one combined Excel table.

    The error log from a previous run is removed first so that its presence
    afterwards reflects this run only.
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    error_log = OUTPUT_DIR / "read_errors.log"
    # reset the log from any previous run
    if error_log.exists():
        error_log.unlink()

    all_tables = []
    top_level = sorted(ROOT_DIR.iterdir(), key=lambda p: p.name.lower())
    for folder in (entry for entry in top_level if entry.is_dir()):
        print(f"▶ 스캔 중: {folder.name}")
        table = scan_one_top_folder(folder, OUTPUT_DIR, error_log)
        if table.empty:
            continue
        table.to_csv(OUTPUT_DIR / f"{folder.name}_variable_map.csv", index=False, encoding="utf-8-sig")
        all_tables.append(table)

    if not all_tables:
        print("\n[INFO] 스캔 결과가 비었습니다. 경로/권한/파일 손상 여부를 확인하세요.")
        if error_log.exists():
            print(f"[INFO] 에러 로그: {error_log}")
        return

    final = pd.concat(all_tables, ignore_index=True)
    final.to_excel(OUTPUT_DIR / "all_papers_variable_map.xlsx", index=False)
    print(f"\n[DONE] 통합표 저장 완료 → {OUTPUT_DIR/'all_papers_variable_map.xlsx'}")
    if error_log.exists():
        print(f"[INFO] 일부 파일은 다른 방식으로도 실패했습니다. 로그 확인: {error_log}")

if __name__ == "__main__":
    main()


▶ 스캔 중: Arora_et_al_2021_Data
▶ 스캔 중: Azoulay_et_al_2019_Data
▶ 스캔 중: Berkouwer_and_Dean_2022_Data


One or more strings in the dta file could not be decoded using utf-8, and
so the fallback encoding of latin-1 is being used.  This can happen when a file
has been incorrectly encoded by Stata or some other software. You should verify
the string values returned are correct.
  first = next(iter(reader))


▶ 스캔 중: Cao_and_Chen_2022_Data
▶ 스캔 중: Carranza_et_al_2022_Data
▶ 스캔 중: Chen_and_Roth_2023_Data
▶ 스캔 중: Fetzer_et_al_2021_Data
[WARN] 못 읽음: C:\Users\starw\OneDrive\바탕 화면\dta\Fetzer_et_al_2021_Data\REPOSITORY\PROCESSING_FILES\DATA_INPUTS\DEPLOYMENT\ISAFSTRENGTH.dta
[WARN] 못 읽음: C:\Users\starw\OneDrive\바탕 화면\dta\Fetzer_et_al_2021_Data\REPOSITORY\PROCESSING_FILES\DATA_INPUTS\DISTANCE_MEASURES\NEARROADS.dta
[WARN] 못 읽음: C:\Users\starw\OneDrive\바탕 화면\dta\Fetzer_et_al_2021_Data\REPOSITORY\PROCESSING_FILES\DATA_INPUTS\DISTANCE_MEASURES\TRAVELDISTNEARMILAIRPORT.dta
▶ 스캔 중: Hjort_and_Poulsen_2019_Data
▶ 스캔 중: Johnson_2020_Data
▶ 스캔 중: Moretti_2021_Data
▶ 스캔 중: Rogall_2021_Data

[DONE] 통합표 저장 완료 → C:\Users\starw\OneDrive\바탕 화면\dta_output\all_papers_variable_map.xlsx
[INFO] 일부 파일은 다른 방식으로도 실패했습니다. 로그 확인: C:\Users\starw\OneDrive\바탕 화면\dta_output\read_errors.log
