In [9]:
import os
import pandas as pd
from collections import defaultdict

# Relative paths to the participant table (TSV) and the metadata CSV.
# NOTE(review): neither name is referenced by the cells visible in this notebook,
# which re-type the same paths as string literals — confirm whether they are still needed.
participate = "./data/participants.tsv"
meta = "./meta_data.csv"


In [3]:
root_dir = "./fmri_pooled_features"


def _sorted_subdirs(parent):
    """Yield (name, path) for every directory directly under `parent`, in sorted name order."""
    for name in sorted(os.listdir(parent)):
        path = os.path.join(parent, name)
        if os.path.isdir(path):
            yield name, path


# Walk the subject / task / run hierarchy (e.g. sub-001 / pieman /
# sub-001_task-pieman_run-1_bold) and record how many .pt files each run folder holds.
# The run path is stored as well so later steps can locate the folder directly.
records = [
    {
        "subject_id": subject,
        "task": task,
        "run_folder": run,
        "frame_count": sum(1 for f in os.listdir(run_path) if f.endswith(".pt")),
        "run_path": run_path,
    }
    for subject, subject_path in _sorted_subdirs(root_dir)
    for task, task_path in _sorted_subdirs(subject_path)
    for run, run_path in _sorted_subdirs(task_path)
]

# Build the DataFrame and save it.
df = pd.DataFrame(records)
df.to_csv("fmri_pooled_frame_counts.csv", index=False)

print("저장 완료: fmri_pooled_frame_counts.csv")

저장 완료: fmri_pooled_frame_counts.csv


In [10]:
fmri_root = "./fmri_pooled_features"
meta_path = "./meta_data.csv"

# Load the metadata and index it by file name once, instead of re-filtering the
# whole table for every run folder. If a file name appears more than once, the
# first row wins (the same row a boolean filter followed by `.iloc[0]` would pick).
meta_df = pd.read_csv(meta_path)
meta_by_file = meta_df.drop_duplicates(subset="file", keep="first").set_index("file")


def _story_stims(trial_type_field, stim_file_field):
    """Return the story stimulus names, in order, from two comma-separated fields.

    Both fields are converted with `str()` and split on commas; entries are paired
    by position. A stimulus is kept only when its trial type is exactly "story" and
    its name is not the literal "nan" (what `str()` produces for a missing value).
    The "_audio.wav" suffix is removed from each kept name.
    """
    trial_types = [t.strip() for t in str(trial_type_field).split(",")]
    stim_files = [s.strip() for s in str(stim_file_field).split(",")]
    # zip() stops at the shorter list, so a trial type without a matching
    # stimulus entry is skipped rather than raising IndexError.
    return [
        stim.replace("_audio.wav", "")
        for t_type, stim in zip(trial_types, stim_files)
        if t_type == "story" and stim.lower() != "nan"
    ]


# One record per run folder, keyed by (subject, task, run_folder).
records_dict = {}

# Walk the subject / task / run hierarchy. Every level is sorted so the row
# order of the output CSV does not depend on filesystem listing order.
for subject in sorted(os.listdir(fmri_root)):
    subject_path = os.path.join(fmri_root, subject)
    if not os.path.isdir(subject_path):
        continue

    for task in sorted(os.listdir(subject_path)):
        task_path = os.path.join(subject_path, task)
        if not os.path.isdir(task_path):
            continue

        for run_folder in sorted(os.listdir(task_path)):
            run_path = os.path.join(task_path, run_folder)
            if not os.path.isdir(run_path):
                continue

            # Match the run folder to its metadata row; runs without one are skipped.
            nii_filename = f"{run_folder}.nii.gz"
            if nii_filename not in meta_by_file.index:
                continue
            meta_row = meta_by_file.loc[nii_filename]

            # Number of frames = number of .pt files in the run folder.
            frame_count = sum(1 for f in os.listdir(run_path) if f.endswith(".pt"))

            # "stim_files" is inserted first so it stays the first CSV column.
            records_dict[(subject, task, run_folder)] = {
                "stim_files": ",".join(
                    _story_stims(meta_row["trial_type"], meta_row["stim_file"])
                ),
                "subject_id": subject,
                "task": task,
                "run_folder": run_folder,
                "frame_count": frame_count,
                "run_path": run_path,
            }

# Convert to a list of records.
records = list(records_dict.values())

# Save.
df = pd.DataFrame(records)
df.to_csv("fmri_story_frame_counts_grouped.csv", index=False)
print("✅ 저장 완료: fmri_story_frame_counts_grouped.csv")

✅ 저장 완료: fmri_story_frame_counts_grouped.csv


In [11]:
# Load the grouped result written earlier.
df = pd.read_csv("fmri_story_frame_counts_grouped.csv")


def _parse_stim_files(value):
    """Split a comma-separated `stim_files` cell into a list of stimulus names.

    `pd.read_csv` returns NaN (a float) for an empty cell, so any non-string value
    yields an empty list instead of failing on `.split`. Blank names are dropped:
    joining an empty name onto the text-feature root would point at the root itself.
    """
    if not isinstance(value, str):
        return []
    return [name for name in (part.strip() for part in value.split(",")) if name]


def _text_feature_counts(stims, text_root="./text_feature"):
    """Count the .pt files in each stimulus folder under `text_root`.

    Returns a `(counts, paths)` pair of strings: counts joined with "," and paths
    joined with ", ". A stimulus whose folder does not exist (or is not a
    directory) contributes "0" and "MISSING".
    """
    counts = []
    paths = []
    for stim in stims:
        text_dir = os.path.join(text_root, stim)
        # isdir rather than exists: listing a regular file would raise.
        if os.path.isdir(text_dir):
            n_files = sum(1 for f in os.listdir(text_dir) if f.endswith(".pt"))
            counts.append(str(n_files))
            paths.append(text_dir)
        else:
            counts.append("0")
            paths.append("MISSING")
    return ",".join(counts), ", ".join(paths)


# Column values to add, one entry per row of `df`.
text_frame_list = []
text_path_list = []

for stim_cell in df["stim_files"]:
    frames, paths = _text_feature_counts(_parse_stim_files(stim_cell))
    text_frame_list.append(frames)
    text_path_list.append(paths)

# Add the columns.
df["text_frame"] = text_frame_list
df["text_path"] = text_path_list

# Save.
df.to_csv("fmri_story_frame_counts_with_text.csv", index=False)
print("✅ 저장 완료: fmri_story_frame_counts_with_text.csv")

✅ 저장 완료: fmri_story_frame_counts_with_text.csv


In [17]:
# Load the grouped CSV written earlier.
df = pd.read_csv("fmri_story_frame_counts_grouped.csv")

# NOTE(review): the following cell reads "fmri_story_frame_counts_with_text.csv",
# not this file — confirm which of the two is meant to feed the rest of the pipeline.
output_csv = "dataload.csv"


def _unique_stims(value):
    """Split a comma-separated `stim_files` cell into unique names, keeping first-seen order.

    `pd.read_csv` returns NaN (a float) for an empty cell, so any non-string value
    yields an empty list instead of failing on `.split`. Blank names are dropped:
    joining an empty name onto the text-feature root would point at the root itself.
    """
    if not isinstance(value, str):
        return []
    names = (part.strip() for part in value.split(","))
    # dict.fromkeys removes duplicates while preserving insertion order.
    return list(dict.fromkeys(name for name in names if name))


def _count_text_frames(stims, text_root="./text_feature"):
    """Count the .pt files in each stimulus folder under `text_root`.

    Returns a `(counts, paths)` pair of strings: counts joined with "," and paths
    joined with ", ". A stimulus whose folder does not exist (or is not a
    directory) contributes "0" and "MISSING".
    """
    counts = []
    paths = []
    for stim in stims:
        text_dir = os.path.join(text_root, stim)
        # isdir rather than exists: listing a regular file would raise.
        if os.path.isdir(text_dir):
            n_files = sum(1 for f in os.listdir(text_dir) if f.endswith(".pt"))
            counts.append(str(n_files))
            paths.append(text_dir)
        else:
            counts.append("0")
            paths.append("MISSING")
    return ",".join(counts), ", ".join(paths)


# New column values, one entry per row of `df`.
text_frame_list = []
text_path_list = []
stim_files_clean = []

for stim_cell in df["stim_files"]:
    stim_list = _unique_stims(stim_cell)
    frames, paths = _count_text_frames(stim_list)

    text_frame_list.append(frames)
    text_path_list.append(paths)
    stim_files_clean.append(",".join(stim_list))

# Replace / add the updated columns.
df["stim_files"] = stim_files_clean
df["text_frame"] = text_frame_list
df["text_path"] = text_path_list

# Save, and report the file that was actually written.
df.to_csv(output_csv, index=False)
print(f"✅ 중복 제거 및 저장 완료: {output_csv}")

✅ 중복 제거 및 저장 완료: fmri_story_frame_counts_with_text.csv


In [22]:
# Fallback values used when a participant, task entry, or age cannot be resolved.
DEFAULT_AGE = 25
DEFAULT_SEX = "Unknown"
MISSING_VALUE = "n/a"

# Load the data.
df_story = pd.read_csv("fmri_story_frame_counts_with_text.csv")
df_participant = pd.read_csv("./data/participants.tsv", sep="\t")


def _split_field(value):
    """Split a comma-separated field into stripped string entries."""
    return [part.strip() for part in str(value).split(",")]


def _resolve_demographics(part_row, task):
    """Return `(age, sex)` for `task` from one participant row.

    The row's "task", "age" and "sex" fields are comma-separated lists that are
    paired by position. If the task is not listed, or the age/sex lists are
    shorter than the task list, age is treated as "n/a" and sex is "Unknown".
    An "n/a" age falls back to the first non-"n/a" age in the same row, then to
    DEFAULT_AGE. The age is always returned as an int; an unparsable value
    (including NaN) becomes DEFAULT_AGE.
    """
    tasks = _split_field(part_row["task"])
    ages = _split_field(part_row["age"])
    sexes = _split_field(part_row["sex"])

    try:
        task_idx = tasks.index(task)
        age_val, sex_val = ages[task_idx], sexes[task_idx]
    except (ValueError, IndexError):
        # ValueError: task not listed; IndexError: age/sex list too short.
        age_val, sex_val = MISSING_VALUE, DEFAULT_SEX

    if age_val.lower() == MISSING_VALUE:
        # Look for another valid age recorded for the same subject.
        valid_ages = [a for a in ages if a.lower() != MISSING_VALUE]
        age_val = valid_ages[0] if valid_ages else str(DEFAULT_AGE)

    try:
        # Go through float first so values such as "23.0" are accepted.
        age = int(float(age_val))
    except (ValueError, OverflowError):
        # Non-numeric text and NaN raise ValueError; infinity raises OverflowError.
        age = DEFAULT_AGE

    return age, sex_val


# Result lists, one entry per row of `df_story`.
age_list = []
sex_list = []

for subj, task in zip(df_story["subject_id"], df_story["task"]):
    # Find this subject in the participant table.
    part_rows = df_participant[df_participant["participant_id"] == subj]
    if part_rows.empty:
        # Same int type as every other path, so the column is not mixed str/int.
        age_list.append(DEFAULT_AGE)
        sex_list.append(DEFAULT_SEX)
        continue

    age, sex = _resolve_demographics(part_rows.iloc[0], task)
    age_list.append(age)
    sex_list.append(sex)

# Add the columns.
df_story["age"] = age_list
df_story["sex"] = sex_list

# Save.
df_story.to_csv("fmri_story_frame_counts_with_text_and_demo.csv", index=False)
print("✅ 나이 및 성별 추가 완료: fmri_story_frame_counts_with_text_and_demo.csv")

✅ 나이 및 성별 추가 완료: fmri_story_frame_counts_with_text_and_demo.csv


In [27]:
import pandas as pd

# The per-run table produced by the earlier cells of this notebook.
df_story = pd.read_csv("fmri_story_frame_counts_with_text_and_demo.csv")

# Split and rating information.
df_info = pd.read_csv("data_info.csv")

# Left-join on the `stim_files` column, then fill rows that found no match:
# split defaults to "train", ground_truth to "15" (stored as a string).
df_merged = df_story.merge(df_info, on="stim_files", how="left")
df_merged = df_merged.assign(
    split=df_merged["split"].fillna("train"),
    ground_truth=df_merged["ground_truth"].fillna("15").astype(str),
)

print(df_merged["ground_truth"].unique())  # e.g. ['15' '19' 'all']
print(df_merged["ground_truth"].dtype)     # object (i.e. string)

# Save.
df_merged.to_csv("final_fmri_data_with_split_rating.csv", index=False)
print("✅ 최종 병합 완료: final_fmri_data_with_split_rating.csv")

['19' 'all' '15']
object
✅ 최종 병합 완료: final_fmri_data_with_split_rating.csv


In [30]:

# Load the merged table.
df = pd.read_csv("final_fmri_data_with_split_rating.csv")


def _frames_match(record):
    """Return True when the comma-separated `text_frame` counts sum to `frame_count`.

    Raises whatever `int()` raises if either field cannot be parsed.
    """
    expected = int(record["frame_count"])
    per_stim = [int(token.strip()) for token in str(record["text_frame"]).split(",")]
    return sum(per_stim) == expected


# Index labels of rows that are inconsistent or could not be checked.
mismatch_rows = []

for idx, row in df.iterrows():
    try:
        consistent = _frames_match(row)
    except Exception as e:
        # A row that cannot be parsed is reported and treated as a mismatch.
        print(f"⚠️ 에러 (index={idx}): {e}")
        consistent = False
    if not consistent:
        mismatch_rows.append(idx)

# Report and save the mismatched rows.
mismatched_df = df.loc[mismatch_rows]
print(f"⚠️ 총 {len(mismatched_df)}개의 불일치 행이 발견되었습니다.")
mismatched_df.to_csv("missmatched_data.csv", index=False)

# Everything else is kept, re-indexed from zero, as the loader table.
valid_df = df.drop(index=mismatch_rows).reset_index(drop=True)
valid_df.to_csv("dataloader.csv", index=False)

⚠️ 총 14개의 불일치 행이 발견되었습니다.


In [16]:
# Display the rows flagged as mismatched (`mismatched_df` is built by the consistency-check cell).
# NOTE(review): this cell's execution count (16) is lower than that of the cell that
# builds `mismatched_df` (30), so the rendered table may be stale — re-run to confirm.
mismatched_df

Unnamed: 0,stim_files,subject_id,task,run_folder,frame_count,run_path,text_frame,text_path
162,"vinny,upintheair",sub-075,schema,sub-075_task-schema_run-3_bold,12,./fmri_pooled_features/sub-075/schema/sub-075_...,56,"./text_feature/vinny, ./text_feature/upintheair"
196,"vinny,upintheair",sub-089,schema,sub-089_task-schema_run-1_bold,12,./fmri_pooled_features/sub-089/schema/sub-089_...,56,"./text_feature/vinny, ./text_feature/upintheair"
247,"vinny,upintheair",sub-109,schema,sub-109_task-schema_run-3_bold,12,./fmri_pooled_features/sub-109/schema/sub-109_...,56,"./text_feature/vinny, ./text_feature/upintheair"
380,"vinny,upintheair",sub-171,schema,sub-171_task-schema_run-1_bold,12,./fmri_pooled_features/sub-171/schema/sub-171_...,56,"./text_feature/vinny, ./text_feature/upintheair"
414,"vinny,upintheair",sub-183,schema,sub-183_task-schema_run-3_bold,12,./fmri_pooled_features/sub-183/schema/sub-183_...,56,"./text_feature/vinny, ./text_feature/upintheair"
420,"vinny,upintheair",sub-184,schema,sub-184_task-schema_run-2_bold,12,./fmri_pooled_features/sub-184/schema/sub-184_...,56,"./text_feature/vinny, ./text_feature/upintheair"
429,"vinny,upintheair",sub-186,schema,sub-186_task-schema_run-4_bold,12,./fmri_pooled_features/sub-186/schema/sub-186_...,56,"./text_feature/vinny, ./text_feature/upintheair"
434,"vinny,upintheair",sub-187,schema,sub-187_task-schema_run-1_bold,12,./fmri_pooled_features/sub-187/schema/sub-187_...,56,"./text_feature/vinny, ./text_feature/upintheair"
461,"vinny,upintheair",sub-193,schema,sub-193_task-schema_run-2_bold,12,./fmri_pooled_features/sub-193/schema/sub-193_...,56,"./text_feature/vinny, ./text_feature/upintheair"
469,"vinny,upintheair",sub-195,schema,sub-195_task-schema_run-2_bold,12,./fmri_pooled_features/sub-195/schema/sub-195_...,56,"./text_feature/vinny, ./text_feature/upintheair"
