In [8]:
import os
import re
from pathlib import Path
import pandas as pd

BASE = Path(r"E:\WSM\depression")
LIST_SETS = ["dev_labels", "test_labels", "train_labels"]
VIDEO_EXTS = {".mp4", ".avi", ".mov", ".mkv", ".wmv", ".mpg", ".mpeg", ".m4v", ".flv", ".webm", ".wav"}
DRY_RUN = True  # False = —Ä–µ–∞–ª—å–Ω–æ –ø–µ—Ä–µ–∏–º–µ–Ω–æ–≤–∞—Ç—å

def natural_key(s: str):
    return [int(t) if t.isdigit() else t.lower() for t in re.findall(r'\d+|\D+', s)]

def plan_segment_renames(vid_dir: Path):
    seg_dir = vid_dir / "segments"
    if not seg_dir.is_dir():
        return []

    files = [p for p in seg_dir.iterdir() if p.is_file() and p.suffix.lower() in VIDEO_EXTS]
    if not files:
        return []

    files_sorted = sorted(files, key=lambda p: natural_key(p.name))
    pad = max(3, len(str(len(files_sorted))))

    video_id = vid_dir.name
    plan = []
    for i, src in enumerate(files_sorted, 1):
        dst_name = f"{video_id}_{str(i).zfill(pad)}{src.suffix.lower()}"
        dst = src.with_name(dst_name)
        plan.append({
            "video_id": video_id,
            "src": str(src),
            "dst": str(dst),
            "dst_name": dst_name
        })
    return plan

def execute_plan(rows):
    for r in rows:
        src = Path(r["src"])
        dst = Path(r["dst"])
        if src.exists() and src.name != dst.name:
            src.rename(dst)

def main():
    whole_plan = []
    for split in LIST_SETS:
        split_dir = BASE / split
        if not split_dir.is_dir():
            continue
        for vid_dir in split_dir.iterdir():
            if vid_dir.is_dir():
                whole_plan.extend(plan_segment_renames(vid_dir))

    if not whole_plan:
        print("–°–µ–≥–º–µ–Ω—Ç–æ–≤ –Ω–µ—Ç, –Ω–µ—á–µ–≥–æ –ø–∏—Å–∞—Ç—å.")
        return

    df = pd.DataFrame(whole_plan)
    view = df[["video_id", "src", "dst_name"]].sort_values(["video_id", "dst_name"])
    print("\n–ü–ª–∞–Ω (–ø–µ—Ä–≤—ã–µ 20 —Å—Ç—Ä–æ–∫):")
    print(view.head(20).to_string(index=False))

    if not DRY_RUN:
        execute_plan(whole_plan)

    # CSV —Ç–æ–ª—å–∫–æ —Å –º–∞–ø–ø–∏–Ω–≥–æ–º: video_id -> segment_file
    mapping = df[["video_id", "dst_name"]].rename(columns={"dst_name": "segment_file"})
    mapping.to_csv(BASE / "segments_map.csv", index=False)
    print(f"\n–ì–æ—Ç–æ–≤–æ. –ú–∞–ø–ø–∏–Ω–≥ —Å–æ—Ö—Ä–∞–Ω—ë–Ω –≤ {BASE/'segments_map.csv'}")

if __name__ == "__main__":
    main()



–ü–ª–∞–Ω (–ø–µ—Ä–≤—ã–µ 20 —Å—Ç—Ä–æ–∫):
   video_id                                                                     src            dst_name
-7UpRmNVJzQ E:\WSM\depression\train_labels\-7UpRmNVJzQ\segments\-7UpRmNVJzQ_001.mp4 -7UpRmNVJzQ_001.mp4
-7UpRmNVJzQ E:\WSM\depression\train_labels\-7UpRmNVJzQ\segments\-7UpRmNVJzQ_002.mp4 -7UpRmNVJzQ_002.mp4
-7UpRmNVJzQ E:\WSM\depression\train_labels\-7UpRmNVJzQ\segments\-7UpRmNVJzQ_003.mp4 -7UpRmNVJzQ_003.mp4
-7UpRmNVJzQ E:\WSM\depression\train_labels\-7UpRmNVJzQ\segments\-7UpRmNVJzQ_004.mp4 -7UpRmNVJzQ_004.mp4
-7UpRmNVJzQ E:\WSM\depression\train_labels\-7UpRmNVJzQ\segments\-7UpRmNVJzQ_005.mp4 -7UpRmNVJzQ_005.mp4
-7UpRmNVJzQ E:\WSM\depression\train_labels\-7UpRmNVJzQ\segments\-7UpRmNVJzQ_006.mp4 -7UpRmNVJzQ_006.mp4
-7UpRmNVJzQ E:\WSM\depression\train_labels\-7UpRmNVJzQ\segments\-7UpRmNVJzQ_007.mp4 -7UpRmNVJzQ_007.mp4
-7UpRmNVJzQ E:\WSM\depression\train_labels\-7UpRmNVJzQ\segments\-7UpRmNVJzQ_008.mp4 -7UpRmNVJzQ_008.mp4
-7UpRmNVJzQ E:\WSM\depre

In [9]:
import os
import pandas as pd
from pathlib import Path

BASE = Path(r"E:\WSM\depression")
SETS = ["dev_labels", "test_labels", "train_labels"]

for split in SETS:
    csv_path = BASE / f"{split}.csv"
    dir_path = BASE / split

    print(f"\n=== –û–±—Ä–∞–±–æ—Ç–∫–∞ {split} ===")

    df = pd.read_csv(csv_path)
    new_rows = []

    for _, row in df.iterrows():
        video_id = str(row["video_id"])
        seg_dir = dir_path / video_id / "segments"

        if seg_dir.is_dir():
            files = sorted(os.listdir(seg_dir))
            for f in files:
                new_row = row.copy()
                new_row["segment_file"] = f
                new_rows.append(new_row)
        else:
            # –Ω–µ—Ç —Å–µ–≥–º–µ–Ω—Ç–æ–≤ ‚Äî –¥–æ–±–∞–≤–ª—è–µ–º –∫–∞–∫ –µ—Å—Ç—å (–±–µ–∑ segment_file)
            new_row = row.copy()
            new_row["segment_file"] = None
            new_rows.append(new_row)

    df_new = pd.DataFrame(new_rows)

    # —Å–æ—Ö—Ä–∞–Ω—è–µ–º –∫–∞–∫ –æ—Ç–¥–µ–ª—å–Ω—ã–π —Ñ–∞–π–ª (—á—Ç–æ–±—ã –Ω–µ —É–≥—Ä–æ–±–∏—Ç—å –æ—Ä–∏–≥–∏–Ω–∞–ª)
    out_path = BASE / f"{split}_with_segments.csv"
    df_new.to_csv(out_path, index=False)
    print(f"–°–æ—Ö—Ä–∞–Ω–∏–ª {out_path} (—Å—Ç—Ä–æ–∫: {len(df_new)})")



=== –û–±—Ä–∞–±–æ—Ç–∫–∞ dev_labels ===
–°–æ—Ö—Ä–∞–Ω–∏–ª E:\WSM\depression\dev_labels_with_segments.csv (—Å—Ç—Ä–æ–∫: 663)

=== –û–±—Ä–∞–±–æ—Ç–∫–∞ test_labels ===
–°–æ—Ö—Ä–∞–Ω–∏–ª E:\WSM\depression\test_labels_with_segments.csv (—Å—Ç—Ä–æ–∫: 881)

=== –û–±—Ä–∞–±–æ—Ç–∫–∞ train_labels ===
–°–æ—Ö—Ä–∞–Ω–∏–ª E:\WSM\depression\train_labels_with_segments.csv (—Å—Ç—Ä–æ–∫: 3832)


In [13]:
import os
import re
from pathlib import Path
import pandas as pd

BASE = Path(r"E:\WSM\parkinson")
LIST_SETS = ["dev_labels"]
VIDEO_EXTS = {".mp4", ".avi", ".mov", ".mkv", ".wmv", ".mpg", ".mpeg", ".m4v", ".flv", ".webm", ".wav"}
DRY_RUN = False  # False = —Ä–µ–∞–ª—å–Ω–æ –ø–µ—Ä–µ–∏–º–µ–Ω–æ–≤–∞—Ç—å

def natural_key(s: str):
    return [int(t) if t.isdigit() else t.lower() for t in re.findall(r'\d+|\D+', s)]

def plan_segment_renames(vid_dir: Path):
    seg_dir = vid_dir / "segments"
    if not seg_dir.is_dir():
        return []

    files = [p for p in seg_dir.iterdir() if p.is_file() and p.suffix.lower() in VIDEO_EXTS]
    if not files:
        return []

    files_sorted = sorted(files, key=lambda p: natural_key(p.name))
    pad = max(3, len(str(len(files_sorted))))

    video_id = vid_dir.name
    plan = []
    for i, src in enumerate(files_sorted, 1):
        dst_name = f"{video_id}_{str(i).zfill(pad)}{src.suffix.lower()}"
        dst = src.with_name(dst_name)
        plan.append({
            "video_id": video_id,
            "src": str(src),
            "dst": str(dst),
            "dst_name": dst_name
        })
    return plan

def execute_plan(rows):
    for r in rows:
        src = Path(r["src"])
        dst = Path(r["dst"])
        if src.exists() and src.name != dst.name:
            src.rename(dst)

def main():
    whole_plan = []
    for split in LIST_SETS:
        split_dir = BASE / split
        if not split_dir.is_dir():
            continue
        for vid_dir in split_dir.iterdir():
            if vid_dir.is_dir():
                whole_plan.extend(plan_segment_renames(vid_dir))

    if not whole_plan:
        print("–°–µ–≥–º–µ–Ω—Ç–æ–≤ –Ω–µ—Ç, –Ω–µ—á–µ–≥–æ –ø–∏—Å–∞—Ç—å.")
        return

    df = pd.DataFrame(whole_plan)
    view = df[["video_id", "src", "dst_name"]].sort_values(["video_id", "dst_name"])
    print("\n–ü–ª–∞–Ω (–ø–µ—Ä–≤—ã–µ 20 —Å—Ç—Ä–æ–∫):")
    print(view.head(20).to_string(index=False))

    if not DRY_RUN:
        execute_plan(whole_plan)

    # CSV —Ç–æ–ª—å–∫–æ —Å –º–∞–ø–ø–∏–Ω–≥–æ–º: video_id -> segment_file
    mapping = df[["video_id", "dst_name"]].rename(columns={"dst_name": "segment_file"})
    mapping.to_csv(BASE / "segments_map.csv", index=False)
    print(f"\n–ì–æ—Ç–æ–≤–æ. –ú–∞–ø–ø–∏–Ω–≥ —Å–æ—Ö—Ä–∞–Ω—ë–Ω –≤ {BASE/'segments_map.csv'}")

if __name__ == "__main__":
    main()



–ü–ª–∞–Ω (–ø–µ—Ä–≤—ã–µ 20 —Å—Ç—Ä–æ–∫):
   video_id                                                                                                                                    src            dst_name
3MKVlrQoHD4       E:\WSM\parkinson\dev_labels\3MKVlrQoHD4\segments\Can Microdosing Psilocybin Magic Mushrooms help with Parkinsons disease_000.mp4 3MKVlrQoHD4_001.mp4
3MKVlrQoHD4       E:\WSM\parkinson\dev_labels\3MKVlrQoHD4\segments\Can Microdosing Psilocybin Magic Mushrooms help with Parkinsons disease_001.mp4 3MKVlrQoHD4_002.mp4
3MKVlrQoHD4       E:\WSM\parkinson\dev_labels\3MKVlrQoHD4\segments\Can Microdosing Psilocybin Magic Mushrooms help with Parkinsons disease_002.mp4 3MKVlrQoHD4_003.mp4
3MKVlrQoHD4       E:\WSM\parkinson\dev_labels\3MKVlrQoHD4\segments\Can Microdosing Psilocybin Magic Mushrooms help with Parkinsons disease_003.mp4 3MKVlrQoHD4_004.mp4
3MKVlrQoHD4       E:\WSM\parkinson\dev_labels\3MKVlrQoHD4\segments\Can Microdosing Psilocybin Magic Mushrooms help with Parki

In [14]:
import os
import pandas as pd
from pathlib import Path

BASE = Path(r"E:\WSM\parkinson")
SETS = ["dev_labels", "test_labels", "train_labels"]

for split in SETS:
    csv_path = BASE / f"{split}.csv"
    dir_path = BASE / split

    print(f"\n=== –û–±—Ä–∞–±–æ—Ç–∫–∞ {split} ===")

    df = pd.read_csv(csv_path)
    new_rows = []

    for _, row in df.iterrows():
        video_id = str(row["video_id"])
        seg_dir = dir_path / video_id / "segments"

        if seg_dir.is_dir():
            files = sorted(os.listdir(seg_dir))
            for f in files:
                new_row = row.copy()
                new_row["segment_file"] = f
                new_rows.append(new_row)
        else:
            # –Ω–µ—Ç —Å–µ–≥–º–µ–Ω—Ç–æ–≤ ‚Äî –¥–æ–±–∞–≤–ª—è–µ–º –∫–∞–∫ –µ—Å—Ç—å (–±–µ–∑ segment_file)
            new_row = row.copy()
            new_row["segment_file"] = None
            new_rows.append(new_row)

    df_new = pd.DataFrame(new_rows)

    # —Å–æ—Ö—Ä–∞–Ω—è–µ–º –∫–∞–∫ –æ—Ç–¥–µ–ª—å–Ω—ã–π —Ñ–∞–π–ª (—á—Ç–æ–±—ã –Ω–µ —É–≥—Ä–æ–±–∏—Ç—å –æ—Ä–∏–≥–∏–Ω–∞–ª)
    out_path = BASE / f"{split}_with_segments.csv"
    df_new.to_csv(out_path, index=False)
    print(f"–°–æ—Ö—Ä–∞–Ω–∏–ª {out_path} (—Å—Ç—Ä–æ–∫: {len(df_new)})")



=== –û–±—Ä–∞–±–æ—Ç–∫–∞ dev_labels ===
–°–æ—Ö—Ä–∞–Ω–∏–ª E:\WSM\parkinson\dev_labels_with_segments.csv (—Å—Ç—Ä–æ–∫: 316)

=== –û–±—Ä–∞–±–æ—Ç–∫–∞ test_labels ===
–°–æ—Ö—Ä–∞–Ω–∏–ª E:\WSM\parkinson\test_labels_with_segments.csv (—Å—Ç—Ä–æ–∫: 545)

=== –û–±—Ä–∞–±–æ—Ç–∫–∞ train_labels ===
–°–æ—Ö—Ä–∞–Ω–∏–ª E:\WSM\parkinson\train_labels_with_segments.csv (—Å—Ç—Ä–æ–∫: 2789)


In [17]:
import pandas as pd
from pathlib import Path

BASE = Path(r"E:\WSM\depression")
SETS = ["dev_labels", "test_labels", "train_labels"]

KEEP_COLS = ["video_id", "diagnosis", "segment_file"]

for split in SETS:
    in_path = BASE / f"{split}_with_segments.csv"
    out_path = BASE / f"{split}_segments_min.csv"

    df = pd.read_csv(in_path)

    # –æ—Å—Ç–∞–≤–ª—è–µ–º —Ç–æ–ª—å–∫–æ –Ω—É–∂–Ω—ã–µ –∫–æ–ª–æ–Ω–∫–∏
    cols_exist = [c for c in KEEP_COLS if c in df.columns]
    df = df[cols_exist]

    df.to_csv(out_path, index=False)
    print(f"–°–æ—Ö—Ä–∞–Ω–∏–ª {out_path} (—Å—Ç—Ä–æ–∫: {len(df)})")


–°–æ—Ö—Ä–∞–Ω–∏–ª E:\WSM\depression\dev_labels_segments_min.csv (—Å—Ç—Ä–æ–∫: 663)
–°–æ—Ö—Ä–∞–Ω–∏–ª E:\WSM\depression\test_labels_segments_min.csv (—Å—Ç—Ä–æ–∫: 881)
–°–æ—Ö—Ä–∞–Ω–∏–ª E:\WSM\depression\train_labels_segments_min.csv (—Å—Ç—Ä–æ–∫: 3832)


In [16]:
import pandas as pd
from pathlib import Path

BASE = Path(r"E:\WSM\parkinson")
SETS = ["dev_labels", "test_labels", "train_labels"]

KEEP_COLS = ["video_id", "diagnosis", "segment_file"]

for split in SETS:
    in_path = BASE / f"{split}_with_segments.csv"
    out_path = BASE / f"{split}_segments_min.csv"

    df = pd.read_csv(in_path)

    # –æ—Å—Ç–∞–≤–ª—è–µ–º —Ç–æ–ª—å–∫–æ –Ω—É–∂–Ω—ã–µ –∫–æ–ª–æ–Ω–∫–∏
    cols_exist = [c for c in KEEP_COLS if c in df.columns]
    df = df[cols_exist]

    df.to_csv(out_path, index=False)
    print(f"–°–æ—Ö—Ä–∞–Ω–∏–ª {out_path} (—Å—Ç—Ä–æ–∫: {len(df)})")


–°–æ—Ö—Ä–∞–Ω–∏–ª E:\WSM\parkinson\dev_labels_segments_min.csv (—Å—Ç—Ä–æ–∫: 316)
–°–æ—Ö—Ä–∞–Ω–∏–ª E:\WSM\parkinson\test_labels_segments_min.csv (—Å—Ç—Ä–æ–∫: 545)
–°–æ—Ö—Ä–∞–Ω–∏–ª E:\WSM\parkinson\train_labels_segments_min.csv (—Å—Ç—Ä–æ–∫: 2789)


In [40]:
import pandas as pd
from pathlib import Path

BASE = Path(r"E:\WSM\depression")
SETS = ["dev_labels_segments_min_filtered.csv", "test_labels_segments_min_filtered.csv", "train_labels_segments_min_filtered.csv"]

all_stats = []

for fname in SETS:
    path = BASE / fname
    df = pd.read_csv(path)

    counts = df["diagnosis"].value_counts().sort_index()
    tmp = counts.reset_index()
    tmp.columns = ["diagnosis", "count"]
    tmp["split"] = fname.replace("_segments_min.csv", "")
    all_stats.append(tmp)

# –æ–±—ä–µ–¥–∏–Ω—è–µ–º
df_stats = pd.concat(all_stats)

# —Å–≤–æ–¥–Ω–∞—è —Ç–∞–±–ª–∏—Ü–∞
pivot = df_stats.pivot_table(
    index="diagnosis",
    columns="split",
    values="count",
    fill_value=0,
    aggfunc="sum"
)

# –¥–µ–ª–∞–µ–º –≤—Å—ë int
pivot = pivot.astype(int)

print("\n–†–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ –ø–æ diagnosis –¥–µ–ø—Ä–µ—Å—Å–∏—è:")
print(pivot.to_string())



–†–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ –ø–æ diagnosis –¥–µ–ø—Ä–µ—Å—Å–∏—è:
split      dev_labels_segments_min_filtered.csv  test_labels_segments_min_filtered.csv  train_labels_segments_min_filtered.csv
diagnosis                                                                                                                     
0                                           307                                    492                                    2284
1                                           317                                    335                                    1431


In [41]:
import pandas as pd
from pathlib import Path

BASE = Path(r"E:\WSM\parkinson")
SETS = ["dev_labels_segments_min_filtered.csv", "test_labels_segments_min_filtered.csv", "train_labels_segments_min_filtered.csv"]

all_stats = []


for fname in SETS:
    path = BASE / fname
    df = pd.read_csv(path)

    # —É–¥–∞–ª—è–µ–º –¥—É–±–ª–∏ –ø–æ video_id (–µ—Å–ª–∏ –≤–¥—Ä—É–≥ –µ—Å—Ç—å)
    df_unique = df.drop_duplicates(subset=["video_id"])

    counts = df_unique["diagnosis"].value_counts().sort_index()
    tmp = counts.reset_index()
    tmp.columns = ["diagnosis", "count"]
    tmp["split"] = fname.replace(".csv", "")
    all_stats.append(tmp)

# –æ–±—ä–µ–¥–∏–Ω—è–µ–º
df_stats = pd.concat(all_stats)

# —Å–≤–æ–¥–Ω–∞—è —Ç–∞–±–ª–∏—Ü–∞
pivot = df_stats.pivot_table(
    index="diagnosis",
    columns="split",
    values="count",
    fill_value=0,
    aggfunc="sum"
)

# int –≤–º–µ—Å—Ç–æ float
pivot = pivot.astype(int)

# –¥–æ–±–∞–≤–ª—è–µ–º —Å—Ç—Ä–æ–∫—É total (—Å—É–º–º–∞ –ø–æ diagnosis)
pivot.loc["total"] = pivot.sum()

print("\n–†–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ –ø–æ diagnosis –ü–ê–†–ö–ò–ù–°–û–ù (—Å—á–∏—Ç–∞–µ–º –ø–æ —É–Ω–∏–∫–∞–ª—å–Ω—ã–º video_id):")
print(pivot.to_string())


–†–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ –ø–æ diagnosis –ü–ê–†–ö–ò–ù–°–û–ù (—Å—á–∏—Ç–∞–µ–º –ø–æ —É–Ω–∏–∫–∞–ª—å–Ω—ã–º video_id):
split      dev_labels_segments_min_filtered  test_labels_segments_min_filtered  train_labels_segments_min_filtered
diagnosis                                                                                                         
0                                        22                                 25                                 132
1                                        22                                 21                                 140
total                                    44                                 46                                 272


In [42]:
import pandas as pd
from pathlib import Path

BASE = Path(r"E:\WSM\depression")
SETS = ["dev_labels_segments_min_filtered.csv", "test_labels_segments_min_filtered.csv", "train_labels_segments_min_filtered.csv"]

all_stats = []


for fname in SETS:
    path = BASE / fname
    df = pd.read_csv(path)

    # —É–¥–∞–ª—è–µ–º –¥—É–±–ª–∏ –ø–æ video_id (–µ—Å–ª–∏ –≤–¥—Ä—É–≥ –µ—Å—Ç—å)
    df_unique = df.drop_duplicates(subset=["video_id"])

    counts = df_unique["diagnosis"].value_counts().sort_index()
    tmp = counts.reset_index()
    tmp.columns = ["diagnosis", "count"]
    tmp["split"] = fname.replace(".csv", "")
    all_stats.append(tmp)

# –æ–±—ä–µ–¥–∏–Ω—è–µ–º
df_stats = pd.concat(all_stats)

# —Å–≤–æ–¥–Ω–∞—è —Ç–∞–±–ª–∏—Ü–∞
pivot = df_stats.pivot_table(
    index="diagnosis",
    columns="split",
    values="count",
    fill_value=0,
    aggfunc="sum"
)

# int –≤–º–µ—Å—Ç–æ float
pivot = pivot.astype(int)

# –¥–æ–±–∞–≤–ª—è–µ–º —Å—Ç—Ä–æ–∫—É total (—Å—É–º–º–∞ –ø–æ diagnosis)
pivot.loc["total"] = pivot.sum()

print("\n–†–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ –ø–æ diagnosis DEPRESSION (—Å—á–∏—Ç–∞–µ–º –ø–æ —É–Ω–∏–∫–∞–ª—å–Ω—ã–º video_id):")
print(pivot.to_string())


–†–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ –ø–æ diagnosis DEPRESSION (—Å—á–∏—Ç–∞–µ–º –ø–æ —É–Ω–∏–∫–∞–ª—å–Ω—ã–º video_id):
split      dev_labels_segments_min_filtered  test_labels_segments_min_filtered  train_labels_segments_min_filtered
diagnosis                                                                                                         
0                                        29                                 27                                 156
1                                        27                                 28                                 135
total                                    56                                 55                                 291


In [38]:
import pandas as pd
from pathlib import Path

BASE = Path(r"E:\WSM\parkinson")

# üëâ –£–ö–ê–ñ–ò —Å–≤–æ–∏ —Ñ–∞–π–ª—ã
FILE_A = BASE / "dev_labels_segments_min.csv"      # —Ñ–∞–π–ª —Å–æ —Å—Ç—Ä–æ–∫–∞–º–∏ –ø–æ —Å–µ–≥–º–µ–Ω—Ç–∞–º (–º–Ω–æ–≥–æ —Å—Ç—Ä–æ–∫ –Ω–∞ 1 video_id)
FILE_B = BASE / "dev_w_fe_vid.csv"            # —Ñ–∞–π–ª-—ç—Ç–∞–ª–æ–Ω —Å —É–Ω–∏–∫–∞–ª—å–Ω—ã–º–∏ video_id

SAVE = True  # True ‚Äî —Å–æ—Ö—Ä–∞–Ω–∏—Ç—å –æ—Ç—Ñ–∏–ª—å—Ç—Ä–æ–≤–∞–Ω–Ω—ã–π A –≤ –Ω–æ–≤—ã–π CSV

# 1) —á–∏—Ç–∞–µ–º –∏ –ø—Ä–∏–≤–æ–¥–∏–º video_id –∫ —Å—Ç—Ä–æ–∫–µ
df_a = pd.read_csv(FILE_A)
df_b = pd.read_csv(FILE_B)

df_a["video_id"] = df_a["video_id"].astype(str)
df_b["video_id"] = df_b["video_id"].astype(str)

# 2) –º–Ω–æ–∂–µ—Å—Ç–≤–æ –≤–∞–ª–∏–¥–Ω—ã—Ö ID –∏–∑ —Ñ–∞–π–ª–∞ B (—É–Ω–∏–∫–∞–ª—å–Ω—ã–µ)
ids_b = set(df_b["video_id"].unique())

# 3) —á—Ç–æ –µ—Å—Ç—å –≤ A
ids_a = set(df_a["video_id"].unique())

# 4) –∫–æ–≥–æ –Ω—É–∂–Ω–æ —É–¥–∞–ª–∏—Ç—å –∏–∑ A (–≤ A –µ—Å—Ç—å, –≤ B –Ω–µ—Ç)
ids_to_drop = sorted(ids_a - ids_b)

# 5) —Ñ–∏–ª—å—Ç—Ä—É–µ–º A, –æ—Å—Ç–∞–≤–ª—è—è —Ç–æ–ª—å–∫–æ —Ç–µ—Ö, –∫—Ç–æ –µ—Å—Ç—å –≤ B
mask = df_a["video_id"].isin(ids_b)
df_a_filtered = df_a[mask].copy()

# 6) –º–∏–Ω–∏-–æ—Ç—á—ë—Ç
report = pd.DataFrame({
    "metric": [
        "unique_ids_in_A",
        "unique_ids_in_B",
        "ids_removed_from_A",
        "rows_in_A_before",
        "rows_in_A_after",
        "rows_removed"
    ],
    "value": [
        len(ids_a),
        len(ids_b),
        len(ids_to_drop),
        len(df_a),
        len(df_a_filtered),
        len(df_a) - len(df_a_filtered)
    ]
})
print("\n–°–≤–æ–¥–∫–∞ –ø–æ —á–∏—Å—Ç–∫–µ A –Ω–∞ –æ—Å–Ω–æ–≤–µ B:")
print(report.to_string(index=False))

# 7) –ø–æ–∫–∞–∂–µ–º –ø–µ—Ä–≤—ã–µ N —É–¥–∞–ª—è–µ–º—ã—Ö ID (—á—Ç–æ–±—ã –≥–ª–∞–∑–∞–º–∏ –ø—Ä–æ–≤–µ—Ä–∏—Ç—å)
N = 20
if ids_to_drop:
    print(f"\n–ü–µ—Ä–≤—ã–µ {min(N, len(ids_to_drop))} ID, –∫–æ—Ç–æ—Ä—ã—Ö –ù–ï–¢ –≤ B –∏ –∫–æ—Ç–æ—Ä—ã–µ –±—É–¥—É—Ç —É–¥–∞–ª–µ–Ω—ã –∏–∑ A:")
    print(pd.Series(ids_to_drop[:N]).to_string(index=False))
else:
    print("\n–£–¥–∞–ª—è—Ç—å –ø–æ ID –Ω–µ—á–µ–≥–æ ‚Äî –≤—Å—ë –∏–∑ A –ø—Ä–∏—Å—É—Ç—Å—Ç–≤—É–µ—Ç –≤ B. –ù–∞ —ç—Ç–æ—Ç —Ä–∞–∑ —Ç—ã —Å–ø—Ä–∞–≤–∏–ª—Å—è :)")

# 8) —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ –ø–æ –∂–µ–ª–∞–Ω–∏—é
if SAVE:
    out_path = FILE_A.with_name(FILE_A.stem + "_filtered.csv")
    df_a_filtered.to_csv(out_path, index=False)
    print(f"\n–°–æ—Ö—Ä–∞–Ω–µ–Ω–æ: {out_path}")
else:
    print("\nSAVE=False ‚Äî –Ω–∏—á–µ–≥–æ –Ω–µ —Å–æ—Ö—Ä–∞–Ω—è—é. –í–æ–∑—å–º–∏ df_a_filtered –∏–∑ –ø–∞–º—è—Ç–∏, –µ—Å–ª–∏ –∑–∞–ø—É—Å–∫–∞–µ—à—å –≤ –Ω–æ—É—Ç–±—É–∫–µ.")



–°–≤–æ–¥–∫–∞ –ø–æ —á–∏—Å—Ç–∫–µ A –Ω–∞ –æ—Å–Ω–æ–≤–µ B:
            metric  value
   unique_ids_in_A     47
   unique_ids_in_B     44
ids_removed_from_A      3
  rows_in_A_before    316
   rows_in_A_after    312
      rows_removed      4

–ü–µ—Ä–≤—ã–µ 3 ID, –∫–æ—Ç–æ—Ä—ã—Ö –ù–ï–¢ –≤ B –∏ –∫–æ—Ç–æ—Ä—ã–µ –±—É–¥—É—Ç —É–¥–∞–ª–µ–Ω—ã –∏–∑ A:
87mxEmx7zeA
NUeEFVy2Ojw
U74-VmwOeGI

–°–æ—Ö—Ä–∞–Ω–µ–Ω–æ: E:\WSM\parkinson\dev_labels_segments_min_filtered.csv


In [39]:
import pandas as pd
from pathlib import Path

BASE = Path(r"E:\WSM\parkinson")

# –∫–∞–∫–∏–µ –¥–≤–∞ —Ñ–∞–π–ª–∞ —Å—Ä–∞–≤–Ω–∏–≤–∞–µ–º
FILE_A = BASE / "test_labels_segments_min_filtered.csv"
FILE_B = BASE / "test_w_fe_vid.csv"

# —á–∏—Ç–∞–µ–º —É–Ω–∏–∫–∞–ª—å–Ω—ã–µ video_id
ids_a = set(pd.read_csv(FILE_A)["video_id"].astype(str).unique())
ids_b = set(pd.read_csv(FILE_B)["video_id"].astype(str).unique())

# —Ä–∞–∑–Ω–æ—Å—Ç–∏
missing_in_b = sorted(ids_a - ids_b)
missing_in_a = sorted(ids_b - ids_a)

print(f"\n–£–Ω–∏–∫–∞–ª—å–Ω—ã–µ video_id –≤ {FILE_A.name}: {len(ids_a)}")
print(f"–£–Ω–∏–∫–∞–ª—å–Ω—ã–µ video_id –≤ {FILE_B.name}: {len(ids_b)}")

print(f"\nIDs –∏–∑ {FILE_A.name}, –∫–æ—Ç–æ—Ä—ã—Ö –Ω–µ—Ç –≤ {FILE_B.name} ({len(missing_in_b)}):")
print(missing_in_b[:20])  # –ø–µ—Ä–≤—ã–µ 20

print(f"\nIDs –∏–∑ {FILE_B.name}, –∫–æ—Ç–æ—Ä—ã—Ö –Ω–µ—Ç –≤ {FILE_A.name} ({len(missing_in_a)}):")
print(missing_in_a[:20])



–£–Ω–∏–∫–∞–ª—å–Ω—ã–µ video_id –≤ test_labels_segments_min_filtered.csv: 46
–£–Ω–∏–∫–∞–ª—å–Ω—ã–µ video_id –≤ test_w_fe_vid.csv: 46

IDs –∏–∑ test_labels_segments_min_filtered.csv, –∫–æ—Ç–æ—Ä—ã—Ö –Ω–µ—Ç –≤ test_w_fe_vid.csv (0):
[]

IDs –∏–∑ test_w_fe_vid.csv, –∫–æ—Ç–æ—Ä—ã—Ö –Ω–µ—Ç –≤ test_labels_segments_min_filtered.csv (0):
[]


In [43]:
import pandas as pd
from pathlib import Path

BASE = Path(r"E:\WSM\parkinson")
SETS = [
    "dev_labels_segments_min_filtered.csv",
    "test_labels_segments_min_filtered.csv",
    "train_labels_segments_min_filtered.csv"
]

# ===== –ø–æ —É–Ω–∏–∫–∞–ª—å–Ω—ã–º video_id =====
all_stats_unique = []

for fname in SETS:
    path = BASE / fname
    df = pd.read_csv(path)

    # –æ—Å—Ç–∞–≤–ª—è–µ–º —Ç–æ–ª—å–∫–æ —É–Ω–∏–∫–∞–ª—å–Ω—ã–µ video_id
    df_unique = df.drop_duplicates(subset=["video_id"])

    counts = df_unique["diagnosis"].value_counts().sort_index()
    tmp = counts.reset_index()
    tmp.columns = ["diagnosis", "count"]
    tmp["split"] = fname.replace(".csv", "")
    all_stats_unique.append(tmp)

df_stats_unique = pd.concat(all_stats_unique)

pivot_unique = df_stats_unique.pivot_table(
    index="diagnosis",
    columns="split",
    values="count",
    fill_value=0,
    aggfunc="sum"
).astype(int)

pivot_unique.loc["total"] = pivot_unique.sum()

print("\n–†–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ –ø–æ diagnosis –ü–ê–†–ö–ò–ù–°–û–ù (–ø–æ —É–Ω–∏–∫–∞–ª—å–Ω—ã–º video_id):")
print(pivot_unique.to_string())

# ===== –ø–æ –≤—Å–µ–º —Å—Ç—Ä–æ–∫–∞–º (—Å–µ–≥–º–µ–Ω—Ç–∞–º) =====
all_stats_all = []

for fname in SETS:
    path = BASE / fname
    df = pd.read_csv(path)

    counts = df["diagnosis"].value_counts().sort_index()
    tmp = counts.reset_index()
    tmp.columns = ["diagnosis", "count"]
    tmp["split"] = fname.replace(".csv", "")
    all_stats_all.append(tmp)

df_stats_all = pd.concat(all_stats_all)

pivot_all = df_stats_all.pivot_table(
    index="diagnosis",
    columns="split",
    values="count",
    fill_value=0,
    aggfunc="sum"
).astype(int)

pivot_all.loc["total"] = pivot_all.sum()

print("\n–†–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ –ø–æ diagnosis –ü–ê–†–ö–ò–ù–°–û–ù (–ø–æ –≤—Å–µ–º —Å—Ç—Ä–æ–∫–∞–º / —Å–µ–≥–º–µ–Ω—Ç–∞–º):")
print(pivot_all.to_string())



–†–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ –ø–æ diagnosis –ü–ê–†–ö–ò–ù–°–û–ù (–ø–æ —É–Ω–∏–∫–∞–ª—å–Ω—ã–º video_id):
split      dev_labels_segments_min_filtered  test_labels_segments_min_filtered  train_labels_segments_min_filtered
diagnosis                                                                                                         
0                                        22                                 25                                 132
1                                        22                                 21                                 140
total                                    44                                 46                                 272

–†–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ –ø–æ diagnosis –ü–ê–†–ö–ò–ù–°–û–ù (–ø–æ –≤—Å–µ–º —Å—Ç—Ä–æ–∫–∞–º / —Å–µ–≥–º–µ–Ω—Ç–∞–º):
split      dev_labels_segments_min_filtered  test_labels_segments_min_filtered  train_labels_segments_min_filtered
diagnosis                                                                                      

In [46]:
import pandas as pd
from pathlib import Path

BASE = Path(r"E:\WSM\depression")
SETS = [
    "dev_labels_segments_min_filtered.csv",
    "test_labels_segments_min_filtered.csv",
    "train_labels_segments_min_filtered.csv"
]

# ===== –ø–æ —É–Ω–∏–∫–∞–ª—å–Ω—ã–º video_id =====
all_stats_unique = []

for fname in SETS:
    path = BASE / fname
    df = pd.read_csv(path)

    # –æ—Å—Ç–∞–≤–ª—è–µ–º —Ç–æ–ª—å–∫–æ —É–Ω–∏–∫–∞–ª—å–Ω—ã–µ video_id
    df_unique = df.drop_duplicates(subset=["video_id"])

    counts = df_unique["diagnosis"].value_counts().sort_index()
    tmp = counts.reset_index()
    tmp.columns = ["diagnosis", "count"]
    tmp["split"] = fname.replace(".csv", "")
    all_stats_unique.append(tmp)

df_stats_unique = pd.concat(all_stats_unique)

pivot_unique = df_stats_unique.pivot_table(
    index="diagnosis",
    columns="split",
    values="count",
    fill_value=0,
    aggfunc="sum"
).astype(int)

pivot_unique.loc["total"] = pivot_unique.sum()

print("\n–†–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ –ø–æ diagnosis –î–ï–ü–†–ï–°–°–ò–Ø (–ø–æ —É–Ω–∏–∫–∞–ª—å–Ω—ã–º video_id):")
print(pivot_unique.to_string())

# ===== –ø–æ –≤—Å–µ–º —Å—Ç—Ä–æ–∫–∞–º (—Å–µ–≥–º–µ–Ω—Ç–∞–º) =====
all_stats_all = []

for fname in SETS:
    path = BASE / fname
    df = pd.read_csv(path)

    counts = df["diagnosis"].value_counts().sort_index()
    tmp = counts.reset_index()
    tmp.columns = ["diagnosis", "count"]
    tmp["split"] = fname.replace(".csv", "")
    all_stats_all.append(tmp)

df_stats_all = pd.concat(all_stats_all)

pivot_all = df_stats_all.pivot_table(
    index="diagnosis",
    columns="split",
    values="count",
    fill_value=0,
    aggfunc="sum"
).astype(int)

pivot_all.loc["total"] = pivot_all.sum()

print("\n–†–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ –ø–æ diagnosis –î–ï–ü–†–ï–°–°–ò–Ø (–ø–æ –≤—Å–µ–º —Å—Ç—Ä–æ–∫–∞–º / —Å–µ–≥–º–µ–Ω—Ç–∞–º):")
print(pivot_all.to_string())



–†–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ –ø–æ diagnosis –î–ï–ü–†–ï–°–°–ò–Ø (–ø–æ —É–Ω–∏–∫–∞–ª—å–Ω—ã–º video_id):
split      dev_labels_segments_min_filtered  test_labels_segments_min_filtered  train_labels_segments_min_filtered
diagnosis                                                                                                         
0                                        29                                 27                                 156
1                                        27                                 28                                 135
total                                    56                                 55                                 291

–†–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ –ø–æ diagnosis –î–ï–ü–†–ï–°–°–ò–Ø (–ø–æ –≤—Å–µ–º —Å—Ç—Ä–æ–∫–∞–º / —Å–µ–≥–º–µ–Ω—Ç–∞–º):
split      dev_labels_segments_min_filtered  test_labels_segments_min_filtered  train_labels_segments_min_filtered
diagnosis                                                                                      