### This script performs basic data cleaning and preprocessing of the spectral and sequence data

In [1]:
# ─── Project path configuration (Jupyter-only) ────────────────────────────────
from pathlib import Path

# The notebook is expected to run with “…/2025/code/” as its working directory,
# so the project root is the directory one level above it.
BASE_DIR = Path.cwd().parent                # project root → “…/2025”
CODE_DIR = BASE_DIR.joinpath("code")        # “code” sub-folder of the project root


First we modify the log file to add session numbers derived from the dates of the unique recordings. We know that each combination of animals was recorded 6 times in both the before and the after stage. We do this because the current recordings only store the date, so we convert dates to the session number of the recording for a given pair and stage. This lets us group the vocalizations from a given session together later, as the vocalizations per session become a sample of an individual's vocalizations.

In [3]:

# ─── Load & annotate experimental log ─────────────────────────────────────────
# Reads the original experiment log, derives a Date_Time stamp from each file
# name, numbers the recording sessions of every (ID_left, ID_right, period)
# combination chronologically, and writes the result as a new CSV.
import pandas as pd
import numpy as np
from pathlib import Path      # already imported earlier, but harmless to repeat

# Folder that holds the raw / modified log files
LOG_DIR = BASE_DIR / "experimental log files"
LOG_DIR.mkdir(exist_ok=True)

file_path_log = LOG_DIR / "experiment_log_orignal.csv"      # ← relative
df_log = pd.read_csv(file_path_log)

# -----------------------------------------------------------------------------
# 1. Parse filename and sort chronologically
# -----------------------------------------------------------------------------
# "file name" is split on "_" into exactly four parts.
df_log[["file_prefix", "date", "time", "ID"]] = (
    df_log["file name"].str.split("_", expand=True)
)
df_log = df_log.sort_values(by=["date", "time"])

# Combine 'date' and 'time' (time uses '-' instead of ':')
df_log["Date_Time"] = pd.to_datetime(
    df_log["date"] + " " + df_log["time"].str.replace("-", ":"),
    format="%Y-%m-%d %H:%M:%S",
)

# -----------------------------------------------------------------------------
# 2. Add session numbers
# -----------------------------------------------------------------------------
# Ensure day / month / year are strings
for col in ("day", "month", "year"):
    df_log[col] = df_log[col].astype(str)

# From here on "date" comes from the log's own day/month/year columns
# (normalised to YYYY-MM-DD), replacing the value parsed from the file name.
df_log["date"] = pd.to_datetime(
    df_log["year"] + "-" + df_log["month"] + "-" + df_log["day"]
).dt.strftime("%Y-%m-%d")

# One row per distinct (pair, period, date); the n-th date of a (pair, period)
# group is session n.  ISO-formatted date strings sort chronologically.
# The lookup key includes `period`: a key built only from pair + date would let
# two periods recorded on the same date overwrite each other's session number.
session_keys = ["ID_left", "ID_right", "period"]
session_table = (
    df_log[session_keys + ["date"]]
    .dropna()
    .drop_duplicates()
    .sort_values(session_keys + ["date"])
)
session_table["session_number"] = session_table.groupby(session_keys).cumcount() + 1

# Left merge keeps every log row, in its current order; rows whose keys are
# incomplete simply get no session number.
df_log = df_log.merge(
    session_table,
    on=session_keys + ["date"],
    how="left",
    validate="many_to_one",
)

# -----------------------------------------------------------------------------
# 3. Save modified log
# -----------------------------------------------------------------------------
df_log.to_csv(LOG_DIR / "experiment_log_modified.csv", index=False)


We now first process the sequence data, which we extract from the selection tables of the sequence recordings. We clean and correct any typographical errors made during annotation, modify the element names slightly to retain only the main element category name (and not positional or element-repeat information), and extract the metadata from the sequences.

In [6]:
# ─── Build sequence dataframe from labelled-sequence .txt files ───────────────
import os
import pandas as pd
import numpy as np
from pathlib import Path                           # already imported earlier

# -----------------------------------------------------------------------------
# 1. Folder locations (relative to BASE_DIR)
# -----------------------------------------------------------------------------
# NOTE: the folder names are runtime paths and are reproduced exactly as written.
# Folders this cell reads from
LABELLED_SEQ_DIR = BASE_DIR.joinpath("labelled sequnces")
PHASE_LABEL_DIR  = BASE_DIR.joinpath("phase change labels", "proccesed_labels")
LOG_DIR          = BASE_DIR.joinpath("experimental log files")

# Folder this cell writes to (created on first run if it does not exist yet)
OUTPUT_SEQ_DIR   = BASE_DIR.joinpath("seqeunce data")
OUTPUT_SEQ_DIR.mkdir(exist_ok=True)

# ----------------------------------------------------------------------------- 
# 2. Helper: read & sort a single sequence-label file
# -----------------------------------------------------------------------------
def process_text_file(file_path: Path):
    """Return the third column of a tab-separated label file, ordered by column one.

    The file is read without a header row; its three columns are named
    ``col1``, ``col2`` and ``col3``.  Rows are sorted by ``col1`` and the
    ``col3`` values are returned as a plain Python list.
    """
    table = pd.read_csv(
        file_path,
        sep="\t",
        header=None,
        names=["col1", "col2", "col3"],
    )
    ordered = table.sort_values(by="col1")
    return ordered.loc[:, "col3"].to_list()

# ----------------------------------------------------------------------------- 
# 3. Build matrix M across all .txt files
# -----------------------------------------------------------------------------
# One row per labelled-sequence file: [file name, label_1, label_2, ...].
# The file list is sorted so that row order (and therefore the sequence
# numbering further down) does not depend on the order in which the
# filesystem happens to enumerate the folder.
seq_files = sorted(LABELLED_SEQ_DIR.glob("*.txt"))
if not seq_files:
    raise FileNotFoundError(f"No .txt sequence files found in {LABELLED_SEQ_DIR}")

matrix_M = [[txt_path.name, *process_text_file(txt_path)] for txt_path in seq_files]

# Rows have different lengths; pandas pads the shorter ones with missing values.
# Column 0 holds the file name, columns 1..K hold the labels.
df_M = pd.DataFrame(matrix_M)

# -----------------------------------------------------------------------------
# 4. Parse filename, attach Date_Time
# -----------------------------------------------------------------------------
# The file name is split on "_" into exactly nine parts.
df_M[
    ["stage", "type", "date", "time", "recorderID", "labelled", "focal ID", "context", "number"]
] = df_M[0].str.split("_", expand=True)

df_M["Date_Time"] = pd.to_datetime(df_M["date"] + " " + df_M["time"], format="%Y-%m-%d %H-%M-%S")

# -----------------------------------------------------------------------------
# 5. Merge with experimental log
# -----------------------------------------------------------------------------
log_path = LOG_DIR / "experiment_log_modified.csv"
df_log   = pd.read_csv(log_path)
df_log["Date_Time"] = pd.to_datetime(
    df_log["date"] + " " + df_log["time"].str.replace("-", ":"),
    format="%Y-%m-%d %H:%M:%S",
)

# Both frames carry "date"/"time" columns, so the merge suffixes them _x / _y.
merged_df = pd.merge(df_M, df_log, how="left", on="Date_Time")

# Normalise known name typos BEFORE comparing IDs.  If a misspelt name is
# compared against the correctly spelt one, the equality test below fails and
# the focal animal itself can end up as its own conspecific.
merged_df = merged_df.replace({"Nougati": "Nougatti", "Olympia ": "Olympia"})

# Add conspecific_ID: the animal of the pair that is not the focal one.
# Rows without a matching log entry get a missing value and are dropped below.
merged_df["conspecific_ID"] = np.where(
    merged_df["focal ID"] == merged_df["ID_left"],
    merged_df["ID_right"],
    merged_df["ID_left"],
)

# -----------------------------------------------------------------------------
# 6. Clean / filter columns exactly as in original code
# -----------------------------------------------------------------------------
cols_to_drop = [
    "recorderID", "labelled", "context", "number", "file name", "file type", "equipment",
    "hab_acc", "period", "ID_left", "ID_right", "day", "month", "year", "comments",
    "Unnamed: 12", "file_prefix", "date_y", "time_y", "ID"
]
merged_df = merged_df.drop(columns=cols_to_drop).dropna(subset=["conspecific_ID"])

# -----------------------------------------------------------------------------
# 7. Sequence numbering
# -----------------------------------------------------------------------------
# Number the sequences 1..n within every (date, focal, conspecific) group.
seq_group_keys = ["date_x", "focal ID", "conspecific_ID"]
merged_df = merged_df.sort_values(by=seq_group_keys).reset_index(drop=True)

# cumcount() replaces the former row-by-row counter.  The result is cast to
# float because that is the dtype the row-by-row version produced, which keeps
# the saved CSV unchanged.
merged_df["seq_num"] = (
    merged_df.groupby(seq_group_keys, sort=False, dropna=False).cumcount() + 1
).astype(float)

# ----------------------------------------------------------------------------- 
# 8. Original string replacements & sequence processing
# -----------------------------------------------------------------------------
# Normalise known name typos and apply one manual pair correction.
merged_df = merged_df.replace({"Nougati": "Nougatti", "Olympia ": "Olympia"})
merged_df.loc[
    (merged_df["focal ID"] == "Odin") & (merged_df["conspecific_ID"] == "Wuschel"),
    "conspecific_ID"
] = "Nougatti"

rep1 = {"0": "x0", "x00": "x0", "Olympia_phee": "mA1", "mb1": "mB1"}

# Label columns are the integer-named columns created when the ragged label
# lists were turned into a DataFrame; column 0 is the file name.  Selecting
# them by name instead of by the fixed position slice [1:19] keeps this correct
# when the longest sequence does not have exactly 18 elements.
seq_cols = [
    c for c in merged_df.columns
    if isinstance(c, (int, np.integer)) and c != 0
]
merged_df[seq_cols] = merged_df[seq_cols].replace(rep1)

# remove rows with 'xx' / 'yy'
# .copy() so the column assignments below act on an independent frame rather
# than on a filtered slice (avoids SettingWithCopyWarning).
merged_df = merged_df[~merged_df[seq_cols].isin(["xx", "yy"]).any(axis=1)].copy()


def _element_category(label):
    """Reduce a label to its second character; 'x0'/'y0' and non-strings pass through."""
    if label in ("x0", "y0"):
        return label
    if isinstance(label, str) and len(label) > 1:
        return label[1]
    return label


# keep only 2nd char except for x0/y0
# Column-wise Series.map is used instead of DataFrame.applymap, which is
# deprecated in recent pandas releases.
merged_df[seq_cols] = merged_df[seq_cols].apply(lambda col: col.map(_element_category))

rep2 = dict.fromkeys(list("BCDEFG"), "A") | {"x0": "x", "y0": "y", "O": ""}
merged_df[seq_cols] = merged_df[seq_cols].replace(rep2)

# Concatenate the labels of each row.  pd.notna() drops every kind of missing
# value (None, float NaN, pd.NA); testing only against None would let a float
# NaN through and append the literal text "nan" to the sequence.
merged_df["sequence"] = merged_df[seq_cols].apply(
    lambda r: "".join(str(v) for v in r.values if pd.notna(v) and v != "nan"),
    axis=1,
)

# true_sequence (remove x / y)
merged_df["true_sequence"] = merged_df["sequence"].str.replace("[xy]", "", regex=True)

# drop sequences of length ≤ 1
filtered_df = merged_df[merged_df["true_sequence"].str.len() > 1].copy()

# ----------------------------------------------------------------------------- 
# 9. Attach phase info
# -----------------------------------------------------------------------------
def extract_phase(file_name: str, label_dir=None):
    """Look up the phase label that belongs to one labelled-sequence file.

    The file name is split on "_".  Everything except the last four parts forms
    the name of the phase-label text file (``<prefix>.txt``); the integer in the
    last part (before the first ".") is a 1-based line number in that file.
    The last whitespace-separated token of that line is returned.

    Parameters
    ----------
    file_name : str
        Name of the labelled-sequence file.
    label_dir : path-like, optional
        Folder holding the phase-label files.  Defaults to the module-level
        ``PHASE_LABEL_DIR``.

    Returns
    -------
    str or None
        The phase token, or ``None`` when the label file does not exist, the
        line number is out of range, or the addressed line is blank.

    Raises
    ------
    ValueError
        If the last part of ``file_name`` does not start with an integer.
    """
    parts = file_name.split("_")
    file_prefix = "_".join(parts[:-4])
    row_num = int(parts[-1].split(".")[0])

    base_dir = PHASE_LABEL_DIR if label_dir is None else Path(label_dir)
    txt_path = base_dir / f"{file_prefix}.txt"
    if not txt_path.exists():
        return None

    with open(txt_path, "r") as f:
        lines = f.readlines()

    if 0 < row_num <= len(lines):
        tokens = lines[row_num - 1].split()
        # A blank line has no tokens; report "no phase" instead of failing
        # with an IndexError.
        if tokens:
            return tokens[-1]
    return None

# Phase label for every sequence, looked up from the sequence's file name (column 0).
filtered_df["phase"] = filtered_df[0].apply(extract_phase)

# Keep phases "1" and "3" only.  .copy() makes the result an independent frame,
# so adding columns below does not trigger SettingWithCopyWarning.
filtered_df = filtered_df[filtered_df["phase"].isin(["1", "3"])].copy()

# -----------------------------------------------------------------------------
# 10. Partner / non-partner flag
# -----------------------------------------------------------------------------
partner_map = {
    "Tabor": "Lola", "Lola": "Tabor",
    "Odin": "Nougatti", "Nougatti": "Odin",
    "Wuschel": "Olympia", "Olympia": "Wuschel",
}

# "partner" when the conspecific is the focal animal's entry in partner_map.
# Focal IDs missing from the map yield NaN, which never compares equal, so
# those rows are flagged "non-partner" – same outcome as a dict .get() lookup.
is_partner = filtered_df["focal ID"].map(partner_map) == filtered_df["conspecific_ID"]
filtered_df["paired_status"] = np.where(is_partner, "partner", "non-partner")

print("Filtered sequence dataframe shape:", filtered_df.shape)

# -----------------------------------------------------------------------------
# 11. Save processed sequence data
# -----------------------------------------------------------------------------
out_path = OUTPUT_SEQ_DIR / "Processed_seq_data.csv"
filtered_df.to_csv(out_path, index=False)
print(f"Saved → {out_path.relative_to(BASE_DIR)}")


  merged_df[seq_cols] = merged_df[seq_cols].applymap(


Filtered sequence dataframe shape: (1619, 33)
Saved → seqeunce data\Processed_seq_data.csv


We now preprocess and clean the spectral data, using the same calls that are present in the sequence data, but now analysing their spectral structure. We store traditional acoustic metrics, MFCCs and their PCAs, along with the audio for each call separately, for later use.

In [10]:
# ─── Spectral merge  ➜  trad/MFCC PCA  ➜  single-call extraction ─────────────
import os, re, numpy as np, pandas as pd, soundfile as sf
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# ── 1. Folders & files (relative to BASE_DIR) ────────────────────────────────
# NOTE: folder and file names are runtime paths and are reproduced exactly.
SPECTRAL_DATA_DIR = BASE_DIR / "spectral data"          # ← new location
LOG_DIR            = BASE_DIR / "experimental log files"
PHASE_LABEL_DIR    = BASE_DIR / "phase change labels" / "proccesed_labels"
SEL_TABLE_DIR      = BASE_DIR / "selection table labels"
SEQUENCE_AUDIO_DIR = BASE_DIR / "audio recording sequences"
EXTRACT_DIR        = BASE_DIR / "extracted calls"

# The extracted single-call .wav files are written into EXTRACT_DIR.  Create it
# up front: if the folder is missing, every write fails and every call is
# reported as "skipped", leaving an empty result table.
EXTRACT_DIR.mkdir(parents=True, exist_ok=True)


file_data   = SPECTRAL_DATA_DIR / "MFCC_spectral_updated.csv"
file_extra  = SPECTRAL_DATA_DIR / "MFCC_spectral_a1.csv"
file_log    = LOG_DIR / "experiment_log_modified.csv"

# ── 2. Load & concatenate raw spectral CSVs ──────────────────────────────────
df_data  = pd.read_csv(file_data)
df_extra = pd.read_csv(file_extra)
df_data  = pd.concat([df_data, df_extra], ignore_index=True)

# ── 3. Parse filename → cols + Date_Time ─────────────────────────────────────
# The file name is split on "_" into exactly twelve parts.
df_data[["stage","type","date","time","recorderID","labelled","focal ID",
         "context","no","Labelled","Note","number"]] = df_data["filename"].str.split("_",expand=True)
df_data["Date_Time"] = pd.to_datetime(df_data["date"] + " " + df_data["time"],
                                      format="%Y-%m-%d %H-%M-%S")

# ── 4. Merge with experimental log ───────────────────────────────────────────
df_log = pd.read_csv(file_log)
df_log["Date_Time"] = pd.to_datetime(df_log["date"] + " " +
                                     df_log["time"].str.replace("-" ,":"),
                                     format="%Y-%m-%d %H:%M:%S")

# Both frames carry "date"/"time" columns, so the merge suffixes them _x / _y.
merged_df = pd.merge(df_data, df_log, how="left", on="Date_Time")

# Normalise known name typos BEFORE comparing IDs.  If a misspelt name is
# compared against the correctly spelt one, the equality test below fails and
# the focal animal itself can end up as its own conspecific.
merged_df = merged_df.replace({"Nougati":"Nougatti","Olympia ":"Olympia"})

# conspecific_ID: the animal of the pair that is not the focal one.
# Rows without a matching log entry get a missing value and are dropped below.
merged_df["conspecific_ID"] = np.where(
    merged_df["focal ID"] == merged_df["ID_left"],
    merged_df["ID_right"],
    merged_df["ID_left"],
)

# tidy
drop_cols = ["recorderID","labelled","context","number","file name","file type","equipment",
             "hab_acc","period","ID_left","ID_right","day","month","year","comments",
             "file_prefix","date_y","time_y","ID","no","Labelled","type","Date_Time","Unnamed: 12"]
merged_df = merged_df.drop(columns=drop_cols).dropna(subset=["conspecific_ID"])

# manual pair correction
merged_df.loc[(merged_df["focal ID"]=="Odin") & (merged_df["conspecific_ID"]=="Wuschel"),
              "conspecific_ID"] = "Nougatti"

# keep only the first three characters of the call label
merged_df["Note"] = merged_df["Note"].str.slice(0,3)

# filename fix (rm trailing _NN.wav)
merged_df["filename"] = merged_df["filename"].str.replace(r"_\d+\.wav$", ".wav", regex=True)

# ── 5. Phase filter (1 & 3 only) ─────────────────────────────────────────────
def extract_phase(fname:str, label_dir=None):
    """Look up the phase label that belongs to one single-call file.

    NOTE: this redefines ``extract_phase`` from the sequence cell above; the
    two versions differ only in how the file name is sliced.

    The file name is split on "_".  Everything except the last six parts forms
    the name of the phase-label text file (``<prefix>.txt``); the integer in the
    third-from-last part is a 1-based line number in that file.  The last
    whitespace-separated token of that line is returned.

    Parameters
    ----------
    fname : str
        Single-call file name (after the trailing ``_NN`` counter was removed).
    label_dir : path-like, optional
        Folder holding the phase-label files.  Defaults to the module-level
        ``PHASE_LABEL_DIR``.

    Returns
    -------
    str or None
        The phase token, or ``None`` when the label file does not exist, the
        line number is out of range, or the addressed line is blank.

    Raises
    ------
    ValueError
        If the third-from-last part of ``fname`` does not start with an integer.
    """
    parts = fname.split("_")
    prefix = "_".join(parts[:-6])
    row_num = int(parts[-3].split(".")[0])

    base_dir = PHASE_LABEL_DIR if label_dir is None else Path(label_dir)
    txt = base_dir / f"{prefix}.txt"
    if not txt.exists():
        return None

    lines = txt.read_text().splitlines()
    if 0 < row_num <= len(lines):
        tokens = lines[row_num-1].split()
        # A blank line has no tokens; report "no phase" instead of failing
        # with an IndexError.
        if tokens:
            return tokens[-1]
    return None

# Phase label for every call, looked up from the call's file name.
merged_df["phase"] = merged_df["filename"].apply(extract_phase)

# Keep phases "1" and "3" only.  .copy() makes the result an independent frame,
# so the many columns added afterwards do not trigger SettingWithCopyWarning.
merged_df = merged_df[merged_df["phase"].isin(["1","3"])].copy()

# ── 6. Partner / non-partner flag ────────────────────────────────────────────
partner_map = {"Tabor":"Lola","Lola":"Tabor",
               "Odin":"Nougatti","Nougatti":"Odin",
               "Wuschel":"Olympia","Olympia":"Wuschel"}

# "partner" when the conspecific is the focal animal's entry in partner_map.
# Focal IDs missing from the map yield NaN, which never compares equal, so
# those rows are flagged "non-partner" – same outcome as a dict .get() lookup.
is_partner = merged_df["focal ID"].map(partner_map) == merged_df["conspecific_ID"]
merged_df["paired_status"] = np.where(is_partner, "partner", "non-partner")

# ── 7. Z-score + PCA (trad & MFCC blocks) ────────────────────────────────────
# Each feature block is standardised, reduced to 5 principal components, and
# the component scores are stored as <block>_PC1 … <block>_PC5 columns.
trad_cols = [
    "Dur 90% (s)", "Dur 50% (s)", "Center Freq (Hz)", "Freq 5% (Hz)",
    "Freq 25% (Hz)", "Freq 75% (Hz)", "Freq 95% (Hz)", "BW 50% (Hz)",
    "BW 90% (Hz)", "Avg Entropy (bits)", "Agg Entropy (bits)",
]
mfcc_cols = list(map(str, range(1, 133)))      # columns named "1" … "132"

# ------------------------------------------------------------------
# Traditional metrics  ➜  PCA
# ------------------------------------------------------------------
scaler_trad = StandardScaler()
trad_scaled = scaler_trad.fit_transform(merged_df[trad_cols])

pca_trad    = PCA(n_components=5, random_state=42)
trad_scores = pca_trad.fit_transform(trad_scaled)

# one DataFrame column per principal component
for pc_number, pc_values in enumerate(trad_scores.T, start=1):
    merged_df[f"trad_PC{pc_number}"] = pc_values

# ------------------------------------------------------------------
# MFCC metrics  ➜  PCA
# ------------------------------------------------------------------
scaler_mfcc = StandardScaler()
mfcc_scaled = scaler_mfcc.fit_transform(merged_df[mfcc_cols])

pca_mfcc    = PCA(n_components=5, random_state=42)
mfcc_scores = pca_mfcc.fit_transform(mfcc_scaled)

for pc_number, pc_values in enumerate(mfcc_scores.T, start=1):
    merged_df[f"mfcc_PC{pc_number}"] = pc_values

# ------------------------------------------------------------------
# Variance report
# ------------------------------------------------------------------
# share of variance explained by each component, in percent
trad_var = 100 * pca_trad.explained_variance_ratio_
mfcc_var = 100 * pca_mfcc.explained_variance_ratio_

print("\nVariance explained (%)")
print("Traditional:", np.round(trad_var, 2))
print("MFCC:       ", np.round(mfcc_var, 2))

# variance captured by the 5 retained components together
total_var_trad = trad_var.sum()
total_var_mfcc = mfcc_var.sum()

print(f"Traditional features – variance captured by 5 PCs: {total_var_trad:.2f}%")
print(f"MFCC features        – variance captured by 5 PCs: {total_var_mfcc:.2f}%")


# ── 8. Extract single-call audio & attach path ───────────────────────────────
def extract_single_call(row):
    """Cut one call out of its sequence recording and save it as its own .wav.

    ``row["filename"]`` is split at the last ``"_labelled_"``: the part before
    it names both the selection table (``<base>.txt`` in ``SEL_TABLE_DIR``) and
    the sequence recording (``<base>_labelled.wav`` in ``SEQUENCE_AUDIO_DIR``);
    the part after it, minus ``".wav"``, is the call label.  The first
    selection-table line whose third whitespace-separated field equals the
    label supplies the start and end time in seconds.  That span is sliced out
    of the recording and written to ``EXTRACT_DIR`` under the row's file name.

    Parameters
    ----------
    row : mapping
        Must provide a ``"filename"`` entry (e.g. a DataFrame row).

    Returns
    -------
    str or pd.NA
        Path of the written file, or ``pd.NA`` if anything went wrong.  Failures
        are deliberately non-fatal: a warning is printed and the row is skipped.
    """
    full_fname = row["filename"]
    try:
        seq_base, label_part = full_fname.rsplit("_labelled_", 1)
        call_label = label_part.replace(".wav", "")

        sel_tab = SEL_TABLE_DIR / f"{seq_base}.txt"
        if not sel_tab.exists():
            raise FileNotFoundError(sel_tab)

        start_s = end_s = None
        for ln in sel_tab.read_text().splitlines():
            fields = ln.split()
            # Skip blank or truncated lines instead of letting a failed
            # three-way unpack abort the search for this call.
            if len(fields) < 3:
                continue
            start_txt, end_txt, lbl = fields[:3]
            if lbl == call_label:
                start_s, end_s = float(start_txt), float(end_txt)
                break
        if start_s is None:
            raise ValueError(f"label {call_label} not found")

        seq_wav = SEQUENCE_AUDIO_DIR / f"{seq_base}_labelled.wav"
        if not seq_wav.exists():
            raise FileNotFoundError(seq_wav)

        # seconds → sample indices via the recording's sample rate
        audio, sr = sf.read(seq_wav, always_2d=False)
        segment = audio[int(start_s*sr): int(end_s*sr)]

        out_path = EXTRACT_DIR / full_fname
        sf.write(out_path, segment, sr)
        return str(out_path)

    except Exception as err:
        print(f"⚠️  {full_fname} skipped – {err}")
        return pd.NA

# Extract every call; rows whose extraction failed carry a missing path.
merged_df["call_audio_path"] = merged_df.apply(extract_single_call, axis=1)

# Keep only the rows for which an audio file was written.
has_audio = merged_df["call_audio_path"].notna()
clean_df = merged_df.loc[has_audio].reset_index(drop=True)

# ── 9. Save ONLY the with-paths CSV to the spectral data folder ──────────────
out_csv = SPECTRAL_DATA_DIR / "Processed_spec_data.csv"
clean_df.to_csv(out_csv, index=False)

n_total, n_with_audio = len(merged_df), len(clean_df)
print("\n---- SUMMARY ----")
print("Rows total      :", n_total)
print("Rows w/ audio   :", n_with_audio)
print("Saved CSV       :", out_csv.relative_to(BASE_DIR))



Variance explained (%)
Traditional: [45.97 33.45 12.16  4.06  2.24]
MFCC:        [34.19 21.91 11.32  6.29  5.41]
Traditional features – variance captured by 5 PCs: 97.88%
MFCC features        – variance captured by 5 PCs: 79.12%
⚠️  after_phees_2022-07-21_13-48-50_0000035_labelled_Tabor_pheeseq_0000020_labelled_mC1.wav skipped – c:\Users\nakul\Desktop\marmoset project\2025_Work_Re-start_Marmoset\2025\selection table labels\after_phees_2022-07-21_13-48-50_0000035_labelled_Tabor_pheeseq_0000020.txt
⚠️  after_phees_2022-07-21_13-48-50_0000035_labelled_Tabor_pheeseq_0000020_labelled_mC2.wav skipped – c:\Users\nakul\Desktop\marmoset project\2025_Work_Re-start_Marmoset\2025\selection table labels\after_phees_2022-07-21_13-48-50_0000035_labelled_Tabor_pheeseq_0000020.txt
⚠️  after_phees_2022-07-21_13-48-50_0000035_labelled_Tabor_pheeseq_0000020_labelled_mC3.wav skipped – c:\Users\nakul\Desktop\marmoset project\2025_Work_Re-start_Marmoset\2025\selection table labels\after_phees_2022-07-21_13-