In [4]:
import os, re, string, json
import pandas as pd
from pathlib import Path
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords
from textblob import TextBlob
from tqdm import tqdm

# 0) make sure every NLTK resource this notebook relies on is available
import nltk, ssl, warnings
warnings.filterwarnings("ignore")   # NOTE(review): hides *all* warnings for the rest of the session

# nltk.data.find() lookup path → nltk.download() package id.
# NOTE(review): recent NLTK releases may look for "punkt_tab" /
# "averaged_perceptron_tagger_eng" instead — confirm against the installed version.
_NLTK_RESOURCES = {
    "tokenizers/punkt"                   : "punkt",
    "taggers/averaged_perceptron_tagger" : "averaged_perceptron_tagger",
    "corpora/stopwords"                  : "stopwords",
}

# Probe each resource on its own: checking only punkt would skip the download
# of a missing tagger / stop-word list whenever punkt happens to be installed.
_missing_pkgs = []
for _res_path, _pkg in _NLTK_RESOURCES.items():
    try:
        nltk.data.find(_res_path)
    except LookupError:
        _missing_pkgs.append(_pkg)

if _missing_pkgs:
    # SECURITY NOTE: the download below runs with HTTPS certificate verification
    # disabled (work-around kept from the original cell). The override is
    # restored afterwards so it does not leak into later network calls.
    _saved_https_context = ssl._create_default_https_context
    ssl._create_default_https_context = ssl._create_unverified_context
    try:
        for _pkg in _missing_pkgs:
            nltk.download(_pkg)
    finally:
        ssl._create_default_https_context = _saved_https_context

# ─── paths ──────────────────────────────────────────────────────────────
# All locations are resolved against the directory the notebook is run from.
BASE_DIR = Path.cwd()
IN_DIR   = BASE_DIR.joinpath("Transcripts")   # input:  *_TRANSCRIPT.csv files
OUT_DIR  = BASE_DIR.joinpath("Processed")     # output: generated feature files
OUT_DIR.mkdir(exist_ok=True)                  # no-op when the folder already exists

# helper: convert one row string → dict
# NOTE: because (?P<text>.*) is greedy, the trailing `"?` below never consumes a
# closing quote; parse_row() removes it explicitly for quoted rows.
ROW_RGX = re.compile(
    r'^"?(?P<start>[\d.]+)\s+(?P<stop>[\d.]+)\s+(?P<speaker>\w+)\s+(?P<text>.*)"?$'
)
def parse_row(row: str):
    """Parse one transcript line into a dict, or return None if it is not a data row.

    A data row is four whitespace-separated fields, optionally wrapped in
    double quotes as a whole: start time, stop time, speaker, utterance text.

    Parameters
    ----------
    row : str
        One raw line from a transcript file (a trailing newline is fine).

    Returns
    -------
    dict or None
        ``{"start": float, "stop": float, "speaker": str, "text": str}``, or
        ``None`` when the line does not match the row pattern (e.g. a header
        line) or when a time field is not a valid number.
    """
    line = row.strip()
    m = ROW_RGX.match(line)
    if not m:
        return None
    d = m.groupdict()

    # [\d.]+ also accepts tokens such as "1.2.3" or "."; treat those as
    # unparseable rows instead of letting float() abort the whole run.
    try:
        start = float(d["start"])
        stop = float(d["stop"])
    except ValueError:
        return None

    # Drop the closing quote of a fully quoted row; a quote that merely ends
    # the text of an unquoted row is part of the utterance and is kept.
    text = d["text"]
    if line.startswith('"') and text.endswith('"'):
        text = text[:-1]

    return {
        "start"   : start,
        "stop"    : stop,
        "speaker" : d["speaker"],
        "text"    : text.strip()
    }

# helper: clean utterance text
STOP    = set(stopwords.words("english"))   # NLTK English stop-word list
FILLERS = {"uh", "um", "mhm", "hmm"}        # hesitation sounds to discard
PUNCT   = set(string.punctuation)           # ASCII punctuation characters
def clean_text(t):
    """Return a lower-cased, space-joined version of `t` with noise removed.

    Steps: square-bracketed spans (e.g. "[laughter]") are replaced by a
    space, the rest is tokenised with `word_tokenize`, and tokens are dropped
    when their lower-cased form is a stop word or filler, or when they
    consist solely of punctuation characters.
    """
    without_brackets = re.sub(r"\[.*?\]", " ", t)

    kept = []
    for token in word_tokenize(without_brackets):
        lowered = token.lower()
        if lowered in STOP or lowered in FILLERS:
            continue
        if all(ch in PUNCT for ch in token):
            continue
        kept.append(lowered)

    return " ".join(kept)

# helper: feature extraction for one utterance
def utterance_feats(text):
    """Compute length, sentiment and part-of-speech features for one utterance.

    Parameters
    ----------
    text : str
        The utterance text to analyse.

    Returns
    -------
    dict
        ``len_char`` (characters), ``len_words`` (whitespace-separated words),
        ``polarity`` / ``subjectivity`` (TextBlob sentiment), and
        ``noun_ratio`` / ``verb_ratio`` (share of tokens whose POS tag starts
        with "NN" / "VB").
    """
    sentiment = TextBlob(text).sentiment
    # Tokenise and tag once, then reuse the tags for every ratio; tagging the
    # same utterance separately per ratio doubles the most expensive step.
    tags = pos_tag(word_tokenize(text))
    return {
        "len_char"     : len(text),
        "len_words"    : len(text.split()),
        "polarity"     : sentiment.polarity,
        "subjectivity" : sentiment.subjectivity,
        "noun_ratio"   : _tag_ratio(tags, "NN"),
        "verb_ratio"   : _tag_ratio(tags, "VB"),
    }

def _tag_ratio(tags, tag_prefix):
    """Fraction of ``(token, tag)`` pairs whose tag starts with `tag_prefix`.

    Returns 0.0 for an empty tag list so the result is always a float.
    """
    if not tags:
        return 0.0
    return sum(1 for _, t in tags if t.startswith(tag_prefix)) / len(tags)

def _pos_ratio(txt, tag_prefix):
    """Tokenise and POS-tag `txt`, then return the share of tokens whose tag
    starts with `tag_prefix` (0.0 when `txt` yields no tokens)."""
    return _tag_ratio(pos_tag(word_tokenize(txt)), tag_prefix)

# ─── iterate over all transcript files in "Transcripts" folder ─────────────────────────────────────────
# For every <id>_TRANSCRIPT.csv: parse rows → clean text → per-utterance
# features → session-level mean/std summary, then write both to OUT_DIR.
for fn in tqdm(sorted(IN_DIR.glob("*_TRANSCRIPT.csv"))):
    participant_id = fn.stem.split("_")[0]   # Extract participant ID (e.g., "300")

    # Parse every line exactly once; lines that are not data rows (such as a
    # header) come back as None and are dropped.
    with open(fn, encoding="utf8") as f:
        rows = [row for row in map(parse_row, f) if row]

    # An empty / header-only file would otherwise fail below with a KeyError
    # on "text" and an IndexError on the duration lookup.
    if not rows:
        tqdm.write(f"⚠ Skipped {fn.name}: no parseable rows")
        continue

    df = pd.DataFrame(rows)
    df["text_clean"] = df["text"].apply(clean_text)

    # utterance‑level features
    # NOTE(review): features are computed on text_clean, i.e. after stop-word
    # removal. NLTK's English stop-word list appears to include negations such
    # as "not" — confirm this is intended for the polarity score.
    feats = pd.DataFrame(df["text_clean"].apply(utterance_feats).tolist())
    df = pd.concat([df, feats], axis=1)

    # session‑level aggregates (mean, std)
    # NOTE: with a single utterance the std values are NaN, which json.dump
    # writes as the non-standard token NaN.
    agg = df[["polarity", "subjectivity",
              "len_char", "len_words",
              "noun_ratio", "verb_ratio"]].agg(["mean","std"]).unstack().to_dict()
    agg = {f"{k[0]}_{k[1]}": v for k,v in agg.items()}  # flatten (column, stat) keys

    # Utterances can overlap and need not end in row order, so span the whole
    # session with max/min rather than last-row stop minus first-row start.
    agg["duration_total"] = float(df["stop"].max() - df["start"].min())
    agg["participant"]    = participant_id

    # save utterance features
    df.to_parquet(OUT_DIR / f"{participant_id}_utterances.parquet", index=False)

    # save session summary in JSON format
    with open(OUT_DIR / f"{participant_id}_transcript_features.json", "w", encoding="utf8") as fp:
        json.dump(agg, fp, indent=2)

    # tqdm.write keeps the message from being interleaved with the progress bar
    tqdm.write(f"✓ Processed {fn.name} → features saved")

print("\nAll transcripts converted. Utterance‑level files (*.parquet) and "
      "session‑level JSON summaries are in the 'Processed' folder.")


 12%|██████████▌                                                                         | 1/8 [00:00<00:03,  2.28it/s]

✓ Processed 300_TRANSCRIPT.csv → features saved


 25%|█████████████████████                                                               | 2/8 [00:00<00:02,  2.87it/s]

✓ Processed 301_TRANSCRIPT.csv → features saved


 38%|███████████████████████████████▌                                                    | 3/8 [00:00<00:01,  3.46it/s]

✓ Processed 302_TRANSCRIPT.csv → features saved


 50%|██████████████████████████████████████████                                          | 4/8 [00:01<00:01,  3.42it/s]

✓ Processed 303_TRANSCRIPT.csv → features saved


 62%|████████████████████████████████████████████████████▌                               | 5/8 [00:01<00:00,  3.46it/s]

✓ Processed 304_TRANSCRIPT.csv → features saved


 75%|███████████████████████████████████████████████████████████████                     | 6/8 [00:01<00:00,  3.58it/s]

✓ Processed 319_TRANSCRIPT.csv → features saved


 88%|█████████████████████████████████████████████████████████████████████████▌          | 7/8 [00:02<00:00,  3.25it/s]

✓ Processed 320_TRANSCRIPT.csv → features saved


100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:02<00:00,  3.24it/s]

✓ Processed 321_TRANSCRIPT.csv → features saved

All transcripts converted. Utterance‑level files (*.parquet) and session‑level JSON summaries are in the 'Processed' folder.





In [8]:
import pandas as pd

# path to one of the per-participant utterance files written by the processing cell
file_path = "Processed/301_utterances.parquet"

# load into a DataFrame
df = pd.read_parquet(file_path)

# quick look
df.head()       # first 5 rows
       # column dtypes & non‑null counts: see the df.info() cell that follows


Unnamed: 0,start,stop,speaker,text,text_clean,len_char,len_words,polarity,subjectivity,noun_ratio,verb_ratio
0,29.428,35.888,Ellie,hi i'm ellie thanks for coming in today i was ...,hi 'm ellie thanks coming today created talk p...,75,12,0.366667,0.433333,0.583333,0.25
1,32.738,33.068,Participant,thank you,thank,5,1,0.0,0.0,1.0,0.0
2,36.598,40.948,Ellie,think of me as a friend i don't judge i can't ...,think friend n't judge ca n't 'm computer,41,8,0.0,0.0,0.375,0.25
3,42.088,42.518,Participant,mmm k,mmm k,5,2,0.0,0.0,1.0,0.0
4,42.358,51.738,Ellie,i'm here to learn about people and would love ...,'m learn people would love learn 'll ask quest...,125,20,0.3,0.716667,0.35,0.3


In [3]:
# Column dtypes and non-null counts of `df` (presumably the frame read from
# the utterances parquet file).
# NOTE(review): this cell's execution count (3) is lower than the loading
# cell's (8), so the notebook was run out of order — re-run top to bottom.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 174 entries, 0 to 173
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   start         174 non-null    float64
 1   stop          174 non-null    float64
 2   speaker       174 non-null    object 
 3   text          174 non-null    object 
 4   text_clean    174 non-null    object 
 5   len_char      174 non-null    int64  
 6   len_words     174 non-null    int64  
 7   polarity      174 non-null    float64
 8   subjectivity  174 non-null    float64
 9   noun_ratio    174 non-null    float64
 10  verb_ratio    174 non-null    float64
dtypes: float64(6), int64(2), object(3)
memory usage: 15.1+ KB
