In [1]:
import os

# Resolve the project layout (<project>/src/...) whether this code runs as a
# plain script stored in src/ or as a notebook cell.
try:
    # Plain Python script inside src/: __file__ is defined.
    this_file = os.path.abspath(__file__)
    src_root = os.path.dirname(this_file)                        # EMOTION-PRED/src
    project_root = os.path.dirname(src_root)                    # EMOTION-PRED/
except NameError:
    # Jupyter defines no __file__, so infer the layout from the kernel's
    # working directory instead.
    cwd = os.getcwd()

    if cwd.endswith("notebooks"):
        # Kernel started in src/notebooks -> src/ is one level up.
        src_root = os.path.abspath(os.path.join(cwd, ".."))
        project_root = os.path.dirname(src_root)
    elif os.path.basename(cwd) == "src" and not os.path.isdir(os.path.join(cwd, "src")):
        # Kernel started in src/ itself. Without this branch the fallback
        # below would produce <project>/src/src. The isdir() check keeps the
        # old behaviour for a project root that merely happens to be named
        # "src" and contains its own src/ directory.
        src_root = cwd
        project_root = os.path.dirname(src_root)
    else:
        # Kernel started in the project root.
        project_root = cwd
        src_root = os.path.join(project_root, "src")

# Final unified paths
results_root = os.path.join(src_root, "results")
data_root = os.path.join(src_root, "data","MAMS-ACSA","raw","data_jsonl")
print(f"ðŸ“‚ Project root: {project_root}"
      f"\nðŸ“‚ Source root: {src_root}"
      f"\nðŸ“‚ Results root: {results_root}"
      f"\nðŸ“‚ Data root: {data_root}")
# JSONL files used by the rest of the notebook
TRAIN_JSONL = os.path.join(data_root, "train.jsonl")
VAL_JSONL   = os.path.join(data_root, "val.jsonl")
TEST_JSONL  = os.path.join(data_root, "test.jsonl")
SAMPLE_JSONL = os.path.join(data_root, "sample.jsonl")
print("Using dataset directory:", data_root)



ðŸ“‚ Project root: /Users/hd/Desktop/EMOTION-PRED
ðŸ“‚ Source root: /Users/hd/Desktop/EMOTION-PRED/src
ðŸ“‚ Results root: /Users/hd/Desktop/EMOTION-PRED/src/results
ðŸ“‚ Data root: /Users/hd/Desktop/EMOTION-PRED/src/data/MAMS-ACSA/raw/data_jsonl
Using dataset directory: /Users/hd/Desktop/EMOTION-PRED/src/data/MAMS-ACSA/raw/data_jsonl


In [2]:
!head -n 3 data/MAMS-ACSA/raw/data_jsonl/train.jsonl

head: data/MAMS-ACSA/raw/data_jsonl/train.jsonl: No such file or directory


In [3]:
import os
import json
import pandas as pd
import numpy as np

# ------------------------------------------------------------
# 1) Load dataset
# ------------------------------------------------------------
PATH = TRAIN_JSONL     # <-- your input jsonl path

# One JSON object per line. Blank lines (for example a trailing newline at the
# end of the file) are skipped instead of crashing json.loads.
with open(PATH, "r", encoding="utf-8") as f:
    rows = [json.loads(line) for line in f if line.strip()]

df = pd.DataFrame(rows)

# Keep the original row position of each sentence so that aspect-level rows
# created later can be mapped back to their sentence.
df["orig_id"] = df.index


# ------------------------------------------------------------
# 2) Explode: each aspect becomes one row
# ------------------------------------------------------------
# A sentence whose "output" list is empty explodes to a single NaN row; drop
# those rows so the dict lookups below never receive a non-dict value.
df_ex = (
    df.explode("output")
      .dropna(subset=["output"])
      .reset_index(drop=True)
)

df_ex["aspect"]   = df_ex["output"].apply(lambda x: x["aspect"])
df_ex["polarity"] = df_ex["output"].apply(lambda x: x["polarity"])

# stratum = aspect + polarity (e.g. food_positive)
df_ex["stratum"] = df_ex["aspect"] + "_" + df_ex["polarity"]


# ------------------------------------------------------------
# 3) Proportional per-stratum quotas for a sample of TARGET sentences
# ------------------------------------------------------------
TARGET = 300

stratum_counts = df_ex["stratum"].value_counts()
weights = stratum_counts.div(stratum_counts.sum())

# Round each stratum's share of TARGET to a whole number, but never let a
# stratum's quota fall below one.
sizes = weights.mul(TARGET).round().astype(int).clip(lower=1)


# ------------------------------------------------------------
# 4) First pass: draw sentences stratum by stratum
# ------------------------------------------------------------
selected_sentence_ids = set()
np.random.seed(42)

for stratum, quota in sizes.items():
    # distinct sentences carrying at least one label of this stratum
    candidate_ids = df_ex.loc[df_ex["stratum"] == stratum, "orig_id"].unique()

    # a stratum cannot contribute more sentences than it contains
    take = min(quota, len(candidate_ids))
    picked = np.random.choice(candidate_ids, take, replace=False)

    # a sentence drawn for several strata is stored only once
    selected_sentence_ids |= set(picked)


# ------------------------------------------------------------
# 4b) Adjust to EXACTLY TARGET sentences
# ------------------------------------------------------------
selected_sentence_ids = set(selected_sentence_ids)
current_n = len(selected_sentence_ids)
missing = TARGET - current_n

if missing > 0:
    # Undershoot (sentences shared between strata are counted once):
    # top up with sentences that were not selected yet.
    all_sentence_ids = set(df["orig_id"].unique())
    # Sorted so the draw does not depend on set iteration order.
    remaining = sorted(all_sentence_ids - selected_sentence_ids)

    # Never request more sentences than are left; np.random.choice raises
    # when asked for a larger sample than the population without replacement.
    n_extra = min(missing, len(remaining))
    if n_extra < missing:
        print(f"Warning: only {n_extra} unselected sentences left; "
              f"the sample will contain fewer than {TARGET} sentences.")

    if n_extra > 0:
        np.random.seed(42)
        extra = np.random.choice(remaining, n_extra, replace=False)
        selected_sentence_ids.update(extra)
elif missing < 0:
    # Overshoot: rounding plus the minimum quota of 1 per stratum can push the
    # first pass beyond TARGET, so thin the selection back down at random.
    # NOTE: this can remove the only sentence of a very rare stratum.
    np.random.seed(42)
    kept = np.random.choice(sorted(selected_sentence_ids), TARGET, replace=False)
    selected_sentence_ids = set(kept)


# ------------------------------------------------------------
# 5) Build the final sentence-level sample
# ------------------------------------------------------------
is_selected = df["orig_id"].isin(selected_sentence_ids)
selected_df = (
    df.loc[is_selected]
      .drop(columns=["orig_id"])            # helper column is not part of the dataset
      .sample(frac=1, random_state=42)      # reproducible shuffle of the rows
      .reset_index(drop=True)
)

print("Final sentence count:", len(selected_df))  # expected to equal TARGET


# ------------------------------------------------------------
# 6) Write the sample as JSON Lines (one record per line)
# ------------------------------------------------------------
OUT_PATH = SAMPLE_JSONL     # <-- output jsonl path
selected_df.to_json(OUT_PATH, orient="records", lines=True, force_ascii=False)

print("Saved to:", OUT_PATH)

Final sentence count: 300
Saved to: /Users/hd/Desktop/EMOTION-PRED/src/data/MAMS-ACSA/raw/data_jsonl/sample.jsonl


In [4]:
import json
import pandas as pd
import numpy as np
from collections import Counter

# -------------------------------------------------------------------
# 1) Load FULL TRAIN and SAMPLED 300
# -------------------------------------------------------------------

TRAIN_PATH = TRAIN_JSONL
SAMPLE_PATH = SAMPLE_JSONL


def _read_jsonl(path):
    """Return the records of a JSON Lines file as a list of parsed objects.

    Blank lines (such as a trailing newline at the end of the file) are
    skipped rather than passed to ``json.loads``, which would raise on them.
    """
    with open(path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]


# Load full dataset
train_rows = _read_jsonl(TRAIN_PATH)
df_train = pd.DataFrame(train_rows)

# Load sampled dataset
sample_rows = _read_jsonl(SAMPLE_PATH)
df_sample = pd.DataFrame(sample_rows)

# -------------------------------------------------------------------
# 2) Explode both (aspect-level rows)
# -------------------------------------------------------------------

def explode_df(df):
    """Expand a sentence-level frame into one row per aspect annotation.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``output`` column holding, per row, a list of dicts
        with the keys ``"aspect"`` and ``"polarity"``.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index and three added columns:
        ``aspect``, ``polarity`` and ``joint`` (``"<aspect>_<polarity>"``).
        Rows whose ``output`` list is empty contribute no annotation rows.
    """
    # An empty "output" list explodes to a single NaN row; drop it so the
    # dict lookups below never receive a non-dict value.
    ex = (
        df.explode("output")
          .dropna(subset=["output"])
          .reset_index(drop=True)
    )
    ex["aspect"] = ex["output"].apply(lambda x: x["aspect"])
    ex["polarity"] = ex["output"].apply(lambda x: x["polarity"])
    ex["joint"] = ex["aspect"] + "_" + ex["polarity"]
    return ex

# Aspect-level views of the full train set and of the sample
df_train_ex, df_sample_ex = (explode_df(frame) for frame in (df_train, df_sample))

# -------------------------------------------------------------------
# 3) Compute distributions
# -------------------------------------------------------------------

def compute_dist(df, column):
    """Return the value counts of ``df[column]`` and their percentage shares.

    The second element of the returned pair holds each count as a percentage
    of the total, rounded to two decimals.
    """
    counts = df[column].value_counts()
    total = counts.sum()
    pct = counts.div(total).mul(100).round(2)
    return counts, pct

# Each level is computed for the train frame first, then for the sample frame.

# Aspect-level
(train_aspect_counts, train_aspect_pct), (sample_aspect_counts, sample_aspect_pct) = (
    compute_dist(frame, "aspect") for frame in (df_train_ex, df_sample_ex)
)

# Polarity-level
(train_pol_counts, train_pol_pct), (sample_pol_counts, sample_pol_pct) = (
    compute_dist(frame, "polarity") for frame in (df_train_ex, df_sample_ex)
)

# Joint aspect x polarity
(train_joint_counts, train_joint_pct), (sample_joint_counts, sample_joint_pct) = (
    compute_dist(frame, "joint") for frame in (df_train_ex, df_sample_ex)
)

# -------------------------------------------------------------------
# 4) Create comparison tables
# -------------------------------------------------------------------

def compare_dists(train_pct, sample_pct, name):
    """Print and return a side-by-side table of two percentage distributions.

    The table has the columns ``Train %``, ``Sample %`` and ``Difference``
    (sample minus train, rounded to two decimals). A category present in only
    one of the inputs gets 0 for the other. ``name`` is upper-cased into the
    banner printed above the table.
    """
    print(f"\n====================== {name.upper()} ======================")
    columns = {"Train %": train_pct, "Sample %": sample_pct}
    table = pd.DataFrame(columns).fillna(0)
    table["Difference"] = table["Sample %"].sub(table["Train %"]).round(2)
    print(table)
    return table

# One comparison table per level, printed in this order: aspect, polarity, joint
aspect_compare, polarity_compare, joint_compare = (
    compare_dists(train_col, sample_col, title)
    for train_col, sample_col, title in (
        (train_aspect_pct, sample_aspect_pct, "Aspect Category"),
        (train_pol_pct, sample_pol_pct, "Polarity"),
        (train_joint_pct, sample_joint_pct, "Aspect Ã— Polarity (Joint)"),
    )
)

# -------------------------------------------------------------------
# 5) KL Divergence (measure of distribution similarity)
# -------------------------------------------------------------------

def kl_divergence(p, q):
    """Return the KL divergence sum(p * log(p / q)) of two distributions.

    Parameters
    ----------
    p, q : pandas.Series or array-like
        Probabilities per category. When both are Series they are first
        aligned on their index labels, so the category order may differ and a
        category missing from one side counts as probability 0 there. Other
        array-likes are compared position by position.

    Returns
    -------
    numpy.float64
        The divergence in nats. Zero probabilities are replaced by 1e-9
        before taking the logarithm.
    """
    eps = 1e-9

    # value_counts() orders categories by frequency, so two Series can list
    # the same categories in a different order (or have different lengths).
    # Align by label before dropping down to bare arrays.
    if isinstance(p, pd.Series) and isinstance(q, pd.Series):
        p, q = p.align(q, join="outer", fill_value=0)

    p = np.asarray(p, dtype=float)
    q = np.asarray(q, dtype=float)

    # Replace 0s to avoid log(0) and division by zero
    p = np.where(p == 0, eps, p)
    q = np.where(q == 0, eps, q)
    return np.sum(p * np.log(p / q))

# Percentages are converted back to fractions before measuring divergence
kl_aspect, kl_polarity, kl_joint = (
    kl_divergence(train_col / 100, sample_col / 100)
    for train_col, sample_col in (
        (train_aspect_pct, sample_aspect_pct),
        (train_pol_pct, sample_pol_pct),
        (train_joint_pct, sample_joint_pct),
    )
)

print("\n====================== KL DIVERGENCE ======================")
print("Aspect KL:", round(kl_aspect, 4))
print("Polarity KL:", round(kl_polarity, 4))
print("Joint KL:", round(kl_joint, 4))

# -------------------------------------------------------------------
# 6) Chi-square test (optional statistical test)
# -------------------------------------------------------------------
from scipy.stats import chisquare

def chi_test(train_counts, sample_counts, name):
    """Chi-square goodness-of-fit test of the sample counts against train.

    Both count Series are laid out over the sorted union of their categories
    (absent categories count as 0). The train counts are rescaled to the
    sample's total and used as expected frequencies. The statistic and
    p-value are printed under a banner containing ``name`` and returned as a
    ``(chi, p)`` pair.
    """
    categories = sorted(set(train_counts.index).union(sample_counts.index))

    def as_vector(counts):
        # counts for every category in the shared order, 0 where absent
        return np.array([counts.get(label, 0) for label in categories])

    train_vec = as_vector(train_counts)
    observed = as_vector(sample_counts)

    # expected frequencies: train proportions scaled to the sample's total
    expected = train_vec / train_vec.sum() * observed.sum()

    chi, p = chisquare(f_obs=observed, f_exp=expected)
    print(f"\n========== Chi-square test: {name} ==========")
    print("Chi-square:", round(chi, 4), "| p-value:", round(p, 4))
    return chi, p

# Compare sample counts with train counts at each level; every call prints its own result.
chi_test(train_aspect_counts, sample_aspect_counts, "Aspect")
chi_test(train_pol_counts, sample_pol_counts, "Polarity")
# Last expression of the cell: its returned (chi, p) pair is also echoed as the cell's output value.
chi_test(train_joint_counts, sample_joint_counts, "Joint")


               Train %  Sample %  Difference
aspect                                      
ambience          4.57      4.42       -0.15
food             32.54     31.24       -1.30
menu              6.70      7.42        0.72
miscellaneous    13.46     14.12        0.66
place             9.79      9.99        0.20
price             4.54      5.28        0.74
service           8.90      8.84       -0.06
staff            19.51     18.69       -0.82

          Train %  Sample %  Difference
polarity                               
neutral     43.40     43.65        0.25
negative    29.39     29.81        0.42
positive    27.21     26.53       -0.68

                        Train %  Sample %  Difference
joint                                                
ambience_negative          1.27      1.14       -0.13
ambience_neutral           0.75      1.28        0.53
ambience_positive          2.55      2.00       -0.55
food_negative              3.60      3.28       -0.32
food_neutral           

(np.float64(19.343500849489637), np.float64(0.6811171887378685))