In [None]:
# ================================
# IMPORTS
# ================================
import pandas as pd
import ast
import json
from datasets import load_dataset, Dataset

print("\n=== STEP 1: LOAD DATASET ===")
ds = load_dataset("NEUDM/mams")
train_df = ds["train"].to_pandas()
print("Train shape:", train_df.shape)


# ================================
# STEP 2 — Parse output into real list
# ================================
print("\n=== STEP 2: PARSE `output` STRING → PYTHON LIST ===")

def parse_output(x):
    try:
        return ast.literal_eval(x)
    except:
        return []

train_df["output_parsed"] = train_df["output"].apply(parse_output)

print("RAW:", train_df["output"].iloc[0])
print("PARSED:", train_df["output_parsed"].iloc[0])


# ================================
# STEP 3 — aspect_count
# ================================
print("\n=== STEP 3: COMPUTE aspect_count ===")

train_df["aspect_count"] = train_df["output_parsed"].apply(len)
print(train_df["aspect_count"].value_counts())


# ================================
# STEP 4 — Filter to 2,3,4 aspects
# ================================
print("\n=== STEP 4: FILTER aspect_count ∈ {2,3,4} ===")

df_small = train_df[train_df["aspect_count"].isin([2,3,4])]
print("Filtered shape:", df_small.shape)


# ================================
# STEP 5 — Extract majority polarity
# ================================
print("\n=== STEP 5: MAJORITY POLARITY EXTRACTION ===")

def extract_polarity(lst):
    if not isinstance(lst, list) or len(lst) == 0:
        return "unknown"
    pols = [item[1] for item in lst if len(item) >= 2]
    if not pols:
        return "unknown"
    return max(set(pols), key=pols.count)

df_small["label_clean"] = df_small["output_parsed"].apply(extract_polarity)

print("\nPolarity counts:")
print(df_small["label_clean"].value_counts())
print("\nPolarity proportions:")
print(df_small["label_clean"].value_counts(normalize=True))


# ================================
# STEP 6 — Sample 300 proportionally by polarity
# ================================
print("\n=== STEP 6: SAMPLE 300 PROPORTIONALLY BY POLARITY ===")

polarity_props = df_small["label_clean"].value_counts(normalize=True)

TARGET = 300
samples = []

for label, prop in polarity_props.items():
    print(f"\n--- Sampling label '{label}' ---")
    group = df_small[df_small["label_clean"] == label]
    available = group.shape[0]
    print("Available:", available)

    if available == 0:
        print("Skipping (no rows).")
        continue

    n_label = min(round(TARGET * prop), available)
    print("Sampling:", n_label)

    samples.append(group.sample(n=n_label, random_state=42))

df_sampled = pd.concat(samples)
print("\nShape before trimming:", df_sampled.shape)

df_sampled = df_sampled.sample(n=300, random_state=42).reset_index(drop=True)
print("FINAL SAMPLE SHAPE:", df_sampled.shape)

print("\nFinal polarity mix:")
print(df_sampled["label_clean"].value_counts(normalize=True))


# ================================
# STEP 7 — Prepare DataFrame for CSV
# ================================
print("\n=== STEP 7: PREPARE FOR CSV EXPORT ===")

df_out = df_sampled.copy()

# Replace `output` with real parsed list
df_out["output"] = df_out["output_parsed"]

# Convert list → JSON string (safe for CSV)
df_out["output"] = df_out["output"].apply(json.dumps)

# Remove helper columns
df_out = df_out.drop(columns=["output_parsed", "__index_level_0__"], errors='ignore')

print("\nColumns in final CSV:")
print(df_out.columns)


# ================================
# STEP 8 — Save CSV
# ================================
print("\n=== STEP 8: SAVE CSV FILE ===")

df_out.to_csv("mams_sample_300.csv", index=False)
print("Saved CSV → mams_sample_300.csv")

# validate one row
print("\nLoaded-back example check (lists still JSON):")
print(df_out["output"].iloc[0])

In [None]:
print("\n=== ORIGINAL (after filtering aspect_count 2/3/4) ===")
orig_counts = df_small["label_clean"].value_counts()
orig_props  = df_small["label_clean"].value_counts(normalize=True)

print("\nOriginal counts:")
print(orig_counts)

print("\nOriginal proportions:")
print(orig_props)

In [None]:
print("\n=== SAMPLED (300 rows) ===")
sample_counts = df_sampled["label_clean"].value_counts()
sample_props  = df_sampled["label_clean"].value_counts(normalize=True)

print("\nSample counts:")
print(sample_counts)

print("\nSample proportions:")
print(sample_props)

In [None]:
comparison = pd.DataFrame({
    "Original_Counts": orig_counts,
    "Sample_Counts": sample_counts,
    "Original_Proportion": orig_props,
    "Sample_Proportion": sample_props,
})

print("\n=== COMPARISON TABLE ===")
print(comparison)

In [7]:
# ================================
# IMPORTS
# ================================
import pandas as pd
import ast
from datasets import load_dataset

print("\n=== STEP 1: LOAD DATASET ===")
ds = load_dataset("NEUDM/mams")
train_df = ds["train"].to_pandas()
print("Train shape:", train_df.shape)


# ================================
# STEP 2 — Parse output into real list
# ================================
print("\n=== STEP 2: PARSE `output` STRING → PYTHON LIST ===")

def parse_output(x):
    try:
        return ast.literal_eval(x)
    except:
        return []

train_df["output_parsed"] = train_df["output"].apply(parse_output)

print("RAW:", train_df["output"].iloc[0])
print("PARSED:", train_df["output_parsed"].iloc[0])


# ================================
# STEP 3 — aspect_count
# ================================
print("\n=== STEP 3: COMPUTE aspect_count ===")

train_df["aspect_count"] = train_df["output_parsed"].apply(len)
print(train_df["aspect_count"].value_counts())


# ================================
# STEP 4 — Filter aspect_count 2, 3, 4
# ================================
print("\n=== STEP 4: FILTER aspect_count ∈ {2,3,4} ===")

df_small = train_df[train_df["aspect_count"].isin([2,3,4])]
print("Filtered shape:", df_small.shape)


# ================================
# STEP 5 — Extract polarity combinations per sentence
# ================================
print("\n=== STEP 5: POLARITY COMBINATION EXTRACTION ===")

def polarity_combo(lst):
    # Extract polarities inside the sentence
    if not lst or not isinstance(lst, list):
        return "unknown"
    pols = [item[1] for item in lst if len(item) >= 2]
    if len(pols) == 0:
        return "unknown"
    # sort so that ["positive","negative"] → ["negative","positive"]
    return "_".join(sorted(pols))

df_small["polarity_combo"] = df_small["output_parsed"].apply(polarity_combo)

print("Example polarity_combo:", df_small["polarity_combo"].iloc[0])


# ================================
# STEP 6 — Print combinations for 2, 3, 4 aspect sentences
# ================================
for ac in [2, 3, 4]:
    print(f"\n==============================")
    print(f"     ASPECT COUNT = {ac}")
    print("==============================")

    subset = df_small[df_small["aspect_count"] == ac]

    print("\n--- POLARITY COMBO COUNTS ---")
    print(subset["polarity_combo"].value_counts())

    print("\n--- POLARITY COMBO PROPORTIONS ---")
    print(subset["polarity_combo"].value_counts(normalize=True))

print("\n=== DONE ===")


=== STEP 1: LOAD DATASET ===


Repo card metadata block was not found. Setting CardData to empty.


Train shape: (7446, 8)

=== STEP 2: PARSE `output` STRING → PYTHON LIST ===
RAW: [['food', 'positive'], ['place', 'neutral']]
PARSED: [['food', 'positive'], ['place', 'neutral']]

=== STEP 3: COMPUTE aspect_count ===
aspect_count
2     5020
3     1778
4      445
5      133
6       48
7       13
8        6
9        1
11       1
10       1
Name: count, dtype: int64

=== STEP 4: FILTER aspect_count ∈ {2,3,4} ===
Filtered shape: (7243, 10)

=== STEP 5: POLARITY COMBINATION EXTRACTION ===
Example polarity_combo: neutral_positive

     ASPECT COUNT = 2

--- POLARITY COMBO COUNTS ---
polarity_combo
negative_neutral     2121
neutral_positive     2089
negative_positive     810
Name: count, dtype: int64

--- POLARITY COMBO PROPORTIONS ---
polarity_combo
negative_neutral     0.422510
neutral_positive     0.416135
negative_positive    0.161355
Name: proportion, dtype: float64

     ASPECT COUNT = 3

--- POLARITY COMBO COUNTS ---
polarity_combo
negative_neutral_neutral      482
neutral_neutral_posi

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_small["polarity_combo"] = df_small["output_parsed"].apply(polarity_combo)


In [8]:
import pandas as pd
import ast
from datasets import load_dataset

# --------------------------------------
# LOAD FULL MAMS (train, val, test)
# --------------------------------------
ds = load_dataset("NEUDM/mams")

# Convert all splits to pandas
dfs = []
for split in ["train", "validation", "test"]:
    df = ds[split].to_pandas()
    df["split"] = split
    dfs.append(df)

df_all = pd.concat(dfs).reset_index(drop=True)
print("Total rows:", df_all.shape)

# --------------------------------------
# PARSE output column into Python lists
# --------------------------------------
def parse_output(x):
    try:
        return ast.literal_eval(x)
    except:
        return []

df_all["output_parsed"] = df_all["output"].apply(parse_output)

# --------------------------------------
# EXTRACT ALL ASPECT TERMS
# --------------------------------------
unique_aspects = set()

for row in df_all["output_parsed"]:
    for aspect, polarity in row:
        unique_aspects.add(aspect)

# Sort alphabetically
unique_aspects = sorted(list(unique_aspects))

print("\n=== TOTAL UNIQUE ASPECT TERMS ===")
print(len(unique_aspects))

print("\n=== ALL UNIQUE ASPECTS ===")
for asp in unique_aspects:
    print(asp)

Repo card metadata block was not found. Setting CardData to empty.


Total rows: (9246, 9)

=== TOTAL UNIQUE ASPECT TERMS ===
3084

=== ALL UNIQUE ASPECTS ===
'side' dishes
2 cheese slices
7A
APPETIZER
Abijah's secret sauce
Alaska king crab legs and fried coconut shrimp
Ambiance
American
American bar food staples
American bistro dishes
American fare
American platters
Appetizers
Apple
Apple sourdough pie
Argentinian Pizza
Argentinian cuisine
Asian street food
Atmosphere
Austrian
Austrian cuisine
Aztec
BAR
BBQ
BBQ Pork Rice
BBQ salmon
BURGER
BUSBOY
Baby backs
Bagel Owner
Baked Ziti
Bakery
Balthazar
Bangkok fruit punch
Bar
Bar Manager
Bartender
Bartenders
Basil Chicken and Shrimp Tempura roll
Bean
Beef
Beef Negimaki
Beer Battered Fish Chips
Beer selection
Belgian beers
Billy-Cheese steak sandwich
Black Duck
Blue Ribbon menu
Blueberry Waffles
Bollos appetizer
Bolognese sauce
Boton Shrimp
Braised Fish
Braised short ribs
Brazilian BBQ
Bread
Breakfast
Brunch
Brunch menu
Brushetta
Buffet lunch
Bustelo coffee
CAESAR SALAD
CEVICHE
COFFEE
CRAB
Caesar Salad
Caesar 