In [30]:
import polars as pl

# Names assigned to the six columns of the header-less main TSV, in file
# order.  Example values: par_id 1, art_id @@24942188, topic hopeless,
# country ph, text = the full paragraph text, label 0.
new_columns = ["par_id", "art_id", "topic", "country", "text", "label"]

# Load the main dataset.  The first 4 rows of the file are a disclaimer and
# are skipped; rows with more fields than the six named columns are
# truncated rather than raising.
df = pl.read_csv(
    "data/dontpatronizeme_pcl.tsv",
    has_header=False,
    new_columns=new_columns,
    separator="\t",
    skip_rows=4,
    truncate_ragged_lines=True,
)

# Load the train and dev split definitions (one CSV per split).
train_labels, dev_labels = (
    pl.read_csv(split_path)
    for split_path in (
        "data/train_semeval_parids-labels.csv",
        "data/dev_semeval_parids-labels.csv",
    )
)

# Convert string labels to lists
def parse_labels(label_str: str) -> list[int]:
    """Parse a bracketed, comma-separated label string into a list of ints.

    Args:
        label_str: Text such as ``"[1, 0, 0]"``. Whitespace around the
            brackets and between the values is ignored.

    Returns:
        The integers in their original order, or an empty list when the
        string holds no values (e.g. ``"[]"``).

    Raises:
        ValueError: If any comma-separated item is not a valid integer.
    """
    # Trim outer whitespace first so strip("[]") can actually reach the
    # brackets, then drop every remaining space as the original did.
    inner = label_str.strip().strip("[]").replace(" ", "")
    if not inner:
        # "".split(",") would yield [""] and int("") raises; an empty
        # list string simply means "no labels".
        return []
    return [int(item) for item in inner.split(",")]

# Replace the string-valued "label" column of each split frame with a
# list[i64] column named "labels", parsed row by row via parse_labels.
parsed_label_expr = (
    pl.col("label")
    .map_elements(parse_labels, return_dtype=pl.List(pl.Int64))
    .alias("labels")
)
labels = [
    split_frame.with_columns(parsed_label_expr).drop("label")
    for split_frame in (train_labels, dev_labels)
]
train_labels, dev_labels = labels

# Attach the main-table columns to each split.  The inner join keeps only
# rows whose par_id appears in the respective split frame.
train_df = df.join(train_labels, how="inner", on="par_id")
dev_df = df.join(dev_labels, how="inner", on="par_id")

# Cut dev into validation (first 80% of rows) and test (the remaining rows).
# The cut is positional: rows are taken in their existing order, unshuffled.
val_size = int(dev_df.height * 0.8)
val_df = dev_df.slice(0, val_size)
test_df = dev_df.slice(val_size, None)

print(f"Train samples: {len(train_df)}")
print(f"Val samples: {len(val_df)}")
print(f"Test samples: {len(test_df)}")

# Optional: persist the processed splits as parquet files
for split_frame, out_path in (
    (train_df, "data/train.parquet"),
    (val_df, "data/val.parquet"),
    (test_df, "data/test.parquet"),
):
    split_frame.write_parquet(out_path)

print(test_df)

Train samples: 8375
Val samples: 1675
Test samples: 419
shape: (419, 7)
┌────────┬────────────┬────────────┬─────────┬───────────────────────────────┬───────┬─────────────┐
│ par_id ┆ art_id     ┆ topic      ┆ country ┆ text                          ┆ label ┆ labels      │
│ ---    ┆ ---        ┆ ---        ┆ ---     ┆ ---                           ┆ ---   ┆ ---         │
│ i64    ┆ str        ┆ str        ┆ str     ┆ str                           ┆ i64   ┆ list[i64]   │
╞════════╪════════════╪════════════╪═════════╪═══════════════════════════════╪═══════╪═════════════╡
│ 10016  ┆ @@20724698 ┆ women      ┆ gh      ┆ " Together we can achieve     ┆ 0     ┆ [0, 0, … 0] │
│        ┆            ┆            ┆         ┆ gend…                         ┆       ┆             │
│ 10017  ┆ @@13945049 ┆ women      ┆ us      ┆ Contemporary women have other ┆ 1     ┆ [0, 0, … 0] │
│        ┆            ┆            ┆         ┆ …                             ┆       ┆             │
│ 10019  ┆ @@147046