In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Column names assigned to the headerless TSV, in file order.
# Example values noted by the original author:
#   par_id  -> 1            (integer ID)
#   art_id  -> @@24942188   (article identifier)
#   topic   -> hopeless     (PCL category)
#   country -> ph           (country code)
#   text    -> full text content
#   label   -> 0            (binary label)
# NOTE(review): the printed preview shows label == 1 next to an all-zero
# `labels` vector, so `label` may not be a plain binary flag — confirm
# against the dataset README.
new_columns = ["par_id", "art_id", "topic", "country", "text", "label"]

# Load the main dataset. The first 4 rows of the file are a disclaimer and are
# skipped; the file has no header row, so the names above are applied instead.
# Malformed lines are reported as warnings rather than aborting the load.
df = pd.read_csv(
    "data/dontpatronizeme_pcl.tsv",
    names=new_columns,
    header=None,
    skiprows=4,
    sep="\t",
    on_bad_lines='warn',
)



      par_id      art_id       topic country  \
6700    6838  @@13786523     refugee      za   
6701    6839  @@23413758       women      gh   
6702    6840   @@9745516    homeless      au   
6703    6841   @@1864808    homeless      tz   
6704    6842  @@14327774  vulnerable      lk   

                                                   text  label  \
6700  Sheena Jonker , head mediator of Access to Jus...      0   
6701  While the President has had occasions to recei...      0   
6702  The decision comes following a series of Tweet...      1   
6703             Strong winds leave 180 people homeless      0   
6704  Voice : - Elders who live outside major cities...      0   

                     labels  
6700  [0, 0, 0, 0, 0, 0, 0]  
6701  [0, 0, 0, 0, 0, 0, 0]  
6702  [0, 0, 0, 0, 0, 0, 0]  
6703  [0, 0, 0, 0, 0, 0, 0]  
6704  [0, 0, 0, 0, 0, 0, 0]  


In [None]:
# Load the two split files (paragraph ids + labels). The "train" file feeds the
# train/validation pool; the "dev" file is used in this notebook as the test set.
train_val_labels, test_labels = map(
    pd.read_csv,
    (
        "data/train_semeval_parids-labels.csv",
        "data/dev_semeval_parids-labels.csv",
    ),
)

# Convert string labels to lists
def parse_labels(label_str: str) -> list[int]:
    """Parse a stringified list of integers such as ``"[0, 1, 0]"``.

    Args:
        label_str: Text of a bracketed, comma-separated list of integers.
            Surrounding whitespace and spaces between items are ignored.

    Returns:
        The integers in order of appearance; an empty list for ``"[]"``.

    Raises:
        ValueError: If any non-empty item is not a valid integer literal.
    """
    # Strip outer whitespace first so the brackets are really at the ends,
    # then remove the brackets and any spaces between items.
    inner = label_str.strip().strip("[]").replace(" ", "")
    # Skip empty tokens so "[]" yields [] instead of failing on int("").
    return [int(token) for token in inner.split(",") if token]

# Turn the stringified `label` column of each split file into a real list
# stored under `labels`, then remove the original column. Dropping `label`
# matters: `df` already has its own `label` column, and keeping both would
# produce suffixed duplicates in the merges below.
for labels_df in (train_val_labels, test_labels):
    labels_df['labels'] = labels_df['label'].apply(parse_labels)
    labels_df.drop(columns='label', inplace=True)

# Attach text/metadata from the main dataset to each split; the inner join
# keeps only paragraphs present in both frames.
train_val_df = pd.merge(df, train_val_labels, how="inner", on="par_id")
test_df = pd.merge(df, test_labels, how="inner", on="par_id")

# Hold out the final 20% of rows for validation. With shuffle=False the row
# order is preserved, so random_state has no effect on the result.
train_df, val_df = train_test_split(
    train_val_df,
    test_size=0.2,
    shuffle=False,
    random_state=42,
)

print(val_df.head())