# Gold Standard Dataset Creation

In [None]:
import pandas as pd

In [None]:
# Load the two inputs for gold-standard creation: the silver-standard review
# table and the aspect categorization table (the "before filter" version, going
# by the file name).
# Forward slashes are used because they are accepted as a path separator on
# Windows, Linux and macOS, whereas a backslash is a separator on Windows only.
# NOTE(review): read_pickle can execute arbitrary code while unpickling -- only
# load these files when they come from a trusted source.
df_silver_standard = pd.read_pickle('Dataset/silver_std.pkl')
df_aspects_fixed = pd.read_pickle('Dataset/aspect_categorization_before_filter.pkl')

In [None]:
# ==============================================================================
# STRATIFIED SAMPLING (250 POS / 250 NEG)
# ==============================================================================
# Number of reviews drawn per weak-label class, and the seed that keeps the
# draw reproducible between runs.
SAMPLES_PER_CLASS = 250
RANDOM_SEED = 42

# Split the silver-standard reviews by weak label
df_pos = df_silver_standard[df_silver_standard['weak_label'] == 'positive']
df_neg = df_silver_standard[df_silver_standard['weak_label'] == 'negative']

# Fail early with a readable message when a class is too small. Without this
# check DataFrame.sample raises a generic ValueError that does not say which
# class is short or by how much.
for _label, _pool in (('positive', df_pos), ('negative', df_neg)):
    if len(_pool) < SAMPLES_PER_CLASS:
        raise ValueError(
            f"Need {SAMPLES_PER_CLASS} '{_label}' reviews for the gold sample "
            f"but only {len(_pool)} are available."
        )

# Random sample from each class (review level)
sample_pos = df_pos.sample(n=SAMPLES_PER_CLASS, random_state=RANDOM_SEED)
sample_neg = df_neg.sample(n=SAMPLES_PER_CLASS, random_state=RANDOM_SEED)

# Combine both samples and keep their index values as the list of review IDs.
# NOTE(review): this assumes the index of df_silver_standard holds the same IDs
# as the 'Original_Review_ID' column of df_aspects_fixed -- confirm. If they do
# not match, the filter below silently selects no (or the wrong) segments.
gold_review_sample = pd.concat([sample_pos, sample_neg])
gold_ids = gold_review_sample.index

print(f"Gold Sample Selected: {len(gold_review_sample)} reviews")
print(f"Breakdown: {gold_review_sample['weak_label'].value_counts()}")

# ==============================================================================
# RETRIEVE SEGMENTS & PREPARE EXCEL
# ==============================================================================
# Keep only the aspect/segment rows that belong to the sampled reviews
df_gold_segments = df_aspects_fixed[df_aspects_fixed['Original_Review_ID'].isin(gold_ids)].copy()

# Select only the columns the labeler needs
# Adjust these names if the columns of df_aspects_fixed differ
cols_to_export = ['Original_Review_ID', 'Full_Review', 'Segment', 'Aspect_Labels']

df_labeling_task = df_gold_segments[cols_to_export].copy()

# Add empty columns to be filled in manually by the labeler
df_labeling_task['Manual_Aspect'] = ""
df_labeling_task['Manual_Sentiment'] = ""
df_labeling_task['Is_Wrong_segment'] = ""  # Checkbox if segmentation was bad

# Export
# The path is relative to the working directory, like the input pickles, so the
# notebook is not tied to one user's home directory.
output_path = 'Dataset/Gold_Standard_Labeling_Task.xlsx'
df_labeling_task.to_excel(output_path, index=False)

print("\nTask Created!")
print(f"Total Segments to Label: {len(df_labeling_task)}")
print(f"File saved to: {output_path}")