In [1]:
# Use Kernel "base" for this notebook
import pandas as pd
import numpy as np

In [2]:
# Load the dataset
file_path = r'(A) Data/Need To Vote/3 Annotators_News Title_400 Data.csv'
df = pd.read_csv(file_path)

# Display the first few rows to check the data
df.head()

Unnamed: 0,Title,1st Annotator,2nd Annotator,3rd Annotator,Voting Result,Final Take,Labelling
0,,,,,,,
1,"Olahraga Pilates Makin Populer, Ini Deret Manf...",Positive,Positive,Positive,,,
2,,,,,,,
3,"Janice Tjen Luar Biasa, Lolos di 2 Nomor Final...",Positive,Positive,Positive,,,
4,,,,,,,


In [3]:
def determine_voting_result(row):
    # Get votes from the three annotators
    votes = [row['1st Annotator'], row['2nd Annotator'], row['3rd Annotator']]
    
    # Normalize to Title Case to handle potential inconsistencies (e.g. 'positive' vs 'Positive')
    # Assuming standard labels are 'Positive', 'Negative', 'Neutral'
    votes = [str(v).strip().title() for v in votes]
    
    # Count occurrences
    counts = {
        'Positive': votes.count('Positive'),
        'Negative': votes.count('Negative'),
        'Neutral': votes.count('Neutral')
    }
    
    # Check for Majority
    for label, count in counts.items():
        if count >= 2:
            return label
            
    # Check for Tie (1 Positive, 1 Negative, 1 Neutral)
    # The prompt says: "if positive, negative, and neutral is chosen the result is 'TIED'"
    if counts['Positive'] == 1 and counts['Negative'] == 1 and counts['Neutral'] == 1:
        return 'TIED'
        
    # Fallback if something unexpected happens (e.g. unknown labels)
    return 'Unknown'

In [4]:
# Apply the voting logic
df['Voting Result'] = df.apply(determine_voting_result, axis=1)

In [5]:
# Determine Final Take
# Rule: if it is Tied the result is Neutral. Otherwise take the Voting Result.
def determine_final_take(row):
    if row['Voting Result'] == 'TIED':
        return 'Neutral'
    return row['Voting Result']

df['Final Take'] = df.apply(determine_final_take, axis=1)

In [6]:
# Determine Labelling
# Rule: 1 for positive, -1 for negative, 0 for neutral
def determine_labelling(row):
    final_take = row['Final Take']
    if final_take == 'Positive':
        return 1
    elif final_take == 'Negative':
        return -1
    elif final_take == 'Neutral':
        return 0
    return None # Should not happen based on rules

df['Labelling'] = df.apply(determine_labelling, axis=1)

In [7]:
# Display the updated dataframe
df.head(10)

Unnamed: 0,Title,1st Annotator,2nd Annotator,3rd Annotator,Voting Result,Final Take,Labelling
0,,,,,Unknown,Unknown,
1,"Olahraga Pilates Makin Populer, Ini Deret Manf...",Positive,Positive,Positive,Positive,Positive,1.0
2,,,,,Unknown,Unknown,
3,"Janice Tjen Luar Biasa, Lolos di 2 Nomor Final...",Positive,Positive,Positive,Positive,Positive,1.0
4,,,,,Unknown,Unknown,
5,Siapakah Pasukan RSF yang Diduga Bantai Ribuan...,Negative,Negative,Neutral,Negative,Negative,-1.0
6,,,,,Unknown,Unknown,
7,Pertamina Peduli Salurkan Bantuan untuk Korban...,Positive,Positive,Neutral,Positive,Positive,1.0
8,,,,,Unknown,Unknown,
9,"Cuaca Panas, Hujan hingga Banjir Rob Bayangi S...",Negative,Negative,Neutral,Negative,Negative,-1.0


In [8]:
# Replace 'Unknown' strings with NaN first
df = df.replace('Unknown', np.nan)
# Then drop rows that contain any NaN values
df = df.dropna()
df.head(10)

Unnamed: 0,Title,1st Annotator,2nd Annotator,3rd Annotator,Voting Result,Final Take,Labelling
1,"Olahraga Pilates Makin Populer, Ini Deret Manf...",Positive,Positive,Positive,Positive,Positive,1.0
3,"Janice Tjen Luar Biasa, Lolos di 2 Nomor Final...",Positive,Positive,Positive,Positive,Positive,1.0
5,Siapakah Pasukan RSF yang Diduga Bantai Ribuan...,Negative,Negative,Neutral,Negative,Negative,-1.0
7,Pertamina Peduli Salurkan Bantuan untuk Korban...,Positive,Positive,Neutral,Positive,Positive,1.0
9,"Cuaca Panas, Hujan hingga Banjir Rob Bayangi S...",Negative,Negative,Neutral,Negative,Negative,-1.0
11,Reaksi Hector Souto Soal Timnas Futsal Disamak...,Neutral,Neutral,Neutral,Neutral,Neutral,0.0
13,PB XIII Akan Dimakamkan di Imogiri Yogyakarta ...,Neutral,Neutral,Neutral,Neutral,Neutral,0.0
15,"PB XIII, Naik Takhta di Tengah Konflik Raja Ke...",Neutral,Neutral,Neutral,Neutral,Neutral,0.0
17,"VIDEO: Cuaca Ekstrem Datang, Pohon Tumbang Men...",Negative,Negative,Neutral,Negative,Negative,-1.0
19,Mesin Cuci Front Load Rp3 Jutaan di Transmart ...,Neutral,Positive,Neutral,Neutral,Neutral,0.0


In [9]:
# Save the result to a new CSV file to avoid overwriting the original immediately
output_path = r'(A) Data/Need To Vote/Final Labelled_News Title_400 Data.csv'
df.to_csv(output_path, index=False)
print(f"Processed data saved to {output_path}")

Processed data saved to (A) Data/Need To Vote/Final Labelled_News Title_400 Data.csv
