In [None]:
import pandas as pd

In [None]:
# Names assigned to the TSV columns, in file order.
column_names = [
    # Columns 1-8: statement ID ([ID].json), label, statement text, subject(s),
    # speaker, speaker's job title, state info, party affiliation.
    'id', 'label', 'statement', 'subjects',
    'speaker', 'speaker_job_title', 'state_info', 'party_affiliation',

    # Columns 9-13: total credit history counts, including the current statement:
    # count_1 = barely true, count_2 = false, count_3 = half true,
    # count_4 = mostly true, count_5 = pants on fire.
    *(f'count_{k}' for k in range(1, 6)),

    # Column 14: the context (venue / location of the speech or statement).
    'context',
    # Final column: the justification.
    'justification',
]

# Load the three splits with identical parsing options (tab-separated, no header row).
train2_data, test2_data, valid2_data = (
    pd.read_csv(tsv_path, sep='\t', header=None, names=column_names)
    for tsv_path in ('train2.tsv', 'test2.tsv', 'val2.tsv')
)

In [None]:
# Preview the first rows of the training split to check that the columns parsed as expected.
train2_data.head()

Unnamed: 0,id,label,statement,subjects,speaker,speaker_job_title,state_info,party_affiliation,count_1,count_2,count_3,count_4,count_5,context,justification
0.0,2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer,That's a premise that he fails to back up. Ann...
1.0,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.,"Surovell said the decline of coal ""started whe..."
2.0,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver,Obama said he would have voted against the ame...
3.0,1123.json,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release,The release may have a point that Mikulskis co...
4.0,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN,"Crist said that the economic ""turnaround start..."


In [None]:
!pip install datasets



In [None]:
from datasets import Dataset

In [None]:
def preprocess_data(data: pd.DataFrame, six_way: bool = True):
    """Convert a raw split into a two-column (label, statement) Dataset.

    The input frame is never modified, so the same raw frame can be
    preprocessed more than once (e.g. once with binary labels and once with
    six-way labels).

    Args:
        data: Frame containing at least the columns 'label', 'statement',
            'speaker', 'speaker_job_title', 'state_info', 'party_affiliation',
            'context' and 'justification'. Other columns (including
            'count_1'..'count_5') are ignored.
        six_way: If True, labels are encoded as 'pants-fire'=0, 'false'=1,
            'barely-true'=2, 'half-true'=3, 'mostly-true'=4, 'true'=5.
            If False, 'true'/'mostly-true'/'half-true' become 1 and
            'false'/'pants-fire'/'barely-true' become 0.

    Returns:
        A Dataset with an integer 'label' column and a 'statement' column that
        holds the statement followed by the space-separated metadata fields.
        Rows with an unrecognised label or a missing statement are dropped.
    """
    if six_way:
        encoding = {'pants-fire': 0, 'false': 1, 'barely-true': 2,
                    'half-true': 3, 'mostly-true': 4, 'true': 5}
    else:
        encoding = {'true': 1, 'mostly-true': 1, 'half-true': 1,
                    'false': 0, 'pants-fire': 0, 'barely-true': 0}

    # reset_index returns a new frame: the caller's data stays untouched and
    # every step below works on a clean 0..n-1 index.
    frame = data.reset_index(drop=True)

    # Labels missing from the mapping become NaN and are filtered out below.
    labels = frame['label'].map(encoding)
    keep = labels.notna() & frame['statement'].notna()

    # Statement first, then the metadata fields; missing metadata becomes ''.
    text_columns = ['statement', 'speaker', 'speaker_job_title', 'state_info',
                    'party_affiliation', 'context', 'justification']
    text = frame.loc[keep, text_columns].fillna('').astype(str)
    statement = text[text_columns[0]]
    for column in text_columns[1:]:
        statement = statement + ' ' + text[column]

    prepared = pd.DataFrame({
        # Cast explicitly: mapping yields floats whenever any label was invalid.
        'label': labels[keep].astype('int64'),
        'statement': statement,
    }).reset_index(drop=True)

    # preserve_index=False keeps the pandas index out of the Dataset; otherwise
    # a non-default index is exported as an extra '__index_level_0__' column.
    return Dataset.from_pandas(prepared, preserve_index=False)


In [None]:
def save_preprocessed_data(train_data: Dataset, test_data: Dataset, valid_data: Dataset, binary: bool = True):
    """Write the train, test and validation splits to CSV files.

    Each split is converted with to_pandas() and saved without the index
    column as 'train2_<suffix>.csv', 'test2_<suffix>.csv' and
    'valid2_<suffix>.csv', where <suffix> is 'binary' when `binary` is True
    and 'six_way' otherwise. A confirmation line is printed after each file.
    """
    suffix = 'binary' if binary else 'six_way'

    # (split, output path, confirmation message), processed in this order.
    exports = (
        (train_data, f'train2_{suffix}.csv', f'Train data saved as train2_{suffix}.csv'),
        (test_data, f'test2_{suffix}.csv', f'Test data saved as test2_{suffix}.csv'),
        (valid_data, f'valid2_{suffix}.csv', f'Validation data saved as valid2_{suffix}.csv'),
    )
    for split, csv_path, notice in exports:
        split.to_pandas().to_csv(csv_path, index=False)
        print(notice)


In [None]:
# Build the binary-labelled splits under their own names and hand preprocess_data
# copies of the raw frames. train2_data / test2_data / valid2_data therefore stay
# raw DataFrames, so this cell can be re-run and later cells can still label the
# raw data differently.
train2_binary = preprocess_data(train2_data.copy(), six_way=False)
test2_binary = preprocess_data(test2_data.copy(), six_way=False)
valid2_binary = preprocess_data(valid2_data.copy(), six_way=False)

# Save the binary-labelled data
save_preprocessed_data(train2_binary, test2_binary, valid2_binary, binary=True)

Train data saved as train2_binary.csv
Test data saved as test2_binary.csv
Validation data saved as valid2_binary.csv


In [None]:
# Build the six-way splits from freshly loaded raw TSVs. preprocess_data needs raw
# frames with the original string labels, so this cell does not rely on whatever
# earlier cells may have left bound to train2_data / test2_data / valid2_data.
train2_data, test2_data, valid2_data = (
    preprocess_data(
        pd.read_csv(tsv_path, sep='\t', header=None, names=column_names),
        six_way=True,
    )
    for tsv_path in ('train2.tsv', 'test2.tsv', 'val2.tsv')
)

# Save the 6-way-labeled data
save_preprocessed_data(train2_data, test2_data, valid2_data, binary=False)

Train data saved as train2_six_way.csv
Test data saved as test2_six_way.csv
Validation data saved as valid2_six_way.csv
