In [1]:
# %%
import json
import random
import pandas as pd

# Configuration for random seed and file paths
CONFIG = {
    # Seed passed to random.seed() below and used as random_state for the
    # DataFrame.sample() calls later in the notebook.
    "seed": 42,
    # Input JSONL files; read_jsonl_to_df reads them in this order and
    # concatenates their records. Paths are relative to the working directory.
    "file_paths": ['train.jsonl', 'dev.jsonl', 'test.jsonl'],
    # Destination of the sampled positive examples (written as JSON Lines).
    "positive_output_path": "../../../test_task_data/sentiment/positive_sst5.json",
    # Destination of the sampled negative examples (written as JSON Lines).
    "negative_output_path": "../../../test_task_data/sentiment/negative_sst5.json",
    # Destination of the rows left over after sampling (written as JSON Lines).
    "remaining_output_path": "../../../external_classifier_data/sentiment/sst5.json"
}

# Set random seed for reproducibility
# NOTE: this seeds only Python's built-in `random` module; the pandas sampling
# later in the notebook passes CONFIG['seed'] explicitly as random_state.
random.seed(CONFIG['seed'])

def read_jsonl_to_df(file_paths):
    """
    Read and merge data from jsonl files into a DataFrame.

    Each non-blank line of every file is parsed as one JSON object; only its
    'text' and 'label' fields are kept. Blank or whitespace-only lines (for
    example a trailing newline at the end of a file) are skipped instead of
    being handed to the JSON parser.

    Args:
        file_paths (list): List of file paths to read from. Files are read in
            the given order and decoded as UTF-8.

    Returns:
        DataFrame: Combined data from the files with exactly the columns
            'text' and 'label', one row per record, in file order. When no
            records are found the frame is empty but still has both columns.

    Raises:
        OSError: If a file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks the 'text' or 'label' field.
    """
    records = []  # One {'text', 'label'} dict per parsed line

    # Iterate through each file
    for file_path in file_paths:
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                # json.loads('') raises, so ignore empty separator lines
                if not line.strip():
                    continue
                data = json.loads(line)
                records.append({'text': data['text'], 'label': data['label']})

    # Pin the column set so an empty result still exposes 'text' and 'label'
    return pd.DataFrame(records, columns=['text', 'label'])


In [2]:

def _draw_label_sample(frame, label, n=500):
    """Return n rows of `frame` whose 'label' equals `label`, drawn with the configured seed."""
    return frame.loc[frame['label'] == label].sample(n=n, random_state=CONFIG['seed'])


def _as_prompt_frame(*parts):
    """Stack the 'text' columns of `parts` into a frame whose 'prompt' and 'ground_truth' columns both hold that text."""
    texts = pd.concat([part['text'] for part in parts], ignore_index=True)
    return pd.DataFrame({'prompt': texts, 'ground_truth': texts})


def _write_jsonl(frame, path):
    """Write `frame` to `path` as one JSON record per line, keeping non-ASCII characters."""
    frame.to_json(path, orient='records', lines=True, force_ascii=False)


# Load every configured file into a single DataFrame
original_data = read_jsonl_to_df(CONFIG['file_paths'])

# Keep only labels 0, 1, 3 and 4 (the neutral score is dropped)
processed_data = original_data[original_data['label'].isin([0, 1, 3, 4])]

# Draw 500 rows per kept label: 3 and 4 feed the positive set, 0 and 1 the negative set
positive_data_3 = _draw_label_sample(processed_data, 3)
positive_data_4 = _draw_label_sample(processed_data, 4)
negative_data_0 = _draw_label_sample(processed_data, 0)
negative_data_1 = _draw_label_sample(processed_data, 1)

# Assemble the two 1000-row frames (prompt and ground_truth carry the same text)
positive_df = _as_prompt_frame(positive_data_3, positive_data_4)
negative_df = _as_prompt_frame(negative_data_0, negative_data_1)

# Persist both frames as JSON Lines
_write_jsonl(positive_df, CONFIG['positive_output_path'])
_write_jsonl(negative_df, CONFIG['negative_output_path'])


In [3]:
# Display the sampled positive examples (rich DataFrame repr of the last expression)
positive_df

Unnamed: 0,prompt,ground_truth
0,` it 's better to go in knowing full well what...,` it 's better to go in knowing full well what...
1,a hard look at one man 's occupational angst a...,a hard look at one man 's occupational angst a...
2,"it 's a setup so easy it borders on facile , b...","it 's a setup so easy it borders on facile , b..."
3,"i admired it , particularly that unexpected do...","i admired it , particularly that unexpected do..."
4,there are deeply religious and spiritual peopl...,there are deeply religious and spiritual peopl...
...,...,...
995,no such thing is sort of a minimalist beauty a...,no such thing is sort of a minimalist beauty a...
996,a stirring tribute to the bravery and dedicati...,a stirring tribute to the bravery and dedicati...
997,"a first-class , thoroughly involving b movie t...","a first-class , thoroughly involving b movie t..."
998,a warm but realistic meditation on friendship ...,a warm but realistic meditation on friendship ...


In [4]:
# Display the sampled negative examples (rich DataFrame repr of the last expression)
negative_df

Unnamed: 0,prompt,ground_truth
0,"there are cheesy backdrops , ridiculous action...","there are cheesy backdrops , ridiculous action..."
1,make like the title and dodge this one .,make like the title and dodge this one .
2,"it 's clotted with heavy-handed symbolism , di...","it 's clotted with heavy-handed symbolism , di..."
3,you have no affinity for most of the characters .,you have no affinity for most of the characters .
4,that chirpy songbird britney spears has popped...,that chirpy songbird britney spears has popped...
...,...,...
995,adam sandler 's heart may be in the right plac...,adam sandler 's heart may be in the right plac...
996,the film 's hero is a bore and his innocence s...,the film 's hero is a bore and his innocence s...
997,"while some of the camera work is interesting ,...","while some of the camera work is interesting ,..."
998,the holiday message of the 37-minute santa vs....,the holiday message of the 37-minute santa vs....


In [5]:
# Process remaining data: every non-neutral row whose text was NOT drawn into
# the positive or negative sample sets.
is_sampled = (processed_data['text'].isin(positive_df['prompt'])
              | processed_data['text'].isin(negative_df['prompt']))

# Take an explicit copy so the relabelling below modifies an independent frame
# rather than a slice of processed_data (avoids pandas' SettingWithCopyWarning
# and leaves processed_data untouched).
remaining_data = processed_data[~is_sampled].copy()

# Collapse the remaining labels to binary in one step: 3/4 -> 1 (positive),
# everything else -> 0 (negative). processed_data only contains labels
# 0, 1, 3 and 4, so "everything else" is exactly labels 0 and 1.
remaining_data['label'] = remaining_data['label'].isin([3, 4]).astype('int64')

# Save remaining data to JSON
remaining_data.to_json(CONFIG['remaining_output_path'], orient='records', lines=True, force_ascii=False)

In [6]:
# Display the remaining rows with their binary labels (rich DataFrame repr of the last expression)
remaining_data

Unnamed: 0,text,label
0,"a stirring , funny and finally transporting re...",1
2,they presume their audience wo n't sit still f...,0
6,jonathan parker 's bartleby should have been t...,1
7,campanella gets the tone just right -- funny i...,1
8,a fan film that for the uninitiated plays bett...,0
...,...,...
11849,"an often-deadly boring , strange reading of a ...",0
11850,the problem with concept films is that if the ...,0
11851,"safe conduct , however ambitious and well-inte...",0
11852,"a film made with as little wit , interest , an...",0
