In [4]:
import os
import json
import pandas as pd

# to split text into sentences
from nltk.tokenize import sent_tokenize

# train/test split
from sklearn.model_selection import train_test_split

# Create tasks for Label Studio

In [None]:
# Raw news file passed to transform_and_save, which parses it one JSON object per line
# NOTE(review): the leading '/' makes this an absolute path, so the '..' segments are resolved from the filesystem root — confirm this is intended
path_to_data_raw = '/../../../../blobdata/raw/news.ndjson'
# Directory that receives the generated Label Studio task files
path_save_tasks = '../raw_data'

In [3]:
def transform_and_save(json_file, output_dir, num_items, num_skip=0, condition=False):
    """Transforms a line-delimited json file with raw data into multiple tasks for Label Studio.

    Every kept line is written to its own ``entity_<n>.json`` file in ``output_dir`` as
    ``{"data": {"text": ..., "meta_info": {"timestamp": ..., "location": ...}}}``.
    The text is the item's ``'+header:en'`` and ``'+abstract:en'`` values joined by ``'. '``.

    Args:
        json_file: path to the raw file holding one JSON object per line
        output_dir: directory for the task files; created if it does not exist
        num_items: maximum number of task files to write
        num_skip: number of leading lines to skip before any item is read
        condition: if True, only items whose text contains 'acquire' are kept

    Raises:
        KeyError: if a kept item has no 'published_at+timestamp' key
        json.JSONDecodeError: if a non-blank line is not valid JSON
    """

    os.makedirs(output_dir, exist_ok=True)

    with open(json_file, 'r', encoding='utf-8') as f_in:
        # Skip the first num_skip lines
        for _ in range(num_skip):
            if not f_in.readline():
                return  # Exit if the file has fewer than num_skip lines

        count = 0
        # Strict '<' so that exactly num_items tasks are written, not num_items + 1
        while count < num_items:
            line = f_in.readline()

            if not line:
                break  # end of file

            if not line.strip():
                continue  # a blank line carries no item and would make json.loads fail

            item = json.loads(line)

            concatenated_text = item.get('+header:en', '') + '. ' + item.get('+abstract:en', '')

            if condition and 'acquire' not in concatenated_text:
                continue

            transformed_item = {
                "data": {
                    "text": concatenated_text.strip(),
                    "meta_info": {
                        "timestamp": str(item['published_at+timestamp']),
                        "location": item.get('+domain_n', '')
                    }
                }
            }

            # NOTE(review): only the filtered mode offsets the file index by num_skip;
            # the unfiltered mode always numbers from 0. Kept as-is so existing file
            # names do not change — confirm whether the difference is intentional.
            file_index = count + num_skip if condition else count

            output_file = os.path.join(output_dir, f'entity_{file_index}.json')
            with open(output_file, 'w', encoding='utf-8') as f_out:
                json.dump(transformed_item, f_out, indent=2)

            count += 1


In [4]:
# Skip the first 550 lines of the raw file, then write tasks only for items whose text contains 'acquire'
transform_and_save(path_to_data_raw, path_save_tasks, num_items=100, num_skip=550, condition=True)

# Train/test split

In [5]:
# Annotations JSON file that transform_json loads
path_to_data_annotations = '../benchmark_data/annotations.json'
# Directory that receives train.csv and test.csv
path_save_split = '../benchmark_data/train_test_split/'

In [6]:
# transform from json to pd.DataFrame

def split_text_into_sentences(row):
    """Returns a pd.DataFrame with one 'text' row per sentence found in row['text']."""
    return pd.DataFrame({'text': sent_tokenize(row['text'])})

def transform_json(path):
    """
    Transforms json into pd.DataFrame of unique sentences.
    Args:
        path : string that contains path to the json file; the file holds a
            list of items, each with its text stored at item['data']['text']
    Returns:
        pd.DataFrame with a single 'text' column holding one sentence per row,
        duplicates removed (first occurrence kept) and the index renumbered
        from 0. An empty frame with the same column when the file has no items.
    """

    with open(path, encoding='utf-8') as f:
        data = json.load(f)

    # split_text_into_sentences only looks up row['text'], so a plain mapping
    # is enough and no intermediate DataFrame has to be built and iterated
    frames = [
        split_text_into_sentences({'text': item['data']['text']})
        for item in data
    ]

    if not frames:
        # pd.concat raises ValueError when given nothing to concatenate
        return pd.DataFrame(columns=['text'])

    df = pd.concat(frames, ignore_index=True)
    df = df.drop_duplicates(ignore_index=True)

    return df


In [7]:
# One row per unique sentence across all annotated texts
df = transform_json(path_to_data_annotations)
# Shuffled split with 33% of the rows held out for testing; the fixed random_state makes the split repeatable
train_df, test_df = train_test_split(df, test_size=0.33, shuffle=True, random_state=42)

In [8]:
# Display the sentence-level DataFrame
df

Unnamed: 0,text
0,"Zelensky is Pinocchio, officer tells The New Y..."
1,A Ukrainian officer interviewed by The New Yor...
2,SmartBear Names SVP of Growth Marketing to Sca...
3,Kevin Foster brings 25+ years of marketing exp...
4,Kevin comes to SmartBear from the startup High...
...,...
881,Top End Energy Advances Low-Carbon Vision.
882,Top End Energy Limited (AU:TEE) has released a...
883,"Top End Energy Limited, an Australian energy c..."
884,Azelis Expands Its Portfolio in Germany With t...


In [6]:
# Create the output directory first: to_csv does not create missing parent directories
os.makedirs(path_save_split, exist_ok=True)
# os.path.join builds the same paths whether or not path_save_split ends with a slash
train_df.to_csv(os.path.join(path_save_split, 'train.csv'), index=False)
test_df.to_csv(os.path.join(path_save_split, 'test.csv'), index=False)