In [1]:
import os
import pandas as pd
from sklearn.datasets import fetch_20newsgroups

In [2]:
# Root folder of the raw corpora; the conversion cell looks for each dataset's
# source files under <read_root_path>/<dataset name>.
read_root_path = './SLTCDataset/RawData/'

# Root folder for the generated CSV files; they are written to
# <write_root_path>/<dataset name>/<dataset name>_<mode>.csv.
write_root_path = './SLTCDataset/'

# Names of the corpora to convert. Each name doubles as a sub-folder name and
# as the prefix of the produced CSV files.
datasets = [
    "20NG",
    "R52",
    "R8",
    'mr',
    'ohsumed_single_23',
]

# Data splits converted for every corpus.
modes = [
    'train',
    'test',
]

In [3]:
# Convert every corpus listed in `datasets` into one CSV file per split.
#
# For each dataset and each entry of `modes` a DataFrame is built from the raw
# source and written to <write_root_path>/<dataset>/<dataset>_<mode>.csv
# without the index column. 20NG is downloaded through scikit-learn; every
# other corpus is read from <read_root_path>/<dataset>.
#
# Raises:
#     ValueError: if `datasets` contains a name with no conversion rule.
#     AssertionError: if the number of texts and labels of a split differ.
print('Converting data to CSV...')
for currentDatasets in datasets:
    print(f"currentDataset is {currentDatasets}")
    read_data_root_path = os.path.join(read_root_path, currentDatasets)
    write_data_root_path = os.path.join(write_root_path, currentDatasets)
    # DataFrame.to_csv does not create missing parent folders, so make sure the
    # destination exists before anything is written.
    os.makedirs(write_data_root_path, exist_ok=True)

    if currentDatasets == 'ohsumed_single_23':
        # Build ONE label map shared by all splits. os.listdir returns entries
        # in arbitrary order, so enumerating each split separately could give
        # the same class different integer ids in train and test. Sorting the
        # union of class-folder names makes the ids deterministic and
        # consistent. Non-directory entries are ignored.
        class_names = sorted({
            entry
            for split in modes
            for entry in os.listdir(os.path.join(read_data_root_path, split))
            if os.path.isdir(os.path.join(read_data_root_path, split, entry))
        })
        target_hash = {target: value for value, target in enumerate(class_names)}

    for mode in modes:
        if currentDatasets == '20NG':
            # scikit-learn fetches (and caches) the corpus itself.
            rawData = fetch_20newsgroups(subset=mode)
            df = pd.DataFrame(data={
                "target": rawData.target,
                "text": rawData.data
            })
        elif currentDatasets in ('R52', 'R8'):
            # Both Reuters subsets ship as one tab-separated file per split.
            # NOTE(review): read_csv treats the first line as the header row by
            # default -- confirm the raw .txt files really start with a header
            # line, otherwise the first sample ends up as the column names.
            raw_file_name = f'{mode}.txt'
            df = pd.read_csv(os.path.join(read_data_root_path, raw_file_name), sep='\t', encoding='utf-8')
        elif currentDatasets == 'mr':
            # Texts and labels live in two parallel files, one record per line.
            raw_text_name = f'text_{mode}.txt'
            raw_label_name = f'label_{mode}.txt'
            with open(os.path.join(read_data_root_path, raw_text_name), encoding='utf-8') as f:
                text_data = [line.replace('\n', '') for line in f]
            with open(os.path.join(read_data_root_path, raw_label_name), encoding='utf-8') as f:
                label_data = [line.replace('\n', '') for line in f]
            assert len(text_data) == len(label_data), f"the length of data != the length of target in {currentDatasets}"
            df = pd.DataFrame(data={
                "target": label_data,
                "text": text_data
            })
        elif currentDatasets == 'ohsumed_single_23':
            # Layout: <split>/<class folder>/<one file per document>.
            mode_root_dir = os.path.join(read_data_root_path, mode)
            data = {
                "target": [],
                "text": []
            }
            for target in class_names:
                target_root = os.path.join(mode_root_dir, target)
                if not os.path.isdir(target_root):
                    # This class has no documents in the current split.
                    continue
                # Sorted so the row order of the CSV is reproducible.
                for raw_file in sorted(os.listdir(target_root)):
                    with open(os.path.join(target_root, raw_file), encoding='utf-8') as f:
                        data['text'].append(f.read())
                    data['target'].append(target_hash[target])
            assert len(data['target']) == len(data['text']), f"the length of data != the length of target in {currentDatasets}"
            df = pd.DataFrame(data=data)
        else:
            # Fail loudly instead of reporting "Done!!" for a corpus that was
            # never converted.
            raise ValueError(f"no conversion rule defined for dataset {currentDatasets!r}")

        file_name = f"{currentDatasets}_{mode}.csv"
        df.to_csv(os.path.join(write_data_root_path, file_name), index=False)
    print("Done!!")
                

Converting data to CSV...
currentDataset is 20NG
Done!!
currentDataset is R52
Done!!
currentDataset is R8
Done!!
currentDataset is mr
Done!!
currentDataset is ohsumed_single_23
Done!!
