In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Input and output locations. Built with os.path.join so the notebook also
# runs on non-Windows systems; the final '' component keeps the trailing path
# separator that the original string literals carried.
ROOT = os.path.join('.', 'dataset', 'SLTC', '')
TARGET = os.path.join('.', 'ProcessedData', '')

# os.listdir returns entries in arbitrary order; sort so that every run
# processes the datasets in the same sequence.
dirs = sorted(os.listdir(ROOT))

# Logical dataset name -> directory name expected under ROOT.
dataset_names = {
    "20NewsGroup": "20NG",
    "IMDB": "IMDB",
    "MovieReview": "mr",
    "Ohsumed": "ohsumed_single_23",
    "R52": "R52",
    "R8": "R8"
}

# File names written into each per-dataset output directory.
TRAIN_FILE_NAME = 'train_data.csv'
TEST_FILE_NAME = 'test_data.csv'

In [3]:
def get20NG(ROOT, dir):
    """Load the 20NG train and test splits from CSV.

    Reads ``20NG_train.csv`` and ``20NG_test.csv`` from the directory
    ``ROOT/dir`` with ``pandas.read_csv(..., index_col=False)``.

    Args:
        ROOT: Directory that contains the dataset folders.
        dir: Name of the dataset folder inside ``ROOT``.

    Returns:
        Tuple ``(train_data, test_data)`` of pandas DataFrames.
    """
    base = os.path.join(ROOT, dir)
    train_data, test_data = (
        pd.read_csv(os.path.join(base, csv_name), index_col=False)
        for csv_name in ('20NG_train.csv', '20NG_test.csv')
    )
    return train_data, test_data
def getIMDB(ROOT, dir):
    """Placeholder for the IMDB loader; not implemented, always returns None."""
    return None
def getMR(ROOT, dir):
    """Load the train and test splits stored as parallel text/label files.

    For each split, one file holds one document per line and a second file
    holds one integer label per line. Newline characters are removed from
    every document line and every label line is parsed with ``int``.

    Args:
        ROOT: Directory that contains the dataset folders.
        dir: Name of the dataset folder inside ``ROOT``.

    Returns:
        Tuple ``(train_data, test_data)`` of DataFrames with the columns
        ``target`` and ``text``.
    """
    base = os.path.join(ROOT, dir)

    def load_split(text_name, label_name):
        # Text file first, then labels.
        with open(os.path.join(base, text_name), encoding='utf-8') as handle:
            documents = [line.replace('\n', '') for line in handle]
        with open(os.path.join(base, label_name), encoding='utf-8') as handle:
            labels = [int(line) for line in handle]
        return pd.DataFrame({'target': labels, 'text': documents})

    train_data = load_split('text_train.txt', 'label_train.txt')
    test_data = load_split('text_test.txt', 'label_test.txt')
    return train_data, test_data
def _read_ohsumed_split(split_path, label2idx):
    """Read every document under ``split_path/<label>/`` into a DataFrame.

    Labels are visited in the order of ``label2idx`` and the files of each
    label in sorted name order, so the resulting row order is deterministic.
    """
    records = {'target': [], 'text': []}
    for label, idx in label2idx.items():
        label_path = os.path.join(split_path, label)
        for filename in sorted(os.listdir(label_path)):
            with open(os.path.join(label_path, filename), 'r', encoding='utf-8') as f:
                content = f.read()
            records['target'].append(idx)
            records['text'].append(content)
    return pd.DataFrame(records)


def getOhsumed(ROOT, dir):
    """Load a dataset laid out as ``<split>/<label>/<document file>``.

    The class labels are the sub-directory names of ``ROOT/dir/train``. They
    are sorted before being numbered because ``os.listdir`` returns entries in
    arbitrary order, which would otherwise make the label -> index mapping
    differ between machines. Non-directory entries in ``train`` are ignored.

    Args:
        ROOT: Directory that contains the dataset folders.
        dir: Name of the dataset folder inside ``ROOT``.

    Returns:
        Tuple ``(train_data, test_data)`` of DataFrames with the columns
        ``target`` (integer class index) and ``text`` (file content).

    Raises:
        FileNotFoundError: If the ``train`` directory is missing, or a label
            found in ``train`` has no matching directory in ``test``.
    """
    data_path = os.path.join(ROOT, dir)
    train_path = os.path.join(data_path, 'train')
    test_path = os.path.join(data_path, 'test')

    labels = sorted(
        entry for entry in os.listdir(train_path)
        if os.path.isdir(os.path.join(train_path, entry))
    )
    label2idx = {label: i for i, label in enumerate(labels)}

    train_data = _read_ohsumed_split(train_path, label2idx)
    test_data = _read_ohsumed_split(test_path, label2idx)
    return train_data, test_data
def getReuters(ROOT, dir):
    """Load tab-separated train/test files and integer-encode their labels.

    Reads ``train.txt`` and ``test.txt`` from ``ROOT/dir``, renames the
    ``Label`` and ``Text`` columns to ``target`` and ``text``, and replaces
    each label by its index in order of first appearance in the training
    split.

    Args:
        ROOT: Directory that contains the dataset folders.
        dir: Name of the dataset folder inside ``ROOT``.

    Returns:
        Tuple ``(train_data, test_data)`` of pandas DataFrames.

    Raises:
        KeyError: If a label occurs that was not seen in the training split.
    """
    base = os.path.join(ROOT, dir)
    column_names = {'Label': 'target', "Text": 'text'}

    train_raw = pd.read_csv(os.path.join(base, 'train.txt'), sep='\t', index_col=False)
    test_raw = pd.read_csv(os.path.join(base, 'test.txt'), sep='\t', index_col=False)
    train_data = train_raw.rename(columns=column_names)
    test_data = test_raw.rename(columns=column_names)

    # Number the classes in order of first appearance in the training labels.
    classes = train_data['target'].unique()
    label2idx = dict(zip(classes, range(len(classes))))

    def encode(label):
        return label2idx[label]

    train_data['target'] = train_data['target'].map(encode)
    test_data['target'] = test_data['target'].map(encode)
    return train_data, test_data

In [7]:
# Convert every recognised dataset directory under ROOT into a pair of CSV
# files (TRAIN_FILE_NAME / TEST_FILE_NAME) inside TARGET/<dataset directory>.

# Directory name -> loader function. None marks a dataset that is recognised
# but has no conversion implemented yet (IMDB).
loaders = {
    dataset_names['20NewsGroup']: get20NG,
    dataset_names['IMDB']: None,
    dataset_names['MovieReview']: getMR,
    dataset_names['Ohsumed']: getOhsumed,
    dataset_names['R52']: getReuters,
    dataset_names['R8']: getReuters,
}

for dataset_dir in dirs:
    if dataset_dir not in loaders:
        # Skip unrelated entries rather than aborting the remaining datasets,
        # and do not create an output directory for them.
        print(f"Unknown dataset: {dataset_dir}")
        continue

    target = os.path.join(TARGET, dataset_dir)
    # makedirs also creates TARGET itself when it does not exist yet.
    os.makedirs(target, exist_ok=True)

    # Reset on every iteration so a dataset without a loader can neither hit
    # an unbound name nor reuse the frames of the previous dataset.
    train_data, test_data = None, None
    loader = loaders[dataset_dir]
    if loader is not None:
        train_data, test_data = loader(ROOT, dataset_dir)

    if train_data is not None and test_data is not None:
        train_data.to_csv(os.path.join(target, TRAIN_FILE_NAME), index=False)
        test_data.to_csv(os.path.join(target, TEST_FILE_NAME), index=False)

    # Drop the references so the frames can be freed before the next dataset.
    train_data = None
    test_data = None