If you have not downloaded NusaX yet, run the following cell to clone the repository.


In [None]:
# Grab the data first: clone the NusaX repository into ./nusax
!git clone https://github.com/IndoNLP/nusax.git

The default dataset paths are 'nusax/datasets/mt/' for machine translation and 'nusax/datasets/sentiment/' for sentiment analysis.

Please do not rename these directories or change the file paths; otherwise the code will not work.

In [None]:
import os

import nltk
import pandas as pd
from nltk import word_tokenize
# 'punkt' is the tokenizer model used by older NLTK releases; recent releases
# make word_tokenize load the 'punkt_tab' resource instead. Fetch both so the
# loaders below work regardless of the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')

# Data loading for Machine Translation Task

In [60]:
def load_mt_data(directory="nusax/datasets/mt/"):
    """
    Load machine translation data from the NusaX dataset.

    Reads ``train.csv``, ``valid.csv`` and ``test.csv`` from ``directory`` and,
    for each language column, returns both a tokenized and the untouched
    version of every sentence.

    Args:
        directory: Folder containing the three CSV files. A trailing slash is
            optional.

    Returns:
        dict: A dictionary with language keys ('indonesian', 'english', 'javanese').
              Each key contains another dictionary with keys 'train', 'valid', and 'test'.
              Each of these keys maps to a tuple of (tokenized_data, original_data), where:
                  - tokenized_data: List of strings; each sentence is passed through
                    ``word_tokenize`` and its tokens are re-joined with single spaces.
                  - original_data: List of the original sentences (not tokenized),
                    in the same row order as tokenized_data.

    Example usage:
        mt_data = load_mt_data()
        indonesian_train_data = mt_data['indonesian']['train']
        english_valid_data = mt_data['english']['valid']
        javanese_test_data = mt_data['javanese']['test']
    """
    languages = ["indonesian", "english", "javanese"]

    # os.path.join copes with `directory` given with or without a trailing
    # slash; plain string concatenation would silently build a wrong path.
    frames = {
        split: pd.read_csv(os.path.join(directory, split + ".csv"))
        for split in ("train", "valid", "test")
    }

    data = {}
    for lang in languages:
        per_split = {}
        for split, frame in frames.items():
            original = list(frame[lang])
            tokenized = [" ".join(word_tokenize(sent)) for sent in original]
            per_split[split] = (tokenized, original)
        data[lang] = per_split

    return data

# Data loading for Sentiment Analysis Task

In [61]:
def load_sentiment_data(directory="nusax/datasets/sentiment/"):
    """
    Load sentiment analysis data from the NusaX dataset.

    For every language, reads ``train.csv``, ``valid.csv`` and ``test.csv``
    from ``<directory>/<language>/`` and pairs the tokenized 'text' column
    with the 'label' column.

    Args:
        directory: Folder containing one sub-folder per language. A trailing
            slash is optional.

    Returns:
        dict: A dictionary with language keys ('indonesian', 'english', 'javanese').
              Each key contains another dictionary with keys 'train', 'valid', and 'test'.
              Each of these keys maps to a tuple of (tokenized_texts, labels), where:
                  - tokenized_texts: List of strings; each entry of the 'text' column
                    is passed through ``word_tokenize`` and its tokens are re-joined
                    with single spaces.
                  - labels: List of the values of the 'label' column, in the same
                    row order as tokenized_texts.

    Example usage:
        sentiment_data = load_sentiment_data()
        english_train_texts, english_train_labels = sentiment_data['english']['train']
        indonesian_valid_data = sentiment_data['indonesian']['valid']
        javanese_test_data = sentiment_data['javanese']['test']
    """
    languages = ["indonesian", "english", "javanese"]
    data = {}

    for lang in languages:
        per_split = {}
        for split in ("train", "valid", "test"):
            # os.path.join copes with `directory` given with or without a
            # trailing slash; string concatenation would build a wrong path.
            frame = pd.read_csv(os.path.join(directory, lang, split + ".csv"))
            texts = [" ".join(word_tokenize(sent)) for sent in frame['text']]
            labels = list(frame['label'])
            per_split[split] = (texts, labels)
        data[lang] = per_split

    return data

In [62]:
# Quick sanity check: uncomment the lines below to load and print both datasets.
# mt_data = load_mt_data()
# sentiment_data = load_sentiment_data()

# print("Machine Translation Data:")
# print(mt_data)
# print("\nSentiment Analysis Data:")
# print(sentiment_data)