In [1]:
%pip install datasets transformers torch tqdm pandas huggingface_hub
%pip install sentencepiece
%pip install protobuf transformers==4.30.2 cpm_kernels torch>=2.0 gradio mdtex2html sentencepiece accelerate


Collecting tqdm
  Downloading tqdm-4.66.2-py3-none-any.whl (78 kB)
Installing collected packages: tqdm
  Attempting uninstall: tqdm
    Found existing installation: tqdm 4.49.0
    Uninstalling tqdm-4.49.0:
      Successfully uninstalled tqdm-4.49.0
Successfully installed tqdm-4.66.2
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datasets 1.12.1 requires huggingface-hub<0.1.0,>=0.0.14, but you have huggingface-hub 0.21.4 which is incompatible.


In [3]:
import os
import shutil

# Paths under ./data that this notebook works with.
jsonl_path = "./data/dataset_new.jsonl"
save_path = './data/dataset_new'
partition_path = './data/partition'
directory = "./data"

# Make sure the data folder is present.
if not os.path.exists(directory):
    os.makedirs(directory)

# Clear out results of an earlier run: the saved-dataset folder first,
# then the JSONL file.
if os.path.exists(save_path):
    shutil.rmtree(save_path)

if os.path.exists(jsonl_path):
    os.remove(jsonl_path)

### 1.2 Load and Prepare Dataset:
- Import necessary libraries from the datasets package: https://huggingface.co/docs/datasets/index
- Load the Twitter Financial News Sentiment (TFNS) dataset and convert it to a Pandas dataframe. https://huggingface.co/datasets/zeroshot/twitter-financial-news-sentiment
- Map numerical labels to their corresponding sentiments (negative, positive, neutral).
- Add an instruction to each data entry, which is crucial for Instruction Tuning.
- Convert the Pandas dataframe back to a Hugging Face Dataset object.

In [5]:
from datasets import load_dataset
import datasets

# Numeric label -> sentiment word that is used as the answer text.
dic = {0: "negative", 1: 'positive', 2: 'neutral'}

# Fetch the train split and move it into pandas for the column edits below.
tfns = load_dataset('zeroshot/twitter-financial-news-sentiment')['train'].to_pandas()

# Swap every numeric label for its sentiment word (unknown ids raise KeyError).
tfns['label'] = tfns['label'].apply(lambda label_id: dic[label_id])

# Every row carries the same prompt text.
tfns['instruction'] = 'What is the sentiment of this tweet? Please choose an answer from {negative/neutral/positive}.'

# Positional rename: the frame's three columns, in their current order,
# become input / output / instruction.
tfns.columns = ['input', 'output', 'instruction']

# Hand the frame back to Hugging Face as a Dataset object.
tfns = datasets.Dataset.from_pandas(tfns)

Using custom data configuration twitter-financial-news-sentiment-daa9ce3749c6d4cd
Reusing dataset csv (C:\Users\baoh2\.cache\huggingface\datasets\csv\twitter-financial-news-sentiment-daa9ce3749c6d4cd\0.0.0\9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff)
100%|██████████| 1/1 [00:00<00:00, 497.96it/s]


### Concatenate and Shuffle dataset
- Concatenate two copies of tfns to form the training set
- Shuffle the training set
- Split the shuffled set into a train/test pair, holding out 1/5 of the rows as the test split


In [None]:
# Build the training pool from two back-to-back copies of tfns.
tmp_dataset = datasets.concatenate_datasets([tfns]*2)
train_dataset = tmp_dataset
print(tmp_dataset.num_rows)

# Shuffle with a fixed seed so the row order is the same on every run.
all_dataset = train_dataset.shuffle(seed = 42)
# Printed explicitly: a bare expression in the middle of a cell shows nothing.
print(all_dataset.shape)

# Hold out 1/n of the rows. train_test_split returns a 'train'/'test' pair,
# not n separate subsets. It reshuffles internally, so it is seeded as well
# to keep the split reproducible.
n = 5
partitions = all_dataset.train_test_split(test_size=1/n, seed=42)
partitions


## Dataset Formatting and Tokenization


### Dataset Formatting


In [None]:
import json
from tqdm.notebook import tqdm

In [None]:
def format_example(example: dict) -> dict:
    """Turn one instruction record into a prompt/answer pair.

    Args:
        example: Mapping with an "instruction" and an "output" entry, and
            optionally an "input" entry.

    Returns:
        A dict with two keys: "context" holds the prompt (the instruction
        line, an input line only when "input" is present and truthy, then
        the trailing answer cue) and "target" holds the "output" value.

    Raises:
        KeyError: If "instruction" or "output" is missing.
    """
    pieces = [f"Instruction: {example['instruction']}\n"]
    # A missing, empty or None input contributes no line at all.
    if example.get("input"):
        pieces.append(f"Input: {example['input']}\n")
    pieces.append("Answer: ")
    return {"context": "".join(pieces), "target": example["output"]}

In [None]:
# One plain dict per row of the shuffled dataset, with the three fields
# that format_example reads.
data_list = [
    {
        "instruction": row.instruction,
        "input": row.input,
        "output": row.output,
    }
    for row in all_dataset.to_pandas().itertuples()
]

In [None]:
# Save one formatted example per line as JSONL.
# Write to jsonl_path (set in the setup cell) rather than a separate
# hard-coded "../data/..." literal: that is the location the setup cell
# cleans up, and its folder is the one the setup cell creates.
with open(jsonl_path, 'w', encoding='utf-8') as f:
    for example in tqdm(data_list, desc="formatting.."):
        f.write(json.dumps(format_example(example)) + '\n')