In [1]:
%%bash
pip install --no-cache-dir -qU pip
pip install --no-cache-dir -qU \
    jsonlines datasets transformers

In [2]:
import pandas as pd

df_invoice = pd.read_csv('WA_Fn-UseC_-Accounts-Receivable.csv')
df_invoice['InvoiceDate'] = pd.to_datetime(df_invoice['InvoiceDate'], format='%m/%d/%Y')\
    .dt.date
df_invoice['InvoiceAmount'] = df_invoice['InvoiceAmount']\
    .apply(lambda x: "${:,.2f}".format(x))
df_invoice['DaysToSettle'] = df_invoice['DaysToSettle'].apply(lambda x: f"{x} days")

df_invoice.drop(columns=['countryCode', 'PaperlessDate', 'invoiceNumber', 'DueDate', 
    'Disputed', 'SettledDate', 'PaperlessBill', 'DaysLate'], inplace=True)
df_invoice.sort_values(by=['customerID', 'InvoiceDate'], inplace=True)
df_invoice.reset_index(inplace=True, drop=True)

df_invoice[df_invoice['customerID']=='2621-XCLEH']

Unnamed: 0,customerID,InvoiceDate,InvoiceAmount,DaysToSettle
401,2621-XCLEH,2012-01-13,$80.99,61 days
402,2621-XCLEH,2012-02-21,$79.51,46 days
403,2621-XCLEH,2012-02-22,$69.80,43 days
404,2621-XCLEH,2012-03-02,$67.51,57 days
405,2621-XCLEH,2012-03-23,$89.05,52 days
406,2621-XCLEH,2012-04-16,$74.06,44 days
407,2621-XCLEH,2012-06-27,$69.42,60 days
408,2621-XCLEH,2012-11-18,$86.39,75 days
409,2621-XCLEH,2013-03-01,$58.96,55 days
410,2621-XCLEH,2013-03-31,$70.93,55 days


In [3]:
prompt_template = """\
Below is an instruction that describes a task. \
Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Response:
"""

In [4]:
import pandas as pd
import jsonlines

def create_train_prompt(x):
    d = {}
    d['instruction'] = f"""\
Given the dataset delimited by the triple backticks, forecast number of days client \
{x['customerID'][-2:-1].values[0]} will take for the payment of an invoice dated \
{x['InvoiceDate'][-2:-1].values[0]} with an amount {x['InvoiceAmount'][-2:-1].values[0]} \
to be settled. Return the response in JSON format, containing four keys: \
'customerID', 'InvoiceDate', 'InvoiceAmount' and 'DaysToSettle'. \
Return only the forecast, not the Python code.

```
{x[:-2].to_string(index=False)}
```\
"""
    d['response'] = f"""\
{{
    "customerID": {x['customerID'][-2:-1].values[0]},
    "InvoiceDate": {x['InvoiceDate'][-2:-1].values[0]},
    "InvoiceAmount": {x['InvoiceAmount'][-2:-1].values[0]},
    "DaysToSettle": {x['DaysToSettle'][-2:-1].values[0]}
}}\
"""

    return pd.Series(d, index=['instruction', 'response'])

df_train_prompt = df_invoice.copy()\
    .sort_values(by=['customerID', 'InvoiceDate'])\
    .groupby('customerID')\
    .apply(create_train_prompt)\
    .reset_index()\
    .drop(columns=['customerID'])

train_prompts = []
for i in range(len(df_train_prompt)):
    prompt = prompt_template.format(
        instruction=df_train_prompt.loc[i, 'instruction']
    )
    train_prompts.append({
        "input": prompt,
        "output": df_train_prompt.loc[i, 'response']
    })

with jsonlines.open('train_prompt.jsonl', 'w') as writer:
    writer.write_all(train_prompts)

In [5]:
import pandas as pd
import jsonlines

def create_test_prompt(x):
    d = {}
    d['instruction'] = f"""\
Given the dataset delimited by the triple backticks, forecast number of days client \
{x['customerID'][-1:].values[0]} will take for the payment of an invoice dated \
{x['InvoiceDate'][-1:].values[0]} with an amount {x['InvoiceAmount'][-1:].values[0]} \
to be settled. Return the response in JSON format, containing four keys: \
'customerID', 'InvoiceDate', 'InvoiceAmount' and 'DaysToSettle'. \
Return only the forecast, not the Python code.

```
{x[:-1].to_string(index=False)}
```\
"""
    d['response'] = f"""\
{{
    "customerID": {x['customerID'][-1:].values[0]},
    "InvoiceDate": {x['InvoiceDate'][-1:].values[0]},
    "InvoiceAmount": {x['InvoiceAmount'][-1:].values[0]},
    "DaysToSettle": {x['DaysToSettle'][-1:].values[0]}
}}\
"""

    return pd.Series(d, index=['instruction', 'response'])

df_test_prompt = df_invoice.copy()\
    .sort_values(by=['customerID', 'InvoiceDate'])\
    .groupby('customerID')\
    .apply(create_test_prompt)\
    .reset_index()\
    .drop(columns=['customerID'])

test_prompts = []
for i in range(len(df_test_prompt)):
    prompt = prompt_template.format(
        instruction=df_test_prompt.loc[i, 'instruction']
    )
    test_prompts.append({
        "input": prompt,
        "output": df_test_prompt.loc[i, 'response']
    })

with jsonlines.open('test_prompt.jsonl', 'w') as writer:
    writer.write_all(test_prompts)

In [6]:
import datasets

finetune_dataset  = datasets.load_dataset("json", data_files={
    "train": "train_prompt.jsonl",
    "test": "test_prompt.jsonl"
})
print(finetune_dataset)

DatasetDict({
    train: Dataset({
        features: ['input', 'output'],
        num_rows: 100
    })
    test: Dataset({
        features: ['input', 'output'],
        num_rows: 100
    })
})


In [7]:
finetune_dataset['test'][17]

{'input': "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nGiven the dataset delimited by the triple backticks, forecast number of days client 2621-XCLEH will take for the payment of an invoice dated 2013-07-28 with an amount $92.17 to be settled. Return the response in JSON format, containing four keys: 'customerID', 'InvoiceDate', 'InvoiceAmount' and 'DaysToSettle'. Return only the forecast, not the Python code.\n\n```\ncustomerID InvoiceDate InvoiceAmount DaysToSettle\n2621-XCLEH  2012-01-13        $80.99      61 days\n2621-XCLEH  2012-02-21        $79.51      46 days\n2621-XCLEH  2012-02-22        $69.80      43 days\n2621-XCLEH  2012-03-02        $67.51      57 days\n2621-XCLEH  2012-03-23        $89.05      52 days\n2621-XCLEH  2012-04-16        $74.06      44 days\n2621-XCLEH  2012-06-27        $69.42      60 days\n2621-XCLEH  2012-11-18        $86.39      75 days\n2621-XCLEH  2013-03-01        $58.96

In [24]:
import torch
import logging
logger = logging.getLogger(__name__)

device_count = torch.cuda.device_count()
if device_count > 0:
    logger.debug('Select GPU device')
    device = torch.device('cuda')
else:
    logger.debug('Select CPU device')
    device = torch.device('cpu')

In [25]:
from transformers import AutoTokenizer, AutoModelForCausalLM

pretrained_llm = "EleutherAI/pythia-70m"

base_model = AutoModelForCausalLM.from_pretrained(pretrained_llm)
base_model.to(device)
tokenizer = AutoTokenizer.from_pretrained(pretrained_llm)

In [26]:
base_model

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 512)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=512, out_features=1536, bias=True)
          (dense): Linear(in_features=512, out_features=512, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=512, out_features=2048, bias=True)
          (dense_4h_to_h): Linear(in_features=2048, out_features=512, bias=True)
          (a

In [12]:
text = finetune_dataset['test'][17]['input'] \
    + finetune_dataset['test'][17]['output']

encoded_text = tokenizer(text)['input_ids']
print("Encoded texts into tokens:\n", encoded_text)

decoded_text = tokenizer.decode(encoded_text)
print("Decoded tokens back into texts:\n", decoded_text)

Encoded texts into tokens:
 [30003, 310, 271, 9775, 326, 8631, 247, 4836, 15, 19566, 247, 2380, 326, 20420, 29141, 253, 2748, 15, 187, 187, 4118, 41959, 27, 187, 15768, 253, 10895, 38352, 959, 407, 253, 16260, 896, 3028, 661, 13, 16923, 1180, 273, 1897, 5268, 3436, 1797, 14, 57, 28040, 41, 588, 1379, 323, 253, 7830, 273, 271, 45156, 15483, 4072, 14, 2922, 14, 1619, 342, 271, 2408, 370, 4529, 15, 1166, 281, 320, 11371, 15, 16140, 253, 2380, 275, 13922, 5981, 13, 4508, 1740, 10149, 27, 686, 34590, 1838, 1383, 686, 688, 22619, 6958, 1383, 686, 688, 22619, 35277, 8, 285, 686, 41430, 1992, 52, 35189, 5983, 16140, 760, 253, 16923, 13, 417, 253, 13814, 2127, 15, 187, 187, 11202, 187, 34590, 1838, 49427, 6958, 49427, 35277, 23264, 1992, 52, 35189, 187, 1731, 1797, 14, 57, 28040, 41, 50276, 6755, 14, 520, 14, 1012, 50270, 5, 1438, 15, 1525, 50272, 3832, 1897, 187, 1731, 1797, 14, 57, 28040, 41, 50276, 6755, 14, 2640, 14, 1797, 50270, 5, 2787, 15, 3712, 50272, 2950, 1897, 187, 1731, 1797, 14, 57

In [13]:
import numpy as np

test_dataset = finetune_dataset['test']
input_lengths, output_lengths = [], []
for i in range(len(test_dataset)):
    tokenized_input = tokenizer(test_dataset[i]['input'])
    input_lengths.append(len(tokenized_input['input_ids']))
    
    tokenized_output = tokenizer(test_dataset[i]['output'])
    output_lengths.append(len(tokenized_output['input_ids']))

print("Max input tokens:", np.max(input_lengths))
print("Max output tokens:", np.max(output_lengths))

Max input tokens: 885
Max output tokens: 53


In [17]:
def tokenize_text(row):
    text = row['input'][0] + row['output'][0]
    tokenizer.pad_token = tokenizer.eos_token
    tokenized_inputs = tokenizer(text, return_tensors='np', padding=True)
    max_length = min(2048, tokenized_inputs['input_ids'].shape[1])
    tokenizer.truncation_side = 'right'
    tokenized_inputs = tokenizer(text, return_tensors='np', truncation=True,
        max_length=max_length)

    return tokenized_inputs

tokenized_dataset = finetune_dataset.map(
    tokenize_text, batched=True, batch_size=1, drop_last_batch=True
)

print(tokenized_dataset)

DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'input_ids', 'attention_mask'],
        num_rows: 100
    })
    test: Dataset({
        features: ['input', 'output', 'input_ids', 'attention_mask'],
        num_rows: 100
    })
})


In [19]:
def inference(text, model, tokenizer,
        max_input_tokens=1000, max_output_tokens=1000):
    # Tokenize
    input_ids = tokenizer.encode(text, return_tensors='pt',
        truncation=True, max_length=max_input_tokens)
    # Generate
    device = model.device
    generated_tokens_with_prompt = model.generate(
        input_ids=input_ids.to(device),
        max_length=max_output_tokens
    )
    # Decode
    generated_text_with_prompt = tokenizer.batch_decode(
        generated_tokens_with_prompt, skip_special_tokens=True)
    # Strip the prompt
    generated_text_answer = generated_text_with_prompt[0][len(text):]
    
    return generated_text_answer

In [22]:
test_sample = tokenized_dataset['test'][17]['input']
print(test_sample)
print(inference(test_sample, base_model, tokenizer))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Given the dataset delimited by the triple backticks, forecast number of days client 2621-XCLEH will take for the payment of an invoice dated 2013-07-28 with an amount $92.17 to be settled. Return the response in JSON format, containing four keys: 'customerID', 'InvoiceDate', 'InvoiceAmount' and 'DaysToSettle'. Return only the forecast, not the Python code.

```
customerID InvoiceDate InvoiceAmount DaysToSettle
2621-XCLEH  2012-01-13        $80.99      61 days
2621-XCLEH  2012-02-21        $79.51      46 days
2621-XCLEH  2012-02-22        $69.80      43 days
2621-XCLEH  2012-03-02        $67.51      57 days
2621-XCLEH  2012-03-23        $89.05      52 days
2621-XCLEH  2012-04-16        $74.06      44 days
2621-XCLEH  2012-06-27        $69.42      60 days
2621-XCLEH  2012-11-18        $86.39      75 days
2621-XCLEH  2013-03-01        $58.96      55 days
2621-XCLEH  