In [1]:
%%bash
# Environment setup for the notebook.
# Flags used on both installs: --no-cache-dir bypasses pip's local cache,
# -q reduces output, -U upgrades to the newest available release (nothing is pinned).
# Upgrade pip itself first.
pip install --no-cache-dir -qU pip
# jsonlines is imported by the prompt-building cells to write the .jsonl files.
pip install --no-cache-dir -qU jsonlines
# Report any installed packages whose dependency requirements are not satisfied.
pip check

No broken requirements found.


In [2]:
import pandas as pd

# Columns removed before the invoice history is turned into prompts.
unused_columns = [
    'countryCode', 'PaperlessDate', 'invoiceNumber', 'DueDate',
    'Disputed', 'SettledDate', 'PaperlessBill', 'DaysLate',
]

# Load the raw invoices and reshape them in a single chain:
#   * InvoiceDate   -> datetime.date objects parsed from month/day/year text
#   * InvoiceAmount -> display string such as "$1,234.50"
#   * DaysToSettle  -> display string such as "61 days"
# then drop the unused columns and order the rows per customer by invoice date.
df_invoice = (
    pd.read_csv('WA_Fn-UseC_-Accounts-Receivable.csv')
    .assign(
        InvoiceDate=lambda frame: pd.to_datetime(
            frame['InvoiceDate'], format='%m/%d/%Y'
        ).dt.date,
        InvoiceAmount=lambda frame: frame['InvoiceAmount'].apply(
            lambda amount: "${:,.2f}".format(amount)
        ),
        DaysToSettle=lambda frame: frame['DaysToSettle'].apply(
            lambda days: f"{days} days"
        ),
    )
    .drop(columns=unused_columns)
    .sort_values(by=['customerID', 'InvoiceDate'])
    .reset_index(drop=True)
)

# Spot-check the cleaned history of one customer.
is_sample_customer = df_invoice['customerID'] == '2621-XCLEH'
print(df_invoice[is_sample_customer])

     customerID InvoiceDate InvoiceAmount DaysToSettle
401  2621-XCLEH  2012-01-13        $80.99      61 days
402  2621-XCLEH  2012-02-21        $79.51      46 days
403  2621-XCLEH  2012-02-22        $69.80      43 days
404  2621-XCLEH  2012-03-02        $67.51      57 days
405  2621-XCLEH  2012-03-23        $89.05      52 days
406  2621-XCLEH  2012-04-16        $74.06      44 days
407  2621-XCLEH  2012-06-27        $69.42      60 days
408  2621-XCLEH  2012-11-18        $86.39      75 days
409  2621-XCLEH  2013-03-01        $58.96      55 days
410  2621-XCLEH  2013-03-31        $70.93      55 days
411  2621-XCLEH  2013-04-27        $65.76      37 days
412  2621-XCLEH  2013-06-18        $37.49      29 days
413  2621-XCLEH  2013-06-24        $90.62      35 days
414  2621-XCLEH  2013-07-16        $78.08      48 days
415  2621-XCLEH  2013-07-28        $92.17      46 days


In [3]:
import json

import jsonlines
import pandas as pd

def create_train_prompt(x):
    """Build the training instruction and expected response for one customer.

    The second-to-last row of ``x`` is the invoice to forecast. All rows before
    it are rendered as the history table between the triple backticks; the last
    row of ``x`` is used by neither part.

    Args:
        x: DataFrame with 'customerID', 'InvoiceDate', 'InvoiceAmount' and
            'DaysToSettle' columns. Row order is taken as given; this function
            does no sorting or grouping of its own.

    Returns:
        pd.Series indexed by 'instruction' (the prompt text) and 'response'
        (a dict holding the four fields of the target invoice as strings).

    Raises:
        IndexError: if ``x`` has fewer than two rows, so there is no target row.
    """
    target = x.iloc[-2:-1]  # one-row frame holding the invoice to forecast
    history = x.iloc[:-2]   # everything that precedes the target invoice

    customer_id = target['customerID'].values[0]
    invoice_date = target['InvoiceDate'].values[0]
    invoice_amount = target['InvoiceAmount'].values[0]

    instruction = (
        "Given the dataset delimited by the triple backticks, forecast number of days client "
        f"{customer_id} will take for the payment of an invoice dated "
        f"{invoice_date} with an amount {invoice_amount} "
        "to be settled. Return the response in JSON format, containing four keys: "
        "'customerID', 'InvoiceDate', 'InvoiceAmount' and 'DaysToSettle'. "
        "Return only the forecast, not the Python code.\n"
        "\n"
        "```\n"
        f"{history.to_string(index=False)}\n"
        "```"
    )

    # Every field of the target row is stored as text, keyed by its column name.
    response = {
        column: f"{target[column].values[0]}"
        for column in ('customerID', 'InvoiceDate', 'InvoiceAmount', 'DaysToSettle')
    }

    return pd.Series(
        {'instruction': instruction, 'response': response},
        index=['instruction', 'response'],
    )

# One training example per customer, as a frame with 'instruction' and 'response'
# columns. The groups are iterated explicitly instead of going through
# ``groupby(...).apply`` so that every group frame still contains the ``customerID``
# column that ``create_train_prompt`` reads: letting ``apply`` operate on the
# grouping column is deprecated since pandas 2.2 and no longer happens in pandas 3.
invoices_by_customer = df_invoice\
    .sort_values(by=['customerID', 'InvoiceDate'])\
    .groupby('customerID')
df_train_prompt = pd.DataFrame(
    [create_train_prompt(customer_invoices) for _, customer_invoices in invoices_by_customer]
).reset_index(drop=True)

# Instruction/response wrapper applied to every training example.
train_prompt_template = """\
Below is an instruction that describes a task. Write a response \
that appropriately completes the request.

### Instruction:
{instruction}

### Response:
{response}\
"""

# The instruction asks the model to answer in JSON, so the target dict is
# serialised with ``json.dumps``. Formatting the dict directly would insert
# Python's repr (single-quoted keys and values), which is not valid JSON.
train_prompts = []
for example in df_train_prompt.itertuples(index=False):
    prompt = train_prompt_template.format(
        instruction=example.instruction,
        response=json.dumps(example.response)
    )
    train_prompts.append(prompt)

# Each prompt string becomes one line of the JSON Lines file.
with jsonlines.open('train_prompt.jsonl', 'w') as writer:
    writer.write_all(train_prompts)

In [4]:
train_prompts[17]

"Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nGiven the dataset delimited by the triple backticks, forecast number of days client 2621-XCLEH will take for the payment of an invoice dated 2013-07-16 with an amount $78.08 to be settled. Return the response in JSON format, containing four keys: 'customerID', 'InvoiceDate', 'InvoiceAmount' and 'DaysToSettle'. Return only the forecast, not the Python code.\n\n```\ncustomerID InvoiceDate InvoiceAmount DaysToSettle\n2621-XCLEH  2012-01-13        $80.99      61 days\n2621-XCLEH  2012-02-21        $79.51      46 days\n2621-XCLEH  2012-02-22        $69.80      43 days\n2621-XCLEH  2012-03-02        $67.51      57 days\n2621-XCLEH  2012-03-23        $89.05      52 days\n2621-XCLEH  2012-04-16        $74.06      44 days\n2621-XCLEH  2012-06-27        $69.42      60 days\n2621-XCLEH  2012-11-18        $86.39      75 days\n2621-XCLEH  2013-03-01        $58.96      55 d

In [5]:
import pandas as pd
import jsonlines

def create_test_prompt(x):
    """Build the test-time instruction for one customer.

    The last row of ``x`` is the invoice to forecast; all earlier rows are
    rendered as the history table between the triple backticks. No expected
    response is produced.

    Args:
        x: DataFrame with at least 'customerID', 'InvoiceDate' and
            'InvoiceAmount' columns. Row order is taken as given; this function
            does no sorting or grouping of its own.

    Returns:
        pd.Series with a single entry, 'instruction', holding the prompt text.

    Raises:
        IndexError: if ``x`` has no rows.
    """
    target = x.iloc[-1:]   # one-row frame holding the invoice to forecast
    history = x.iloc[:-1]  # everything that precedes the target invoice

    customer_id = target['customerID'].values[0]
    invoice_date = target['InvoiceDate'].values[0]
    invoice_amount = target['InvoiceAmount'].values[0]

    instruction = (
        "Given the dataset delimited by the triple backticks, forecast number of days client "
        f"{customer_id} will take for the payment of an invoice dated "
        f"{invoice_date} with an amount {invoice_amount} "
        "to be settled. Return the response in JSON format, containing four keys: "
        "'customerID', 'InvoiceDate', 'InvoiceAmount' and 'DaysToSettle'. "
        "Return only the forecast, not the Python code.\n"
        "\n"
        "```\n"
        f"{history.to_string(index=False)}\n"
        "```"
    )

    return pd.Series({'instruction': instruction}, index=['instruction'])

# One test instruction per customer, as a frame with an 'instruction' column.
# The groups are iterated explicitly instead of going through
# ``groupby(...).apply`` so that every group frame still contains the ``customerID``
# column that ``create_test_prompt`` reads: letting ``apply`` operate on the
# grouping column is deprecated since pandas 2.2 and no longer happens in pandas 3.
invoices_by_customer = df_invoice\
    .sort_values(by=['customerID', 'InvoiceDate'])\
    .groupby('customerID')
df_test_prompt = pd.DataFrame(
    [create_test_prompt(customer_invoices) for _, customer_invoices in invoices_by_customer]
).reset_index(drop=True)

# Same wrapper as the training template, but the response section is left open
# for the model to complete.
test_prompt_template = """\
Below is an instruction that describes a task. Write a response \
that appropriately completes the request.

### Instruction:
{instruction}

### Response:\
"""

# NOTE: the loop variable is deliberately still called ``prompt``; the model and
# tokenizer cells further down read ``prompt`` after this loop has finished.
test_prompts = []
for example in df_test_prompt.itertuples(index=False):
    prompt = test_prompt_template.format(
        instruction=example.instruction
    )
    test_prompts.append(prompt)

# Each prompt string becomes one line of the JSON Lines file.
with jsonlines.open('test_prompt.jsonl', 'w') as writer:
    writer.write_all(test_prompts)

In [6]:
test_prompts[17]

"Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nGiven the dataset delimited by the triple backticks, forecast number of days client 2621-XCLEH will take for the payment of an invoice dated 2013-07-28 with an amount $92.17 to be settled. Return the response in JSON format, containing four keys: 'customerID', 'InvoiceDate', 'InvoiceAmount' and 'DaysToSettle'. Return only the forecast, not the Python code.\n\n```\ncustomerID InvoiceDate InvoiceAmount DaysToSettle\n2621-XCLEH  2012-01-13        $80.99      61 days\n2621-XCLEH  2012-02-21        $79.51      46 days\n2621-XCLEH  2012-02-22        $69.80      43 days\n2621-XCLEH  2012-03-02        $67.51      57 days\n2621-XCLEH  2012-03-23        $89.05      52 days\n2621-XCLEH  2012-04-16        $74.06      44 days\n2621-XCLEH  2012-06-27        $69.42      60 days\n2621-XCLEH  2012-11-18        $86.39      75 days\n2621-XCLEH  2013-03-01        $58.96      55 d

In [None]:
from llama import BasicModelRunner

# Baseline run: pass the prompt to the Llama-2 7B chat model through
# BasicModelRunner and print whatever the runner returns.
# NOTE(review): `prompt` is not assigned in this cell; it is the loop variable left
# over from the prompt-building loops, so those cells must have run first.
llama2_chat_model = "meta-llama/Llama-2-7b-chat-hf"
chat_llama2 = BasicModelRunner(llama2_chat_model)
llama2_output = chat_llama2(prompt)
print(llama2_output)

In [None]:
from llama import BasicModelRunner

# Model id for the small baseline; the tokenizer cell below loads the same id.
pretrained_model = "EleutherAI/pythia-70m"

# Run the same prompt through the model before any fine-tuning and print the output.
# NOTE(review): `prompt` is the loop variable left over from the prompt-building
# loops, so those cells must have run first.
non_finetuned = BasicModelRunner(pretrained_model)
non_finetuned_output = non_finetuned(prompt)
print(non_finetuned_output)

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer for the model id chosen in the baseline cell above.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model)

# Round trip: prompt text -> token ids -> text again.
# NOTE(review): `prompt` is the loop variable left over from the prompt-building
# loops, so those cells must have run first.
tokenized_prompt = tokenizer(prompt)
encoded_text = tokenized_prompt["input_ids"]
print("Encoded texts into tokens:\n", encoded_text)

decoded_text = tokenizer.decode(encoded_text)
print("Decoded tokens back into texts:\n", decoded_text)