In [1]:
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, DataCollatorForLanguageModeling
from peft import PeftModel, LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import load_dataset
import evaluate 
import os 
import numpy as np
from transformers import default_data_collator
from transformers import StoppingCriteria, StoppingCriteriaList

import warnings
warnings.filterwarnings("ignore")

In [2]:
model_name = "EleutherAI/pythia-410m"
saved_path = "./pythia-lora-final"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
data_files = {
    "train": [
        # "data/datascience_2000.jsonl",
        "data/datascience_1000_multistep.jsonl",
        "data/datascience_4000_multistep.jsonl",
        # "data/datascience_1000_errors.jsonl"
    ]
}

In [4]:
raw_ds = load_dataset("json", data_files=data_files, split="train")

Generating train split: 0 examples [00:00, ? examples/s]

In [5]:
len(raw_ds)

5000

In [6]:
raw_ds[0]

{'instruction': "Read CSV file 'time_series.csv' into DataFrame data, drop duplicates, fill missing values in 'quantity' with median, group by 'department' and plot average 'speed'.",
 'output': "import pandas as pd\nimport matplotlib.pyplot as plt\ndata = pd.read_csv('time_series.csv')\ndata = data.drop_duplicates()\ndata['quantity'] = data['quantity'].fillna(data['quantity'].median())\navg = data.groupby('department')['speed'].mean()\navg.plot(kind='bar')\nplt.title('Average speed by department')\nplt.show()"}

In [7]:
tokenizer = AutoTokenizer.from_pretrained(saved_path)
tokenizer.pad_token = tokenizer.eos_token

In [8]:
def format_example(ex):
    prompt = f"### Instruction:\n{ex['instruction']}\n\n### Response:\n{ex['output']}"
    prompt += tokenizer.eos_token
    return {"text": prompt}

In [9]:
dataset = raw_ds.map(format_example, remove_columns=raw_ds.column_names)
len(dataset)

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

5000

In [10]:
print(dataset[0])

{'text': "### Instruction:\nRead CSV file 'time_series.csv' into DataFrame data, drop duplicates, fill missing values in 'quantity' with median, group by 'department' and plot average 'speed'.\n\n### Response:\nimport pandas as pd\nimport matplotlib.pyplot as plt\ndata = pd.read_csv('time_series.csv')\ndata = data.drop_duplicates()\ndata['quantity'] = data['quantity'].fillna(data['quantity'].median())\navg = data.groupby('department')['speed'].mean()\navg.plot(kind='bar')\nplt.title('Average speed by department')\nplt.show()<|endoftext|>"}


In [11]:
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id == tokenizer.eos_token_id

def tokenize_fn(ex):
    return tokenizer(
        ex["text"],
        truncation=True,
        max_length=192,
        padding="max_length",  # will use pad_token_id that you already set
        return_attention_mask=True
    )

tokenized_dataset = dataset.map(tokenize_fn, batched=True, remove_columns=["text"])
dataloader = DataLoader(tokenized_dataset, batch_size=16, shuffle=True, collate_fn=default_data_collator)

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [12]:
#Load model
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

base_model = prepare_model_for_kbit_training(base_model)

In [13]:
instruction = "Load time series CSV 'data.csv', set 'date' as index, resample weekly average of 'sales', detect points > mean+2*std and annotate plot."
prompt_template = f"### Instruction:\n{instruction}\n\n### Response:\n"

In [14]:
class StopOnKeywords(StoppingCriteria):
    def __init__(self, tokenizer, stop_phrases, input_len):
        super().__init__()
        self.tokenizer = tokenizer
        self.stop_phrases = stop_phrases
        self.input_len = input_len

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        generated_text = self.tokenizer.decode(input_ids[0][self.input_len:], skip_special_tokens=True)
        for stop_phrase in self.stop_phrases:
            if stop_phrase in generated_text:
                return True
        return False

# Base Model Generation

In [15]:
stop_phrases = ["### Response:", "\n\n"]
inputs = tokenizer(prompt_template, return_tensors="pt").to("cuda")
input_len = inputs["input_ids"].shape[1]

stopping_criteria = StoppingCriteriaList([
    StopOnKeywords(tokenizer, stop_phrases, input_len)
])
tokens = base_model.generate(**inputs, max_new_tokens=192, eos_token_id=tokenizer.eos_token_id, pad_token_id=tokenizer.eos_token_id, stopping_criteria=stopping_criteria)
print(tokenizer.decode(tokens[0]))

### Instruction:
Load time series CSV 'data.csv', set 'date' as index, resample weekly average of 'sales', detect points > mean+2*std and annotate plot.

### Response:

```




# Lora Model V1: Generation

In [16]:
model = PeftModel.from_pretrained(base_model, saved_path)
model = model.to(device)

model.enable_input_require_grads()
model.config.use_cache = False

inputs = tokenizer(prompt_template, return_tensors="pt").to("cuda")
tokens = base_model.generate(**inputs, max_new_tokens=192, eos_token_id=tokenizer.eos_token_id, pad_token_id=tokenizer.eos_token_id, stopping_criteria=stopping_criteria)
print(tokenizer.decode(tokens[0]))

### Instruction:
Load time series CSV 'data.csv', set 'date' as index, resample weekly average of 'sales', detect points > mean+2*std and annotate plot.

### Response:
import pandas as pd
import matplotlib.pyplot as plt
df_data = pd.read_csv('data.csv', parse_dates=['date'], index_col='date')
weekly = df_data['sales'].resample('W').mean()
mean = weekly.mean(); std = weekly.std()
signal = weekly[weekly > mean + 2*std]
plt.plot(weekly.index, weekly)
plt.scatter(signal.index, signal, color='red')
plt.title('Weekly sales with Anomalies')
plt.show()
odf_data.to_csv('omal_data.csv', index=False)
plt.show()
plt.show()
odf_data.to_csv('omal_data.csv', index=False)
plt. finish()
odf_


# Training loop

In [17]:
lora_cfg = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["query_key_value"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(base_model, lora_cfg)
print(model.get_memory_footprint()/1e6)

567.517216


In [18]:
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
epochs = 10

In [19]:
from tqdm import tqdm

for epoch in range(epochs):
    total_loss = 0
    print(f"\nEpoch {epoch + 1}/{epochs}")
    
    progress_bar = tqdm(enumerate(dataloader), total=len(dataloader), desc=f"Epoch {epoch+1}")
    
    for step, batch in progress_bar:
        model.train()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = input_ids.clone()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        optimizer.zero_grad()

        total_loss += loss.item()

        # Update tqdm bar with current loss
        progress_bar.set_postfix({"loss": f"{loss.item():.4f}"})

    model.eval()
    inputs = tokenizer(prompt_template, return_tensors="pt").to(device)
    with torch.no_grad():
        tokens = model.generate(**inputs, max_new_tokens=192, eos_token_id=tokenizer.eos_token_id, pad_token_id=tokenizer.eos_token_id, stopping_criteria=stopping_criteria)
    print("=== Inference Output ===")
    print(tokenizer.decode(tokens[0], skip_special_tokens=True))
    print("========================")

    avg_loss = total_loss / len(dataloader)
    print(f"\nEpoch {epoch+1} completed. Avg loss: {avg_loss:.4f}")



Epoch 1/10


Epoch 1: 100%|██████████████████████████████████████████████████████████| 313/313 [36:38<00:00,  7.02s/it, loss=0.0907]


=== Inference Output ===
### Instruction:
Load time series CSV 'data.csv', set 'date' as index, resample weekly average of 'sales', detect points > mean+2*std and annotate plot.

### Response:
import pandas as pd
import matplotlib.pyplot as plt
df1 = pd.read_csv('data.csv', parse_dates=['date'], index_col='date')
weekly = df1['sales'].resample('W').mean()
mean = weekly.mean(); std = weekly.std()
signal = weekly[weekly > mean + 2*std]
plt.plot(weekly.index, weekly)
plt.scatter(signal.index, signal, color='red')
plt.title('Weekly sales with Anomalies')
plt.show()

Epoch 1 completed. Avg loss: 0.3139

Epoch 2/10


Epoch 2: 100%|██████████████████████████████████████████████████████████| 313/313 [36:51<00:00,  7.07s/it, loss=0.0845]


=== Inference Output ===
### Instruction:
Load time series CSV 'data.csv', set 'date' as index, resample weekly average of 'sales', detect points > mean+2*std and annotate plot.

### Response:
import pandas as pd
import matplotlib.pyplot as plt
df1 = pd.read_csv('data.csv', parse_dates=['date'], index_col='date')
weekly = df1['sales'].resample('W').mean()
mean = weekly.mean(); std = weekly.std()
signal = weekly[weekly > mean + 2*std]
plt.plot(weekly.index, weekly)
plt.scatter(signal.index, signal, color='red')
plt.title('Weekly sales with Anomalies')
plt.show()

Epoch 2 completed. Avg loss: 0.0879

Epoch 3/10


Epoch 3: 100%|██████████████████████████████████████████████████████████| 313/313 [36:53<00:00,  7.07s/it, loss=0.0801]


=== Inference Output ===
### Instruction:
Load time series CSV 'data.csv', set 'date' as index, resample weekly average of 'sales', detect points > mean+2*std and annotate plot.

### Response:
import pandas as pd
import matplotlib.pyplot as plt
df1 = pd.read_csv('data.csv', parse_dates=['date'], index_col='date')
weekly = df1['sales'].resample('W').mean()
mean = weekly.mean(); std = weekly.std()
signal = weekly[weekly > mean + 2*std]
plt.plot(weekly.index, weekly)
plt.scatter(signal.index, signal, color='red')
plt.title('Weekly sales with Anomalies')
plt.show()

Epoch 3 completed. Avg loss: 0.0816

Epoch 4/10


Epoch 4: 100%|██████████████████████████████████████████████████████████| 313/313 [36:53<00:00,  7.07s/it, loss=0.0855]


=== Inference Output ===
### Instruction:
Load time series CSV 'data.csv', set 'date' as index, resample weekly average of 'sales', detect points > mean+2*std and annotate plot.

### Response:
import pandas as pd
import matplotlib.pyplot as plt
data = pd.read_csv('data.csv', parse_dates=['date'], index_col='date')
weekly = data['sales'].resample('W').mean()
mean = weekly.mean(); std = weekly.std()
signal = weekly[weekly > mean + 2*std]
plt.plot(weekly.index, weekly)
plt.scatter(signal.index, signal, color='red')
plt.title('Weekly sales with Anomalies')
plt.show()

Epoch 4 completed. Avg loss: 0.0793

Epoch 5/10


Epoch 5: 100%|████████████████████████████████████████████████████████| 313/313 [8:13:56<00:00, 94.69s/it, loss=0.0803]


=== Inference Output ===
### Instruction:
Load time series CSV 'data.csv', set 'date' as index, resample weekly average of 'sales', detect points > mean+2*std and annotate plot.

### Response:
import pandas as pd
import matplotlib.pyplot as plt
df = pd.read_csv('data.csv', parse_dates=['date'], index_col='date')
weekly = df['sales'].resample('W').mean()
mean = weekly.mean(); std = weekly.std()
signal = weekly[weekly > mean + 2*std]
plt.plot(weekly.index, weekly)
plt.scatter(signal.index, signal, color='red')
plt.title('Weekly sales with Anomalies')
plt.show()

Epoch 5 completed. Avg loss: 0.0772

Epoch 6/10


Epoch 6: 100%|██████████████████████████████████████████████████████████| 313/313 [21:45<00:00,  4.17s/it, loss=0.0707]


=== Inference Output ===
### Instruction:
Load time series CSV 'data.csv', set 'date' as index, resample weekly average of 'sales', detect points > mean+2*std and annotate plot.

### Response:
import pandas as pd
import matplotlib.pyplot as plt
df1 = pd.read_csv('data.csv', parse_dates=['date'], index_col='date')
weekly = df1['sales'].resample('W').mean()
mean = weekly.mean(); std = weekly.std()
signal = weekly[weekly > mean + 2*std]
plt.plot(weekly.index, weekly)
plt.scatter(signal.index, signal, color='red')
plt.title('Weekly sales with Anomalies')
plt.show()

Epoch 6 completed. Avg loss: 0.0767

Epoch 7/10


Epoch 7: 100%|██████████████████████████████████████████████████████████| 313/313 [21:48<00:00,  4.18s/it, loss=0.0790]


=== Inference Output ===
### Instruction:
Load time series CSV 'data.csv', set 'date' as index, resample weekly average of 'sales', detect points > mean+2*std and annotate plot.

### Response:
import pandas as pd
import matplotlib.pyplot as plt
df_data = pd.read_csv('data.csv', parse_dates=['date'], index_col='date')
weekly = df_data['sales'].resample('W').mean()
mean = weekly.mean(); std = weekly.std()
signal = weekly[weekly > mean + 2*std]
plt.plot(weekly.index, weekly)
plt.scatter(signal.index, signal, color='red')
plt.title('Weekly sales with Anomalies')
plt.show()

Epoch 7 completed. Avg loss: 0.0759

Epoch 8/10


Epoch 8: 100%|██████████████████████████████████████████████████████████| 313/313 [21:47<00:00,  4.18s/it, loss=0.0725]


=== Inference Output ===
### Instruction:
Load time series CSV 'data.csv', set 'date' as index, resample weekly average of 'sales', detect points > mean+2*std and annotate plot.

### Response:
import pandas as pd
import matplotlib.pyplot as plt
df1 = pd.read_csv('data.csv', parse_dates=['date'], index_col='date')
weekly = df1['sales'].resample('W').mean()
mean = weekly.mean(); std = weekly.std()
signal = weekly[weekly > mean + 2*std]
plt.plot(weekly.index, weekly)
plt.scatter(signal.index, signal, color='red')
plt.title('Weekly sales with Anomalies')
plt.show()

Epoch 8 completed. Avg loss: 0.0757

Epoch 9/10


Epoch 9: 100%|██████████████████████████████████████████████████████████| 313/313 [21:46<00:00,  4.17s/it, loss=0.0758]


=== Inference Output ===
### Instruction:
Load time series CSV 'data.csv', set 'date' as index, resample weekly average of 'sales', detect points > mean+2*std and annotate plot.

### Response:
import pandas as pd
import matplotlib.pyplot as plt
data_clean = pd.read_csv('data.csv', parse_dates=['date'], index_col='date')
weekly = data_clean['sales'].resample('W').mean()
mean = weekly.mean(); std = weekly.std()
signal = weekly[weekly > mean + 2*std]
plt.plot(weekly.index, weekly)
plt.scatter(signal.index, signal, color='red')
plt.title('Weekly sales with Anomalies')
plt.show()

Epoch 9 completed. Avg loss: 0.0763

Epoch 10/10


Epoch 10: 100%|█████████████████████████████████████████████████████████| 313/313 [21:43<00:00,  4.17s/it, loss=0.0744]


=== Inference Output ===
### Instruction:
Load time series CSV 'data.csv', set 'date' as index, resample weekly average of 'sales', detect points > mean+2*std and annotate plot.

### Response:
import pandas as pd
import matplotlib.pyplot as plt
dataset = pd.read_csv('data.csv', parse_dates=['date'], index_col='date')
weekly = dataset['sales'].resample('W').mean()
mean = weekly.mean(); std = weekly.std()
signal = weekly[weekly > mean + 2*std]
plt.plot(weekly.index, weekly)
plt.scatter(signal.index, signal, color='red')
plt.title('Weekly sales with Anomalies')
plt.show()

Epoch 10 completed. Avg loss: 0.0749


In [20]:
# 6. Save Fine-Tuned Weights
model.save_pretrained("./pythia-lora-V3")
tokenizer.save_pretrained("./pythia-lora-V3")

('./pythia-lora-V3\\tokenizer_config.json',
 './pythia-lora-V3\\special_tokens_map.json',
 './pythia-lora-V3\\tokenizer.json')