In [1]:
import torch
import transformers
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, PeftModel


tokenizer = AutoTokenizer.from_pretrained("mistralai/Mixtral-8x7B-Instruct-v0.1")
model = AutoModelForCausalLM.from_pretrained("mistralai/Mixtral-8x7B-Instruct-v0.1", load_in_4bit=True, torch_dtype=torch.float16, device_map="auto")


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/19 [00:00<?, ?it/s]

In [31]:
# Prepare model for k-bit training
model = prepare_model_for_kbit_training(model)
tokenizer.pad_token = "!"
CUTOFF_LEN = 128
LORA_R = 8
LORA_ALPHA = 2 * LORA_R
LORA_DROPOUT = 0.1

In [32]:
config = LoraConfig(r=LORA_R, lora_alpha=LORA_ALPHA, target_modules=[ "w1", "w2", "w3"], lora_dropout=LORA_DROPOUT, bias="none", task_type="CAUSAL_LM")
model = get_peft_model(model, config)

In [33]:
# dataset = load_dataset("harpreetsahota/modern-to-shakesperean-translation")
# print("dataset", dataset)
# train_data = dataset["train"]

# # Print the first few examples from the training dataset
# print("First few examples from the training dataset:")
# for i in range(3):  # Adjust the range to preview more or fewer examples
#     print(f"Example {i+1}: {train_data[i]}")


In [27]:
import csv
import json

# Function to load JSONs from a CSV file
def load_jsons_from_csv(filename):
    data = []  # This list will store all the JSON objects
    errors = []
    with open(filename, 'r', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:
            if row:  # Ensure the row is not empty
                # Parse JSON from the first column in each row (assuming each row is a JSON string)
                try:
                    json_str = ','.join(part for part in row if part.strip())
                    # print("Json string")
                    # print(json_str)
                    # print(type(json_str))
                    json_data = json.loads(json_str)
                    # print("Json data")
                    # print(json_data)
                    data_list = [json_data]  # Place the dictionary in a list as per your requirement
                    # print("Data list")
                    # print(data_list)
                    data.append(json_data)
                except json.JSONDecodeError:
                    errors.append(row)
                    pass
    print("Number of error rows: " + str(len(errors))) 
    return data, errors

# Example usage
filename = 'Jules_Dialogues_JSON.csv'
loaded_jsons, errors = load_jsons_from_csv(filename)
# print(loaded_jsons)
# for error in errors:
#     print(error)

# print()
# print()

# for data in loaded_jsons:
#     print(data)

print(len(loaded_jsons))

Number of error rows: 0
690


In [34]:
# {'role': 'user', 'content': '(yelling to all) Everybody be cool this is a robbery!'},
 # {'role': 'assistant', 'content': ' Okay now, tell me about the hash bars?'},

def generate_prompt(usercontent, assistant_content):
  sys_msg = "Respond to this in the form of Jules from Pulp Fiction"
  p = "<s> [INST]" + sys_msg +"\n"+ usercontent["content"] + "[/INST]" +  assistant_content["content"] + "</s>"
  return p 


tokenize = lambda prompt: tokenizer(prompt + tokenizer.eos_token, truncation=True, max_length=CUTOFF_LEN, padding="max_length")

train_data = []

print(len(loaded_jsons))

for row in range(0, len(loaded_jsons)-1, 2):
    # print("Hi")
    user_content = loaded_jsons[row]
    assistant_content = loaded_jsons[(row + 1)]
    prompt_row_data = tokenize(generate_prompt(user_content, assistant_content))
    train_data.append(prompt_row_data)

# train_data.shuffle()


# train_data = train_data.shuffle().map(lambda x: tokenize(generate_prompt(x)), remove_columns=["modern" , "shakespearean"])



690


In [35]:
# train_data

In [None]:

trainer = Trainer(
  model=model,
  train_dataset=train_data,
  args=TrainingArguments(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=2,
    learning_rate=1e-4,
    logging_steps=2,
    optim="adamw_torch",
    save_strategy="epoch",
    output_dir="mixtral-moe-lora-instruct-shapeskeare"
  ),
  data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
)


model.config.use_cache = False
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss
2,1.9121
4,1.8364
6,1.8289
8,1.5329
10,1.3722
12,1.4769
14,1.4507
16,1.3508
18,1.5918
20,1.7521




In [None]:
# def generate_prompt(usercontent, assistant_content):
#   sys_msg = "Respond to this in the form of Jules from Pulp Fiction"
#   p = "<s> [INST]" + sys_msg +"\n"+ usercontent["content"] + "[/INST]" +  assistant_content["content"] + "</s>"
#   return p 

# test = {'modern': "When someone says 'She's thirsty, ain't she?', they're implying she's seeking attention.", 'shakespearean': 'When one remarks, "She doth crave attention, doth she not?", they suggest her desire for notice.'}

# usercontent =  {"role": "user", "content": '(yelling to all) Everybody be cool this is a robbery!'}
# assistant_content = {'role': 'assistant', 'content': ' Okay now, tell me about the hash bars?'}
# print(type(usercontent))
# print(usercontent)  # This will show the actual data structure

# generate_prompt(usercontent, assistant_content)


In [None]:
# model

In [None]:
# def translate_to_shakespearean(text):
#     # Generate prompt
#     prompt = f"<s> [INST]Respond to this in the form of Jules from Pulp Fiction\n{text}[/INST]</s>"
    
#     # Tokenize the prompt
#     inputs = tokenizer(prompt, return_tensors='pt', truncation=True, max_length=CUTOFF_LEN, padding="max_length").to(model.device)
    
#     # Generate output tokens
#     with torch.no_grad():
#         outputs = model.generate(**inputs, max_length=CUTOFF_LEN)
    
#     # Decode generated tokens to text
#     translation = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
#     return translation

# # Example usage
# modern_text = "hello"
# print("Shakespearean translation:", translate_to_shakespearean(modern_text))

In [None]:
def respond_as_Jules(text):
    # Generate prompt
    prompt = f"<s> [INST]Respond to this in the form of Jules from Pulp Fiction\n{text}[/INST]</s>"
    
    # Tokenize the prompt
    inputs = tokenizer(prompt, return_tensors='pt', truncation=True, max_length=CUTOFF_LEN, padding="max_length").to(model.device)
    
    # Generate output tokens
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=CUTOFF_LEN)

    # Decode generated tokens to text
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    return response

# Example usage
user_quest = "Hi, how's it going"
print("Jules response:", respond_as_Jules(user_quest))