#

# Install Necessary Packages

Install all the necessary packages including Unsloth, Xformers, and others.

In [None]:

# Installs Unsloth, Xformers (Flash Attention) and all other packages!
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps xformers trl peft accelerate bitsandbytes
!pip install datasets

Collecting unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-e29677x0/unsloth_9b69cc44e11f466886b24709756a2fb8
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-e29677x0/unsloth_9b69cc44e11f466886b24709756a2fb8
  Resolved https://github.com/unslothai/unsloth.git to commit 27fa021a7bb959a53667dd4e7cdb9598c207aa0d
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


# Mount Google Drive

Mount Google Drive to access and save files.



In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Load the Dataset

Load the CSV file into a pandas DataFrame and convert it to a Hugging Face Dataset.

In [None]:
import pandas as pd
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from unsloth.chat_templates import get_chat_template

# Load the CSV file into a pandas DataFrame
data_path = "/content/drive/MyDrive/1-Working/1-HSLU-MscIDS/5-Projects/4-Semester/CLT/Samuel-CLT-Development/patent_qa_pairs_clean.csv"
df = pd.read_csv(data_path)

# Convert the DataFrame to a Hugging Face Dataset
dataset = Dataset.from_pandas(df)


Verify the Formatted Dataset

In [None]:
dataset

Dataset({
    features: ['answer', 'question'],
    num_rows: 3917
})

#Format the Dataset

Format the dataset to the desired prompt structure.



In [None]:
import pandas as pd
from datasets import Dataset


# Function to format the dataset
def formatting_prompts_func(examples):
    texts = [f"### Question: {q}\n ### Answer: {a}" for q, a in zip(examples["question"], examples["answer"])]
    return {"text": texts}

# Apply the formatting function to the dataset
dataset = dataset.map(formatting_prompts_func, batched=True)


Map:   0%|          | 0/3917 [00:00<?, ? examples/s]

In [None]:
dataset[0]

{'answer': 'Disclosed is an adaptable DC-AC inverter system and its operation. System is suited for solar energy harvesting in grid-connected or off-grid modes of operation.',
 'question': 'What are the advantages of the proposed adaptive-controlled AC inverter system for solar energy harvesting?',
 'text': '### Question: What are the advantages of the proposed adaptive-controlled AC inverter system for solar energy harvesting?\n ### Answer: Disclosed is an adaptable DC-AC inverter system and its operation. System is suited for solar energy harvesting in grid-connected or off-grid modes of operation.'}

# Apply Chat Template to Tokenizer

Apply the chat template to the tokenizer.

In [None]:
# Function to apply chat template to the dataset
def apply_chat_template_func(examples):
    convos = [
        {"messages": [
            {"from": "human", "value": q},
            {"from": "gpt", "value": a}
        ]}
        for q, a in zip(examples["question"], examples["answer"])
    ]

    try:
        # Apply the chat template directly to the messages
        processed_convos = [
            tokenizer.apply_chat_template(convo, tokenize=True, add_special_tokens=False)
            for convo in convos
        ]
        # Extract the text from the processed conversations
        texts = ["".join(tokenizer.decode(tokens) for tokens in convo) for convo in processed_convos]
    except Exception as e:
        print(f"Error in applying chat template: {e}")
        raise e

    return {"text": texts}

# Load the tokenizer and apply the chat template
from transformers import AutoTokenizer
from unsloth.chat_templates import get_chat_template

# Define model name and load tokenizer
model_name = "microsoft/Phi-3-mini-4k-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Define the template
unsloth_template = """
{{ bos_token }}
You are a helpful assistant to the user.
{% for message in messages %}
>>> {{ message['from'] }}: {{ message['value'] }}
{% endfor %}
{% if add_generation_prompt %}
>>> Assistant:
{% endif %}
"""

# Apply the chat template
tokenizer = get_chat_template(
    tokenizer,
    chat_template=(unsloth_template, "eos_token"),  # Provide template and EOS token
    mapping={"role": "from", "content": "value", "user": "human", "assistant": "gpt"},  # ShareGPT style
    map_eos_token=True,  # Maps to </s> instead
)

# Example conversation to verify the chat template
example_convo = {
    "messages": [
        {"from": "human", "value": "What are the advantages of the proposed adaptive-controlled AC inverter system for solar energy harvesting?"},
        {"from": "gpt", "value": "Disclosed is an adaptable DC-AC inverter system and its operation. System is suited for solar energy harvesting in grid-connected or off-grid modes of operation."}
    ]
}

# Verify the template with a single example
try:
    print("Applying chat template to a single example...")
    print(f"Example conversation structure: {example_convo}")

    # Apply chat template
    result = tokenizer.apply_chat_template(example_convo['messages'], tokenize=False, add_generation_prompt=False)

    # Print intermediate results
    print("Intermediate Result:")
    for message in example_convo["messages"]:
        print(f"{message['from']}: {message['value']}")

    print(f"Final Result: {result}")
except Exception as e:
    print(f"Error in applying chat template: {e}")
    print(f"Template: {tokenizer.chat_template}")
    raise e

# Apply the chat template function to the dataset
dataset = dataset.map(apply_chat_template_func, batched=True)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Applying chat template to a single example...
Example conversation structure: {'messages': [{'from': 'human', 'value': 'What are the advantages of the proposed adaptive-controlled AC inverter system for solar energy harvesting?'}, {'from': 'gpt', 'value': 'Disclosed is an adaptable DC-AC inverter system and its operation. System is suited for solar energy harvesting in grid-connected or off-grid modes of operation.'}]}
Intermediate Result:
human: What are the advantages of the proposed adaptive-controlled AC inverter system for solar energy harvesting?
gpt: Disclosed is an adaptable DC-AC inverter system and its operation. System is suited for solar energy harvesting in grid-connected or off-grid modes of operation.
Final Result: 
<s>
You are a helpful assistant to the user.
>>> human: What are the advantages of the proposed adaptive-controlled AC inverter system for solar energy harvesting?
>>> gpt: Disclosed is an adaptable DC-AC inverter system and its operation. System is suited fo

Map:   0%|          | 0/3917 [00:00<?, ? examples/s]

# Apply Chat Template to Dataset

Function to apply the chat template to the dataset and process it.

In [None]:
# Function to apply chat template to the dataset
def apply_chat_template_func(examples):
    convos = [
        {"messages": [
            {"from": "human", "value": q},
            {"from": "gpt", "value": a}
        ]}
        for q, a in zip(examples["question"], examples["answer"])
    ]

    try:
        # Apply the chat template directly to the messages
        processed_convos = [
            tokenizer.apply_chat_template(convo['messages'], tokenize=True, add_special_tokens=False)
            for convo in convos
        ]

        # Debugging: Print the structure of processed_convos
        print("Processed Conversations (First Example):", processed_convos[0])

        # Extract the text from the processed conversations
        texts = [tokenizer.decode(convo) for convo in processed_convos]
    except Exception as e:
        print(f"Error in applying chat template: {e}")
        raise e

    return {"text": texts}

# Apply the chat template function to the dataset
dataset = dataset.map(apply_chat_template_func, batched=True)

# Check the first few rows to ensure it processed correctly
print(dataset["text"][:5])


Map:   0%|          | 0/3917 [00:00<?, ? examples/s]

Processed Conversations (First Example): [29871, 13, 1, 29871, 13, 3492, 526, 263, 8444, 20255, 304, 278, 1404, 29889, 13, 6778, 29958, 5199, 29901, 1724, 526, 278, 25486, 310, 278, 7972, 7744, 573, 29899, 6451, 839, 14614, 297, 13549, 1788, 363, 21635, 5864, 4023, 10147, 292, 29973, 13, 6778, 29958, 330, 415, 29901, 3295, 15603, 338, 385, 7744, 519, 13681, 29899, 2477, 297, 13549, 1788, 322, 967, 5858, 29889, 2184, 338, 480, 1573, 363, 21635, 5864, 4023, 10147, 292, 297, 6856, 29899, 18045, 470, 1283, 29899, 7720, 18893, 310, 5858, 29889, 13]
Processed Conversations (First Example): [29871, 13, 1, 29871, 13, 3492, 526, 263, 8444, 20255, 304, 278, 1404, 29889, 13, 6778, 29958, 5199, 29901, 1128, 947, 278, 19843, 310, 278, 3699, 29892, 411, 967, 6641, 29899, 29888, 9390, 16246, 24628, 322, 11520, 17526, 29879, 29892, 29126, 304, 967, 5864, 19201, 322, 12463, 2874, 297, 278, 3030, 310, 278, 6136, 322, 1209, 573, 23387, 14294, 10090, 313, 10764, 29956, 29956, 29956, 29897, 1904, 29973, 13

In [None]:
dataset

Dataset({
    features: ['answer', 'question', 'text'],
    num_rows: 3917
})

In [None]:
dataset[0]

{'answer': 'Disclosed is an adaptable DC-AC inverter system and its operation. System is suited for solar energy harvesting in grid-connected or off-grid modes of operation.',
 'question': 'What are the advantages of the proposed adaptive-controlled AC inverter system for solar energy harvesting?',
 'text': '\n<s> \nYou are a helpful assistant to the user.\n>>> human: What are the advantages of the proposed adaptive-controlled AC inverter system for solar energy harvesting?\n>>> gpt: Disclosed is an adaptable DC-AC inverter system and its operation. System is suited for solar energy harvesting in grid-connected or off-grid modes of operation.\n'}

In [None]:
print(dataset[0]["text"])


<s> 
You are a helpful assistant to the user.
>>> human: What are the advantages of the proposed adaptive-controlled AC inverter system for solar energy harvesting?
>>> gpt: Disclosed is an adaptable DC-AC inverter system and its operation. System is suited for solar energy harvesting in grid-connected or off-grid modes of operation.



# Verify the Processed Dataset

Check the first few rows to ensure it processed correctly.

In [None]:
# Check the first few rows to ensure it processed correctly
for i in range(5):
    print(f"Processed Example {i}:")
    print(dataset["text"][i])
    print("\n")

# Save the processed dataset to a CSV file
processed_dataset_path = "/content/drive/MyDrive/1-Working/1-HSLU-MscIDS/5-Projects/4-Semester/CLT/Samuel-CLT-Development/patent_qa_pairs_processed.csv"
dataset.to_csv(processed_dataset_path, index=False)

print(f"Processed dataset saved to {processed_dataset_path}")


Processed Example 0:

<s> 
You are a helpful assistant to the user.
>>> human: What are the advantages of the proposed adaptive-controlled AC inverter system for solar energy harvesting?
>>> gpt: Disclosed is an adaptable DC-AC inverter system and its operation. System is suited for solar energy harvesting in grid-connected or off-grid modes of operation.



Processed Example 1:

<s> 
You are a helpful assistant to the user.
>>> human: What components are included in a typical solar energy system, and how are they configured to work together?
>>> gpt: A solar energy system comprises: a solar energy structure comprising photovoltaic solar panels contiguously covering an area; a first inverter configured to receive power from a first string of solar panels; and a second inverter.



Processed Example 2:

<s> 
You are a helpful assistant to the user.
>>> human: What control method is proposed for optimizing the solar-to-power efficiency of a solar-aided coal-fired power system under off-d

Creating CSV from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Processed dataset saved to /content/drive/MyDrive/1-Working/1-HSLU-MscIDS/5-Projects/4-Semester/CLT/Samuel-CLT-Development/patent_qa_pairs_processed.csv
