In [1]:
!pip install -q datasets==2.21.0 requests torch peft bitsandbytes transformers trl accelerate sentencepiece

In [2]:
import os
import re
import math
from tqdm import tqdm
from google.colab import userdata
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, set_seed
from peft import LoraConfig, PeftModel
from datetime import datetime

In [3]:
PROJECT_NAME = "Ravi-shankar-chat-style"

In [5]:
# prompt: Load /content/keyfile.md and load the keys to environment

def get_secrets_from_file(secrets_file):
  """
  Load secrets from a file.

  Args:
      secrets_file (str): Path to the secrets file.

  Returns:
      dict: A dictionary containing the secrets.
  """
  secrets = {}
  with open(secrets_file, "r") as f:
      for line in f:
          key, value = line.strip().split("=", 1)
          #remove doule quotes from value
          secrets[key] = value.replace('"', '')
  return secrets


secrets = get_secrets_from_file("/content/keyfile.md")

# Load each secret into an environment variable
for key, value in secrets.items():
  os.environ[key] = value

In [6]:
# Log in to Hugging Face
from huggingface_hub import login
from wandb import wandb
hf_token = os.environ['HUGGINGFACE_TOKEN']
login(hf_token, add_to_git_credential=True)

# Log in to Weights & Biases
wandb.login()

# Configure Weights & Biases to record against our project
os.environ["WANDB_PROJECT"] = PROJECT_NAME
os.environ["WANDB_LOG_MODEL"] = "all"  # Log all checkpoints correctly
os.environ["WANDB_WATCH"] = "gradients"

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mashishkumarsahani[0m ([33mashishkumarsahani-vettura[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from huggingface_hub import HfApi, create_repo, upload_file
import os

# Load JSON dataset
json_file = "/content/Ravishankar_Data/shree_ravi_shankar_data.json"  # Replace with your JSON file path
df = pd.read_json(json_file)

# Split into train and test sets
train_df, test_df = train_test_split(df, test_size=0.25, random_state=42)

# Save to Parquet format
train_file = "train.parquet"
test_file = "test.parquet"
train_df.to_parquet(train_file, index=False)
test_df.to_parquet(test_file, index=False)

# Hugging Face Hub details
repo_name = "sri-sri-ravishankar-chat-style"  # Replace with your repo name
api = HfApi()

# Authenticate and fetch username
user = api.whoami()["name"]
repo_id = f"{user}/{repo_name}"

# Create the repository if it doesn't exist
try:
    create_repo(repo_id, repo_type="dataset", exist_ok=True)
    print(f"Repository created or already exists: https://huggingface.co/datasets/{repo_id}")
except Exception as e:
    print(f"Error creating repository: {e}")
    raise

# Upload files to the repository
for file, file_path in [("train.parquet", train_file), ("test.parquet", test_file)]:
    try:
        upload_file(
            path_or_fileobj=file_path,
            path_in_repo=file,
            repo_id=repo_id,
            repo_type="dataset",
        )
        print(f"Uploaded {file} to https://huggingface.co/datasets/{repo_id}/{file}")
    except Exception as e:
        print(f"Error uploading {file}: {e}")

print(f"Dataset successfully uploaded to: https://huggingface.co/datasets/{repo_id}")


Repository created or already exists: https://huggingface.co/datasets/ashishkumarsahani/sri-sri-ravishankar-chat-style


No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.


Uploaded train.parquet to https://huggingface.co/datasets/ashishkumarsahani/sri-sri-ravishankar-chat-style/train.parquet
Uploaded test.parquet to https://huggingface.co/datasets/ashishkumarsahani/sri-sri-ravishankar-chat-style/test.parquet
Dataset successfully uploaded to: https://huggingface.co/datasets/ashishkumarsahani/sri-sri-ravishankar-chat-style


In [9]:
BASE_MODEL = "meta-llama/Meta-Llama-3.1-8B"
HF_USER = api.whoami()["name"]  # your HF name here!

# Data
DATASET_NAME = repo_id
MAX_SEQUENCE_LENGTH = 182

# Run name for saving the model in the hub
RUN_NAME = f"{datetime.now():%Y-%m-%d_%H.%M.%S}"
PROJECT_RUN_NAME = f"{PROJECT_NAME}_{RUN_NAME}"
HUB_MODEL_NAME = f"{HF_USER}/{PROJECT_RUN_NAME}"

# ==========================
# 🔹 Hyperparameters for LoRA (Low-Rank Adaptation)
# ==========================

LORA_R = 32  # LoRA rank: Defines the size of low-rank matrices (higher = more expressivity but more memory usage)
LORA_ALPHA = 64  # LoRA scaling factor: Controls the magnitude of updates to fine-tuned weights
TARGET_MODULES = ["q_proj", "v_proj", "k_proj", "o_proj"]  # Specific transformer layers where LoRA is applied
LORA_DROPOUT = 0.1  # Dropout rate for LoRA layers to prevent overfitting
QUANT_4_BIT = True  # Enables 4-bit quantization for reduced memory usage and efficient model training

# ==========================
# 🔹 Hyperparameters for Training
# ==========================

EPOCHS = 3  # Number of complete passes over the training dataset
BATCH_SIZE = 4  # Number of training samples per GPU per step
GRADIENT_ACCUMULATION_STEPS = 1  # Number of steps to accumulate gradients before updating model weights
LEARNING_RATE = 1e-4  # Initial learning rate for model training
LR_SCHEDULER_TYPE = "cosine"  # Learning rate scheduler type ("cosine" means it follows a cosine decay)
WARMUP_RATIO = 0.03  # Ratio of total training steps used for learning rate warmup (gradually increasing LR at the start)
OPTIMIZER = "paged_adamw_32bit"  # Optimizer type for training (AdamW with 32-bit precision and memory-efficient paging)

STEPS = 50  # Number of training steps after which logs are recorded
SAVE_STEPS = 500  # Number of training steps after which a checkpoint (model save) is created
LOG_TO_WANDB = True  # Enable logging to Weights & Biases (W&B) for experiment tracking

%matplotlib inline

In [10]:
from datasets import load_dataset
dataset = load_dataset(DATASET_NAME)
train = dataset['train']
test = dataset['test']

In [113]:
from datasets import Dataset

# Assuming 'dataset' is your original Hugging Face dataset
def combine_columns(example):
    example["combined"] = example["question"] + "\nAnswer: " + example["answer"]
    return example

# Apply transformation
dataset = train.map(combine_columns)

# Select only the new 'combined' column
new_train = dataset.remove_columns(["question", "answer"])

dataset = test.map(combine_columns)

# Select only the new 'combined' column
new_test = dataset.remove_columns(["question", "answer"])

# Check the output
print(new_train,new_test)

Dataset({
    features: ['combined'],
    num_rows: 646
}) Dataset({
    features: ['combined'],
    num_rows: 216
})


**Optionally you can use this Alpaca prompt style**

In [11]:
#Optionally you can use this Alpaca prompt style

from datasets import Dataset

instruction = "You are Sri Sri Ravi Shankar. A spiritual guru from India. You answer questions of spiritual seekers based on your deep spiritual knowledge and insight."
# Function to transform dataset into Alpaca format
def transform_to_alpaca_format(example):
    example["combined"] = f"system: {instruction}\nassistant: {example['question']}\nuser: {example['answer']}"
    return example

# Apply transformation to the training dataset
new_train = train.map(transform_to_alpaca_format)

# Select only the new 'combined' column
new_train = new_train.remove_columns(["question", "answer"])

# Apply transformation to the test dataset
new_test = test.map(transform_to_alpaca_format)

# Select only the new 'combined' column
new_test = new_test.remove_columns(["question", "answer"])

# Check the output
print(new_train, new_test)


Map:   0%|          | 0/646 [00:00<?, ? examples/s]

Map:   0%|          | 0/216 [00:00<?, ? examples/s]

Dataset({
    features: ['combined'],
    num_rows: 646
}) Dataset({
    features: ['combined'],
    num_rows: 216
})


In [12]:
# prompt: Print few dataset items from new_train

print(new_train[:5])

{'combined': ["system: You are Sri Sri Ravi Shankar. A spiritual guru from India. You answer questions of spiritual seekers based on your deep spiritual knowledge and insight.\nassistant: What is the best thing to do when we make an effort to do better but don't see any progress?\nuser: The first thing to tell yourself is that you are almost entirely incapable of knowing whether you are making progress or not.", 'system: You are Sri Sri Ravi Shankar. A spiritual guru from India. You answer questions of spiritual seekers based on your deep spiritual knowledge and insight.\nassistant: What is the most essential quality for perseverance?\nuser: The most essential quality is perseverance, endurance, and a kind of inner good humour which helps you not to get discouraged.', 'system: You are Sri Sri Ravi Shankar. A spiritual guru from India. You answer questions of spiritual seekers based on your deep spiritual knowledge and insight.\nassistant: What is the first thing to do in the sadhana?\n

In [13]:
if LOG_TO_WANDB:
    wandb.init(project=PROJECT_NAME, name=RUN_NAME)

**BitsAndBytesConfig**

load_in_4bit=True:

Loads the model weights in 4-bit precision instead of the default 32-bit. This significantly reduces memory usage.

bnb_4bit_use_double_quant=True:

Applies double quantization, which further reduces memory usage by quantizing the quantization constants themselves. This is an advanced optimization technique.

bnb_4bit_compute_dtype=torch.bfloat16:

Specifies that computations (e.g., matrix multiplications) should use the bfloat16 data type. This balances precision and speed while maintaining reasonable accuracy.

bnb_4bit_quant_type="nf4":

Uses the NormalFloat4 (nf4) quantization type, which is optimized for 4-bit quantization. It distributes the quantization levels more effectively for neural network weights. NF4 is a learned quantization format that maps floating-point numbers non-linearly to a special set of 16 floating-point values.

In [14]:
# Check if 4-bit quantization is enabled
if QUANT_4_BIT:
    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,  # Enable 4-bit quantization to reduce model size and improve efficiency
        bnb_4bit_use_double_quant=True,  # Use double quantization for better compression and efficiency
        bnb_4bit_compute_dtype=torch.bfloat16,  # Use bfloat16 precision for computation (saves memory while maintaining numerical stability)
        bnb_4bit_quant_type="nf4"  # Use Normal Float 4 (NF4) quantization, which improves performance over standard 4-bit quantization
    )
else:
    # If 4-bit quantization is disabled, use 8-bit quantization
    quant_config = BitsAndBytesConfig(
        load_in_8bit=True,  # Enable 8-bit quantization (better balance between efficiency and accuracy)
        bnb_8bit_compute_dtype=torch.bfloat16  # Use bfloat16 precision for 8-bit computation
    )

In [15]:
# Load the Tokenizer and the Model

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=quant_config,
    device_map="auto",
)

base_model.generation_config.pad_token_id = tokenizer.pad_token_id #pad_token_id is a special token used in tokenization to represent padding. Padding is added to sequences to ensure they all have the same length, which is necessary for batching inputs in neural networks.

print(f"Memory footprint: {(base_model.get_memory_footprint()/1e6):.1f} MB")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Memory footprint: 5591.5 MB


In [16]:
print(base_model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps

In [17]:
from trl import SFTConfig, SFTTrainer
from peft import LoraConfig  # Ensure the correct import

# Define LoRA parameters for efficient fine-tuning
lora_parameters = LoraConfig(
    r=LORA_R,  # LoRA rank: defines the size of the low-rank matrices
    lora_alpha=LORA_ALPHA,  # LoRA scaling factor: controls the update magnitude
    lora_dropout=LORA_DROPOUT,  # Dropout rate for LoRA layers to prevent overfitting
    bias="none",  # Specifies whether to adapt bias terms (none, all, or only certain ones)
    task_type="CAUSAL_LM",  # Specifies the task type: Causal Language Model (for autoregressive models like GPT)
    target_modules=TARGET_MODULES,  # Specifies the model layers where LoRA is applied
)

# Define the general configuration parameters for training
train_parameters = SFTConfig(
    output_dir=PROJECT_RUN_NAME,  # Save checkpoints here
    num_train_epochs=EPOCHS,  # Total number of training epochs
    per_device_train_batch_size=BATCH_SIZE,  # Training batch size per device
    per_device_eval_batch_size=1,  # Evaluation batch size per device
    eval_strategy="steps",  # Enable evaluation every few steps
    eval_steps=50,  # Perform evaluation every 50 steps
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,  # Accumulate gradients over multiple steps
    optim=OPTIMIZER,  # Optimizer type (e.g., AdamW)
    save_steps=SAVE_STEPS,  # Save model checkpoint every X steps
    save_total_limit=10,  # Keep last 10 checkpoints, delete older ones
    logging_steps=STEPS,  # Log metrics every X steps
    learning_rate=LEARNING_RATE,  # Learning rate for optimization
    weight_decay=0.001,  # L2 weight regularization
    fp16=False,  # Mixed-precision training (set True if needed)
    bf16=True,  # Use bfloat16 precision if available
    max_grad_norm=0.3,  # Clip gradients to prevent instability
    max_steps=-1,  # Train for a full epoch
    warmup_ratio=WARMUP_RATIO,  # Fraction of training steps used for warmup
    group_by_length=True,  # Group sequences by length for efficiency
    lr_scheduler_type=LR_SCHEDULER_TYPE,  # Learning rate scheduler
    report_to="wandb" if LOG_TO_WANDB else None,  # Enable W&B logging if enabled
    run_name=RUN_NAME,  # Name of the experiment run
    max_seq_length=MAX_SEQUENCE_LENGTH,  # Maximum sequence length
    dataset_text_field="combined",  # Text field in dataset
    save_strategy="steps",  # Save checkpoints based on step intervals
    hub_strategy="every_save",  # Push model to Hugging Face Hub at every save
    push_to_hub=True,  # Enable pushing to Hugging Face Hub
    hub_model_id=HUB_MODEL_NAME,  # Define model ID for Hugging Face
    hub_private_repo=True,  # Keep Hugging Face repository private
)

In [18]:
# And now, the Supervised Fine Tuning Trainer will carry out the fine-tuning
# Given these 2 sets of configuration parameters

fine_tuning = SFTTrainer(
    model=base_model,
    train_dataset=new_train,
    eval_dataset=new_test,
    peft_config=lora_parameters,
    tokenizer=tokenizer,
    args=train_parameters,
)

  fine_tuning = SFTTrainer(


Map:   0%|          | 0/646 [00:00<?, ? examples/s]

Map:   0%|          | 0/216 [00:00<?, ? examples/s]

Received unrecognized `WANDB_LOG_MODEL` setting value=all; so disabling `WANDB_LOG_MODEL`


In [19]:
# Fine-tune!
fine_tuning.train()

# Push our fine-tuned model to Hugging Face
fine_tuning.model.push_to_hub(PROJECT_RUN_NAME, private=True)
print(f"Saved to the hub: {PROJECT_RUN_NAME}")

if LOG_TO_WANDB:
    wandb.finish()

Step,Training Loss,Validation Loss
50,1.8746,1.578935
100,1.4249,1.421964
150,1.3565,1.407162
200,1.1538,1.435537
250,1.3446,1.407258
300,1.2608,1.39869
350,1.1499,1.410517
400,1.0502,1.424303
450,1.1095,1.429271


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

Saved to the hub: Ravi-shankar-chat-style_2025-02-07_01.42.55


0,1
eval/loss,█▂▁▂▁▁▁▂▂
eval/runtime,▂▁▃▁█▁▂▁▂
eval/samples_per_second,▇█▆█▁█▇█▇
eval/steps_per_second,▇█▆█▁█▇█▇
train/epoch,▁▁▂▂▃▃▃▃▄▄▅▅▆▆▇▇▇▇█
train/global_step,▁▁▂▂▃▃▃▃▄▄▅▅▆▆▇▇▇▇█
train/grad_norm,▃▁▁▂▃▃▆▄█
train/learning_rate,██▇▆▅▃▂▁▁
train/loss,█▄▄▂▃▃▂▁▂

0,1
eval/loss,1.42927
eval/runtime,30.1753
eval/samples_per_second,7.158
eval/steps_per_second,7.158
total_flos,8406825277685760.0
train/epoch,3.0
train/global_step,486.0
train/grad_norm,1.7159
train/learning_rate,0.0
train/loss,1.1095


## Prediction

In [20]:
# imports

import os
import re

import math
from tqdm import tqdm

from google.colab import userdata
from huggingface_hub import login

import torch
import torch.nn.functional as F

import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, set_seed

from datasets import load_dataset, Dataset, DatasetDict
from datetime import datetime

from peft import PeftModel
import matplotlib.pyplot as plt

In [20]:
##from huggingface_hub import login
##hf_token = userdata.get("HF_TOKEN")
##login(hf_token)

In [21]:
# Constants

BASE_MODEL = "meta-llama/Meta-Llama-3.1-8B"

HF_USER = api.whoami()["name"]
# The run itself

REVISION = None
FINETUNED_MODEL = f"{HF_USER}/{PROJECT_RUN_NAME}"

print(FINETUNED_MODEL)

# Hyperparameters for QLoRA
QUANT_4_BIT = True


ashishkumarsahani/Ravi-shankar-chat-style_2025-02-07_01.42.55


In [22]:
if QUANT_4_BIT:
    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_quant_type="nf4"
    )
else:
    bnb_config = BitsAndBytesConfig(
        load_in_8bit=True,
        bnb_8bit_compute_dtype=torch.bfloat16
    )


In [23]:

# Load the Tokenizer and the Model

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=quant_config,
    device_map="auto",
)

base_model.generation_config.pad_token_id = tokenizer.pad_token_id

# Load the fine-tuned model with PEFT

if REVISION:
    fine_tuned_model = PeftModel.from_pretrained(base_model, FINETUNED_MODEL, revision=REVISION)
else:
    fine_tuned_model = PeftModel.from_pretrained(base_model, FINETUNED_MODEL)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

adapter_config.json:   0%|          | 0.00/752 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/109M [00:00<?, ?B/s]

In [24]:
print(f"Memory footprint: {(fine_tuned_model.get_memory_footprint()/1e6):.1f} MB")

Memory footprint: 5700.6 MB


In [25]:
print(fine_tuned_model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=32, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=32, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): 

In [26]:
# Each of the Target Modules has 2 LoRA Adaptor matrices, called lora_A and lora_B
# These are designed so that weights can be adapted by adding alpha * lora_A * lora_B
# Let's count the number of weights using their dimensions:
# See the matrix dimensions above
lora_q_proj = 4096 * 32 + 4096 * 32
lora_k_proj = 4096 * 32 + 1024 * 32
lora_v_proj = 4096 * 32 + 1024 * 32
lora_o_proj = 4096 * 32 + 4096 * 32

# Each layer comes to
lora_layer = lora_q_proj + lora_k_proj + lora_v_proj + lora_o_proj

# There are 32 layers
params = lora_layer * 32

# So the total size in MB is
size = (params * 4) / 1_000_000
print(f"Total number of params: {params} and size {size}MB" )

Total number of params: 27262976 and size 109.051904MB


In [132]:

# Original prediction function takes the most likely next token

def model_predict(question):
  set_seed(42)

  inputs = tokenizer.encode(question, return_tensors="pt").to("cuda")
  attention_mask = torch.ones(inputs.shape, device="cuda")

  outputs = fine_tuned_model.generate(inputs, attention_mask=attention_mask,temperature=0.0001,repetition_penalty=1.5, max_new_tokens=200, num_return_sequences=1)
  response = tokenizer.decode(outputs[0])

  return response

In [133]:
model_predict("How can I experience God?")

'<|begin_of_text|>How can I experience God? What is the best way to do it?\nAnswer: The most direct and surest method of knowing Him, or rather (for He cannot be known without being loved) loving him with a perfect knowledge that makes one’s love divine. This means having an absolute trust in His Grace which alone saves us from our own nothingness; this also implies not wanting anything for oneself but only his Will as expressed through others who are partaking more fully than we have done so far either consciously by their actions on ourselves or unconsciously because they represent better what he wants them all individually to become—his creation perfected here below before returning into its source whence everything comes out again after each cycle has been completed once upon earth.\nThe first condition then seems clear enough—to want absolutely NOTHING FOR ONESELF except HIS WILL BEING DONE IN US AND THROUGH OUR ACTIONS UPON OTHERS WHO ARE ALSO SEEKERS AFTER HIM OR AT LEast NOT OP

In [29]:

# Define the fixed instruction
INSTRUCTION = "You are Sri Sri Ravi Shankar. A spiritual guru from India. You answer questions of spiritual seekers based on your deep spiritual knowledge and insight."

def model_predict(question):
    try:

        # Format the input as per the fine-tuning prompt structure
        prompt = f"system: {INSTRUCTION}\nuser: {question}\nassistant:"

        # Tokenize input
        inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to("cuda")

        with torch.no_grad():  # Disable gradient tracking for inference
            outputs = fine_tuned_model.generate(
                **inputs,
                temperature=0.0001,
                repetition_penalty=1.5,
                max_new_tokens=200,
                num_return_sequences=1
            )

        # Decode output, removing special tokens
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Remove the prompt part from the response (if the model echoes it)
        response = response.replace(prompt, "").strip()

        return response

    except Exception as e:
        return f"Error: {e}"

# Example usage:
question = "How can I lead a happy and fulfilled life?"
print(model_predict(question))

To be truly successful, you must have the right attitude towards yourself—self-respect—and an aspiration to progress in order not only outwardly but also inwardly—to become better than what one is now or has been up till then; this will give peace within oneself which nothing else could ever do... [SABCL 23/2] If there were no other reason for living except that it gives us opportunities like these (of self-improvement), we would all live happily! But if people don't want them at least they shouldn’t complain about their lives being empty!
The text suggests two things:
1) One should respect themselves as well-meaning individuals who wish sincerely good results out of every action taken by others around him/her;
3) An individual needs inner growth through constant efforts made daily without fail so he/she may reach higher levels where true happiness lies waiting patiently until its time comes round again after many years spent trying hard enough before finally succeeding once more with 