In [None]:
# installing necessary libraries
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7

In [None]:
# importing libraries

import pandas as pd
from tqdm import tqdm

import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

In [None]:
# Attach Google Drive to the Colab filesystem; the saved model and the CSV
# files used by the later cells are read from (and written to) paths under it.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

In [None]:
# Loader class for a saved PEFT model (AutoTokenizer and torch come from the imports cell)
from peft import AutoPeftModelForCausalLM

# Drive folder holding the saved PEFT model; the tokenizer is read from the same folder
peft_model_dir = "/content/drive/MyDrive/llama-2-7b-custom"

# Options forwarded to from_pretrained
model_load_kwargs = {
    "low_cpu_mem_usage": True,      # keep CPU memory usage low while loading
    "torch_dtype": torch.float16,   # float16 torch data type
    "load_in_4bit": True,           # load model parameters in 4-bit format
}

# Model used for generation in the cells below
trained_model = AutoPeftModelForCausalLM.from_pretrained(peft_model_dir, **model_load_kwargs)

# Matching tokenizer
tokenizer = AutoTokenizer.from_pretrained(peft_model_dir)

In [None]:
# Read the table of prompts (the generation cell uses its 'prompt' column)
# and preview the first rows.
time_csv_path = "/content/drive/MyDrive/final_task2_time.csv"
time_df = pd.read_csv(time_csv_path)
time_df.head(5)

In [None]:
# Only show transformers log messages at CRITICAL level while generating.
logging.set_verbosity(logging.CRITICAL)

# Text-generation pipeline wrapping the loaded model and tokenizer.
# NOTE(review): max_length limits prompt tokens + generated tokens together, so a
# prompt close to (or over) 250 tokens leaves little or no room for new text.
# Consider max_new_tokens instead -- confirm against the prompt lengths in time_df.
pipe = pipeline(task="text-generation", model=trained_model, tokenizer=tokenizer, max_length=250)

# The pipeline output is split on this marker and the piece right after its
# first occurrence is kept as the generated answer.
RESPONSE_MARKER = '[/INST]'
# Stored for rows whose generation failed, so generated_text keeps one entry per row.
FAILED_GENERATION_PLACEHOLDER = "ABCD1234@#"

# One entry per row of time_df, in row order
generated_text = []

# Walk the prompts directly instead of indexing the frame by position
for i, prompt in enumerate(tqdm(time_df['prompt'])):
    try:
        # Generate a single sequence for this prompt
        result = pipe(prompt, num_return_sequences=1)
        # Raises IndexError when the marker is absent; handled below like any other failure
        generated_text.append(result[0]['generated_text'].split(RESPONSE_MARKER)[1])
    except Exception as e:
        # Best effort: report the error and the row position, then keep going
        print(e)
        print(i)
        generated_text.append(FAILED_GENERATION_PLACEHOLDER)


In [None]:
# Build the output table in one step: one generated text per input row plus a
# 1-based id, then write it to Drive without the DataFrame index.
assert len(generated_text) == len(time_df), (
    "generated_text is out of sync with time_df; re-run the generation cell"
)
otpt = pd.DataFrame(
    {
        "generated_tweet": generated_text,
        "id": range(1, len(time_df) + 1),
    }
)
otpt.to_csv("/content/drive/MyDrive/task2-time-part1.csv", index = False)