## Notebook for cleaning the dataset with Llama 2

The following columns will be cleaned:
1. Maximum Height
2. Flower Colour
3. Trunk Texture
4. Trunk Colour
5. Leaf Texture

In [1]:
# Basic imports and setting up of environment variables
import os
import torch
import json
import pandas as pd
import re
import ast
import gc
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig

# Hugging Face access token; read later in this notebook via os.getenv('HG_ACCESS_TOKEN').
# NOTE: this assignment replaces any HG_ACCESS_TOKEN value already present in the process environment.
os.environ['HG_ACCESS_TOKEN'] = '' # To be filled

### Set up
Download Llama 2 to the local file system. Skip this cell if you already have a HG_ACCESS_TOKEN and do not want to store the model locally.

In [None]:
def setup_llama(model_directory:str='../src/llama/model', tokenizer_directory:str='../src/llama/tokenizer'):
    """
    Download the Llama 2 7B chat model and tokenizer and save them locally.

    The Hugging Face access token is read from the HG_ACCESS_TOKEN environment
    variable. Missing target directories are created before the download starts.

    Args:
        model_directory (str): directory the model is saved to, defaults to '../src/llama/model'
        tokenizer_directory (str): directory the tokenizer is saved to, defaults to '../src/llama/tokenizer'

    Returns:
        0 for success and 1 for failure (no token, or the download/save raised)
    """
    hg_access_token = os.getenv('HG_ACCESS_TOKEN')
    # os.getenv returns None when the variable is unset, so test truthiness
    # instead of len(): this covers both the unset and the empty-string case.
    if not hg_access_token:
        print("No valid access key found, did you update your .env file?")
        return 1

    for directory, label in ((model_directory, "Model"), (tokenizer_directory, "tokenizer")):
        if not os.path.isdir(directory):
            os.makedirs(directory, exist_ok=True)
            print(f"Llama {label} directory not found, directory created")

    print("Preparing to download LLama")

    try:
        llm_model = AutoModelForCausalLM.from_pretrained('meta-llama/Llama-2-7b-chat-hf', token=hg_access_token)
        llm_model.save_pretrained(model_directory)
        print("LLama Model downloaded")

        llm_tokenizer = AutoTokenizer.from_pretrained('meta-llama/Llama-2-7b-chat-hf', token=hg_access_token)
        llm_tokenizer.save_pretrained(tokenizer_directory)
        print("LLama Tokenizer downloaded")
        return 0

    except Exception as e:
        # Best-effort notebook helper: report the problem and signal failure
        # through the return code rather than propagating the exception.
        print(f"Failed to download Llama, error: {e}")
        return 1
    
# Run the download with the default model/tokenizer directories.
# The 0/1 status code returned by setup_llama is not stored here.
setup_llama()

### Llama Set Up

In [3]:
class PromptTemplate:
    """
    Class to generate prompt template for Llama2-7b

    Keeps the system prompt plus the alternating user messages / model replies
    of one conversation and renders them in the Llama 2 chat format:

        <s>[INST] <<SYS>>\\n{system}\\n<</SYS>>\\n\\n{user_1} [/INST] {reply_1} </s><s>[INST] {user_2} [/INST]
    """
    system_prompt = None

    def __init__(self, system_prompt=None):
        self.system_prompt = system_prompt
        # The history lives on the instance. As class-level lists these were
        # shared by every PromptTemplate, so one model's messages leaked into another's.
        self.user_messages = []
        self.model_replies = []

    def add_user_message(self, message: str, return_prompt=True):
        """Append a user message; return the full prompt unless return_prompt is False."""
        self.user_messages.append(message)
        if return_prompt:
            return self.build_prompt()

    def add_model_reply(self, reply: str, includes_history=True, return_reply=True):
        """
        Record the model's reply to the latest user message.

        Args:
            reply (str): text produced by the model
            includes_history (bool): if True, the current prompt is removed from reply before storing it
            return_reply (bool): if True, the stored reply is returned

        Raises:
            ValueError: if there is no unanswered user message to attach the reply to
        """
        # Validate before mutating so a rejected reply does not stay in the history.
        if len(self.user_messages) != len(self.model_replies) + 1:
            raise ValueError(
                "Number of user messages does not equal number of system replies."
            )
        reply_ = reply.replace(self.build_prompt(), "") if includes_history else reply
        self.model_replies.append(reply_)
        if return_reply:
            return reply_

    def get_user_messages(self, strip=True):
        """Return the user messages, whitespace-stripped copies unless strip is False."""
        return [x.strip() for x in self.user_messages] if strip else self.user_messages

    def get_model_replies(self, strip=True):
        """Return the model replies, whitespace-stripped copies unless strip is False."""
        return [x.strip() for x in self.model_replies] if strip else self.model_replies

    def clear_chat_history(self):
        """Forget all user messages and model replies; the system prompt is kept."""
        self.user_messages.clear()
        self.model_replies.clear()

    def build_prompt(self):
        """
        Render the conversation as a Llama 2 chat prompt ending with an open user turn.

        Returns:
            str: the prompt; with an empty history only the system block is returned

        Raises:
            ValueError: if the latest user message has already been answered
        """
        if not self.user_messages and not self.model_replies:
            return f"<s>[INST] <<SYS>>\n{self.system_prompt}\n<</SYS>> [/INST]</s>"

        if len(self.user_messages) != len(self.model_replies) + 1:
            raise ValueError(
                "Error: Expected len(user_messages) = len(model_replies) + 1. Add a new user message!"
            )

        turns = []
        for i, user_message in enumerate(self.user_messages):
            if i == 0 and self.system_prompt is not None:
                # The system block shares the first [INST] with the first user
                # message, so exactly one [INST] opens the turn.
                turn = f"<s>[INST] <<SYS>>\n{self.system_prompt}\n<</SYS>>\n\n{user_message} [/INST]"
            else:
                turn = f"<s>[INST] {user_message} [/INST]"
            # Every turn except the last one has a reply and is closed with </s>.
            if i < len(self.model_replies):
                turn += f" {self.model_replies[i]} </s>"
            turns.append(turn)

        return "".join(turns)

In [4]:
class LlamaModel():
    def __init__(self, mode:str, llama_model_path:str=None, llama_tokenizer_path:str=None):
        """
        Llama Model Class for data cleaning

        Args:
            mode (str): Determines what type the LLama model should be. 'QnA' selects the question answering
                system prompt; any other value selects the (leaf texture) classification system prompt.
            llama_model_path (str, optional): If locally hosting llama model, show the path of the model folder. Defaults to None.
            llama_tokenizer_path (str, optional): If locally hosting llama model, show the path of the tokenizer folder. Defaults to None.
        """
        # Setup llama models
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.llama_model, self.llama_tokenizer = self._load_llama(llama_model_path, llama_tokenizer_path)
        # NOTE(review): return_full_text looks like a text-generation pipeline option; confirm that
        # GenerationConfig honours it. The reply is cut out of the decoded text in _generate either way.
        self.config = GenerationConfig(max_new_tokens=1024,
                        do_sample=True,
                        top_k = 10,
                        num_return_sequences = 1,
                        return_full_text = False,
                        temperature = 0.01,
                        )

        self.QAsystemPrompt = """You are a question answering model, given a question and context in the format of a JSON:
        {
        "question":'',
        "context":''
        }
        , you are to return the answer with the following JSON format.
        {
        "answer": 
        }
        If the context does not answer the question, the answer is -.
        Only return the JSON with the correct answer. No other text is allowed.
        """
        # Note: Classification System Prompt is very specific to the leaf texture
        self.ClassificationsystemPrompt = """You are a classification model with 3 possible categories. Each category and it's description is given in the following JSON.
        {
        "fine": "Linear, thin shaped leaves and stems with no spikes or rough edges. Leaves are needle like shape and should not stand upwards.",
        "medium": "The most common texture in plants, medium size and shape. Often fleshy, rounded or oval, and not overly detailed. If unsure, pick this class.",
        "coarse": "Must be large, broad leaves with size larger than 15cm. Often rough or thick with prominent veins, lobes or edges that stand out visually."
        }
        You will be given a description of the plant leaf type. Categorise the description into one of the classes and return your answer in following JSON format. Always prioritise the size before the shape texture in your classification.
        {
        "answer": class
        }
        Only return the JSON with the classification. No description is accepted. No other text.
        """
        # Setup the correct system prompt based on the model requirements
        self.promptGenerator = PromptTemplate(system_prompt= self.QAsystemPrompt if mode == 'QnA' else self.ClassificationsystemPrompt)


    def _load_llama(self, llama_model_path:str, llama_tokenizer_path:str):
        """
        Function to load llama 2-7b model
        Uses the HG_ACCESS_TOKEN environment variable as a default; if it is unset or empty,
        the model and tokenizer are loaded from the given local directories instead.

        Args:
            llama_model_path (str): directory of a locally saved model, or None
            llama_tokenizer_path (str): directory of a locally saved tokenizer, or None

        Returns:
            llama_model
            llama_tokenizer

        Raises:
            Exception: if loading fails, or if neither a token nor both local paths are available
        """
        hg_access = os.getenv('HG_ACCESS_TOKEN')
        # Truthiness, not "!= None": an empty token (the notebook's placeholder value)
        # must fall through to the local directories instead of being sent to the hub.
        if hg_access:
            try:
                print("Loading Llama Models")
                llama_model = AutoModelForCausalLM.from_pretrained('meta-llama/Llama-2-7b-chat-hf', token=hg_access, torch_dtype=torch.bfloat16, device_map="auto")
                llama_tokenizer = AutoTokenizer.from_pretrained('meta-llama/Llama-2-7b-chat-hf', token=hg_access)
                print("Llama Loaded Successfully")
                return llama_model, llama_tokenizer

            except Exception as e:
                raise Exception(f"Unable to load Llama model from hugging face, reasons: {e}") from e

        elif llama_model_path is not None and llama_tokenizer_path is not None:
            try:
                print("Loading Llama Models")
                llama_model = AutoModelForCausalLM.from_pretrained(llama_model_path, torch_dtype=torch.bfloat16, device_map="auto")
                llama_tokenizer = AutoTokenizer.from_pretrained(llama_tokenizer_path)
                print("Llama Loaded Successfully")
                return llama_model, llama_tokenizer

            except Exception as e:
                raise Exception(f"Unable to load Llama model from directory, reasons: {e}") from e

        else:
            raise Exception("No Llama resources provided")


    def _generate(self, user_message:str):
        """
        Send one user message to the model and return the text after the last [/INST] marker.

        The chat history is always cleared afterwards so each call starts from the
        system prompt only, even when tokenisation or generation raises.
        """
        try:
            llama_prompt = self.promptGenerator.add_user_message(user_message)
            # The prompt already contains the <s> marker, so no special tokens are added here.
            encoded_input = self.llama_tokenizer.encode(llama_prompt, return_tensors='pt', add_special_tokens=False).to(self.device)
            results = self.llama_model.generate(encoded_input, generation_config=self.config)
            decoded_output = self.llama_tokenizer.decode(results[0], skip_special_tokens=True)
            return decoded_output.split("[/INST]")[-1].strip()
        finally:
            self.promptGenerator.clear_chat_history() #Clear history to reset back to just system prompt


    def question_answer(self, question:str, context:str):
        """
        Function to call the llama model to question answer

        Args:
            question (str): The question to be asked
            context (str): The context that contains the answer to the question

        Returns:
            response (str): Llama2-7b model's response
        """
        # The QnA system prompt expects the question and context as one JSON object
        return self._generate(
            json.dumps({
                "question": question,
                "context": context
            })
        )


    def classify(self, context:str):
        """
        Function to call the llama model to classify leaf texture

        Args:
            context (str): The context to help the model classify

        Returns:
            response (str): Llama2-7b model's response
        """
        return self._generate(context)

### Data Cleaning
Cleaning strategy for each column:
1. Maximum Height -> QA when the value is not already in the `xx cm/m to xx cm/m` format; otherwise take the upper bound of the range, convert it to metres and keep only the number
2. Flower Colour -> QA when the text mentions flowers or a spathe
3. Trunk Texture -> QA when the text mentions the trunk, bark, stem or girth
4. Trunk Colour -> QA when the text mentions the trunk, bark, stem or girth
5. Leaf Texture -> Classification

In [None]:
class DataCleaningModel():
    """
    Cleans selected columns of the flora species dataset with the help of a Llama model.

    The CSV is loaded once; missing values are replaced by the string 'None' and the
    cleaning methods overwrite cells of self.data in place.
    """
    # Heights that can be converted without the model: "30", "1.5 m", "20 cm to 1.2 m", ...
    # The named groups expose the numbers and units of the lower and upper bound.
    _HEIGHT_PATTERN = re.compile(
        r'^(?P<low>\d+(?:\.\d+)?)(?: ?(?P<low_unit>cm|m))?'
        r'(?: to (?P<high>\d+(?:\.\d+)?)(?: ?(?P<high_unit>cm|m))?)?$'
    )
    # Classes the classification prompt is allowed to answer with
    _LEAF_TEXTURE_CLASSES = ('fine', 'medium', 'coarse')
    # (word used in the question, keywords that trigger it), checked in this order
    _TRUNK_SUBJECTS = (
        ('trunk', ('trunk',)),
        ('bark', ('bark',)),
        ('stem', ('stem', 'girth')),
    )

    def __init__(self, csv_filepath:str="../src/flora_data/flora_species_updated.csv"):
        self.flora_data = pd.read_csv(csv_filepath)
        # Updates to ensure all None is a string instead (if not it will be empty in the csv)
        self.data = self.flora_data.where(pd.notnull(self.flora_data), 'None')


    @staticmethod
    def _extract_answer(response):
        """Return the 'answer' field of the model's JSON reply, or '-' when the reply is not usable JSON."""
        try:
            return json.loads(response)['answer']
        except (ValueError, KeyError, TypeError):
            # ValueError covers json.JSONDecodeError; KeyError/TypeError cover valid JSON
            # that is not an object with an 'answer' key.
            return '-'


    @classmethod
    def _height_to_metres(cls, value:str):
        """
        Convert a height in the xx cm/m to xx cm/m format to metres.

        For a range the upper bound (the part after 'to') is used. A bound without a unit
        takes the unit of the other bound; if no unit is given at all the number is kept
        as is, i.e. treated as metres already.

        Returns:
            float height in metres, or None if value is not in the expected format
        """
        match = cls._HEIGHT_PATTERN.match(value)
        if match is None:
            return None
        number = match['high'] if match['high'] is not None else match['low']
        unit = match['high_unit'] or match['low_unit'] or 'm'
        height = float(number)
        return height / 100 if unit == 'cm' else height


    @staticmethod
    def _normalise_colour(answer):
        """Turn the model's colour answer (text, list, or text containing a list) into title-cased text, '-' if unusable."""
        colours = answer
        if isinstance(answer, str):
            # Just in case model returned a list written as text, e.g. "['red', 'white']"
            try:
                colours = ast.literal_eval(answer)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                colours = answer
        if isinstance(colours, (list, tuple)):
            if not all(isinstance(colour, str) for colour in colours):
                return '-'
            return " ".join(colours).title()
        if isinstance(colours, str):
            return colours.title()
        return '-'


    @classmethod
    def _trunk_subject(cls, value):
        """Return 'trunk', 'bark' or 'stem' depending on which keyword the description mentions first in priority order, else None."""
        lowered = str(value).lower()
        for subject, keywords in cls._TRUNK_SUBJECTS:
            if any(keyword in lowered for keyword in keywords):
                return subject
        return None


    @classmethod
    def _normalise_leaf_texture(cls, answer):
        """Return the title-cased class name if answer is one of the known leaf texture classes, else '-'."""
        if isinstance(answer, str) and answer.lower() in cls._LEAF_TEXTURE_CLASSES:
            return answer.title()
        return '-'


    def clean_maximum_height(self, llama_model:"LlamaModel"):
        """
        Function to do QA on the maximum height and convert all heights to metre
        Checks the data and if it does not follow a xx cm/m to xx cm/m format do QA

        Args:
            llama_model (LlamaModel): Llama2-7b model set to QnA mode
        """
        # Clear any chat history
        llama_model.promptGenerator.clear_chat_history()
        column = self.data['Maximum Height']
        for index, value in tqdm(column.items(), total=len(column), desc="Cleaning Maximum Heights"):
            # str() so a cell that pandas parsed as a number can still be matched
            text = str(value)
            if text == '-':
                continue

            new_height = self._height_to_metres(text)
            if new_height is None:
                # Do not meet regex pattern and not - Llama QA
                # When the model fails to return a JSON, assume no height ('-')
                response = llama_model.question_answer("What is the max height of the plant in meters?", value)
                new_height = self._extract_answer(response)

            self.data.at[index, 'Maximum Height'] = new_height


    def clean_flower_colour(self, llama_model:"LlamaModel"):
        """
        Function to do QA on the flower colour
        Checks the data if flower, flowers, spathe are inside the text

        Args:
            llama_model (LlamaModel): Llama2-7b model set to QnA mode
        """
        # Clear any chat history
        llama_model.promptGenerator.clear_chat_history()
        column = self.data['Flower Colour']
        for index, value in tqdm(column.items(), total=len(column), desc='Cleaning Flower Colour'):
            lowered = str(value).lower()
            # 'flower' also covers 'flowers'
            if 'flower' in lowered or 'spathe' in lowered:
                #Llama QA
                response = llama_model.question_answer("What are the colours of the flowers?", value)
                # If the answer cannot be used the colour becomes '-', to be manually retrieved
                self.data.at[index, 'Flower Colour'] = self._normalise_colour(self._extract_answer(response))


    def _clean_trunk_column(self, llama_model:"LlamaModel", column:str, attribute:str):
        """
        Shared QA loop for the trunk columns.

        Args:
            llama_model (LlamaModel): Llama2-7b model set to QnA mode
            column (str): name of the column to clean
            attribute (str): what to ask about, e.g. 'texture' or 'colour'
        """
        # Clear any chat history
        llama_model.promptGenerator.clear_chat_history()
        series = self.data[column]
        for index, value in tqdm(series.items(), total=len(series), desc=f'Cleaning {column}'):
            subject = self._trunk_subject(value)
            if subject is None:
                continue
            response = llama_model.question_answer(f"What is the {attribute} of the {subject}?", value)
            # Every queried cell is overwritten; an empty or unparsable reply becomes '-'
            self.data.at[index, column] = self._extract_answer(response)


    def clean_trunk_texture(self, llama_model:"LlamaModel"):
        """
        Function to do QA on trunk texture
        if trunk, trunks, bark, barks, stem, stems or girth in description, query

        Args:
            llama_model (LlamaModel): Llama2-7b model set to QnA mode

        """
        self._clean_trunk_column(llama_model, 'Trunk Texture', 'texture')


    def clean_trunk_colour(self, llama_model:"LlamaModel"):
        """
        Function to do QA on trunk colour
        if trunk, trunks, bark, barks, stem, stems or girth in description, query

        Args:
            llama_model (LlamaModel): Llama2-7b model set to QnA mode

        """
        self._clean_trunk_column(llama_model, 'Trunk Colour', 'colour')


    def classify_leaf_texture(self, llama_model:"LlamaModel"):
        """
        Function to classify leaf texture

        Args:
            llama_model (LlamaModel): Llama2-7b model set to Classification mode (preset for leaf texture)
        """
        # Clear any chat history
        llama_model.promptGenerator.clear_chat_history()
        column = self.data['Leaf Texture']
        for index, value in tqdm(column.items(), total=len(column), desc='Classifying Leaf Texture'):
            # Nothing to classify for placeholder / missing descriptions
            if value in ('-', 'None'):
                continue
            response = llama_model.classify(value)
            self.data.at[index, 'Leaf Texture'] = self._normalise_leaf_texture(self._extract_answer(response))


    def clean_data(self, output_path:str):
        """
        Function to run all data cleaning functions before downloading updated data into a csv file

        Args:
            output_path (str): filepath to csv output for clean dataset
        """
        QnAModel = LlamaModel('QnA')
        print("Starting data cleaning.")
        self.clean_maximum_height(QnAModel)
        self.clean_flower_colour(QnAModel)
        self.clean_trunk_texture(QnAModel)
        self.clean_trunk_colour(QnAModel)

        print("Information extration complete, starting classification.")
        # Drop the QnA model before the classification model is loaded
        del QnAModel
        gc.collect()
        classificationModel = LlamaModel("Classification")
        self.classify_leaf_texture(classificationModel)

        print("Data cleaning completed.")
        self.data.to_csv(output_path, index=False)
        print(f"Data saved to {output_path}")


In [None]:
# Load the raw flora CSV, run every cleaning step and write the cleaned CSV.
# clean_data loads the Llama model itself, so this cell needs a valid token or local model files.
dataset = DataCleaningModel(csv_filepath="../src/flora_data/flora_species_updated.csv")
dataset.clean_data('../src/flora_data/cleaned_flora_species.csv')