### Data Cleaning

In [1]:
import os
# Placeholder for the Hugging Face access token that later cells read back with
# os.getenv('HG_ACCESS_TOKEN'). Fill it in locally; never commit a real token.
os.environ['HG_ACCESS_TOKEN'] = ''

In [None]:
#Function to load up LLama2-7b
import os
from transformers import AutoTokenizer, AutoModelForCausalLM

def setup_llama(model_directory:str='../src/llama/model', tokenizer_directory:str='../src/llama/tokenizer'):
    """
    Download the Llama-2-7b chat model and tokenizer and save them to disk.

    The Hugging Face access token is read from the HG_ACCESS_TOKEN environment
    variable. Missing target directories are created.

    Args:
        model_directory (str): directory the model is saved to, defaults to '../src/llama/model'
        tokenizer_directory (str): directory the tokenizer is saved to, defaults to '../src/llama/tokenizer'

    Returns:
        0 for success and 1 for failure (no token, or any error while downloading/saving)
    """
    hg_access_token = os.getenv('HG_ACCESS_TOKEN')
    # os.getenv returns None when the variable is unset, so test truthiness
    # rather than len(): "unset" and "empty" both mean there is no usable token.
    if not hg_access_token:
        print("No valid access key found, did you update your .env file?")
        return 1

    for directory, label in ((model_directory, "Model"), (tokenizer_directory, "tokenizer")):
        if not os.path.isdir(directory):
            # exist_ok guards against the directory appearing between the check and the call.
            os.makedirs(directory, exist_ok=True)
            print(f"Llama {label} directory not found, directory created")

    print("Preparing to download LLama")

    try:
        llm_model = AutoModelForCausalLM.from_pretrained('meta-llama/Llama-2-7b-chat-hf', token=hg_access_token)
        llm_model.save_pretrained(model_directory)
        print("LLama Model downloaded")

        llm_tokenizer = AutoTokenizer.from_pretrained('meta-llama/Llama-2-7b-chat-hf', token=hg_access_token)
        llm_tokenizer.save_pretrained(tokenizer_directory)
        print("LLama Tokenizer downloaded")
        return 0

    except Exception as e:
        # Best-effort by design: report the reason and signal failure through the return code.
        print(f"Failed to download Llama, error: {e}")
        return 1
    
# Download with the default target directories; the 0/1 status code is not checked here.
setup_llama()

In [2]:
class PromptTemplate:
    """
    Builds a chat prompt from an optional system prompt plus alternating user
    messages and model replies, using the <s>, [INST] and <<SYS>> markers.

    Each instance keeps its own conversation history.
    """
    system_prompt = None

    def __init__(self, system_prompt=None):
        self.system_prompt = system_prompt
        # Per-instance history. As class-level lists these were shared by every
        # PromptTemplate, so one instance could see or clear another's messages.
        self.user_messages = []
        self.model_replies = []

    def add_user_message(self, message: str, return_prompt=True):
        """Append a user message; by default return the prompt rebuilt from the history."""
        self.user_messages.append(message)
        if return_prompt:
            return self.build_prompt()

    def add_model_reply(self, reply: str, includes_history=True, return_reply=True):
        """
        Record the model's reply to the latest user message.

        Args:
            reply: raw model output.
            includes_history: when True, the current prompt text is removed from ``reply`` first.
            return_reply: when True, return the stored reply.

        Raises:
            ValueError: if there is not exactly one user message awaiting a reply.
        """
        # Validate before mutating so a rejected reply cannot unbalance the history.
        if len(self.user_messages) != len(self.model_replies) + 1:
            raise ValueError(
                "Number of user messages does not equal number of system replies."
            )
        reply_ = reply.replace(self.build_prompt(), "") if includes_history else reply
        self.model_replies.append(reply_)
        if return_reply:
            return reply_

    def get_user_messages(self, strip=True):
        """Return the user messages, whitespace-stripped unless ``strip`` is False."""
        return [x.strip() for x in self.user_messages] if strip else self.user_messages

    def get_model_replies(self, strip=True):
        """Return the model replies, whitespace-stripped unless ``strip`` is False."""
        return [x.strip() for x in self.model_replies] if strip else self.model_replies

    def clear_chat_history(self):
        """Forget all user messages and model replies; the system prompt is kept."""
        self.user_messages.clear()
        self.model_replies.clear()

    def build_prompt(self):
        """
        Render the conversation as a single prompt string.

        With an empty history only the system prompt is rendered. Otherwise the
        history must contain exactly one more user message than model replies.

        Raises:
            ValueError: if the history is non-empty and not awaiting exactly one reply.
        """
        if self.user_messages == [] and self.model_replies == []:
            return f"<s>[INST] <<SYS>>\n{self.system_prompt}\n<</SYS>> [/INST]</s>"
        
        elif len(self.user_messages) != len(self.model_replies) + 1:
            raise ValueError(
                "Error: Expected len(user_messages) = len(model_replies) + 1. Add a new user message!"
            )

        if self.system_prompt is not None:
            SYS = f"[INST] <<SYS>>\n{self.system_prompt}\n<</SYS>>"
        else:
            SYS = ""

        CONVO = ""
        SYS = "<s>" + SYS
        # Completed turns: every turn except the first gets its own "[INST] " opener.
        for i in range(len(self.user_messages) - 1):
            user_message, model_reply = self.user_messages[i], self.model_replies[i]
            conversation_ = f"{user_message} [/INST] {model_reply} </s>"
            if i != 0:
                conversation_ = "[INST] " + conversation_
            CONVO += conversation_

        # NOTE(review): the pending message always gets an "[INST] " opener, so with a
        # system prompt and a single message the output is
        # "<s>[INST] <<SYS>>...<</SYS>>[INST] msg [/INST]" (two openers). Left unchanged
        # because existing model outputs were produced with this layout -- confirm
        # against the model's documented chat format before altering it.
        CONVO += f"[INST] {self.user_messages[-1]} [/INST]"

        return SYS + CONVO

In [3]:
# Data Cleaning Model
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
import json
import pandas as pd

class LlamaModel():
    """
    Wrapper around the Llama-2-7b chat model for question answering and
    leaf-texture classification.

    The constructor loads the model and tokenizer, builds the generation
    config and selects the system prompt: the QnA prompt when ``mode == 'QnA'``,
    the classification prompt for any other value of ``mode``.
    """
    def __init__(self, mode:str, llama_model_path:str=None, llama_tokenizer_path:str=None):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.llama_model, self.llama_tokenizer = self._load_llama(llama_model_path, llama_tokenizer_path)
        self.config = GenerationConfig(max_new_tokens=1024,
                        do_sample=True,
                        top_k = 10,
                        num_return_sequences = 1,
                        return_full_text = False,
                        temperature = 0.01,
                        )

        self.QAsystemPrompt = """You are a question answering model, given a question and context in the format of a JSON:
        {
        "question":'',
        "context":''
        }
        , you are to return the answer with the following JSON format.
        {
        "answer": 
        }
        If the context does not answer the question, the answer is -.
        Only return the JSON with the correct answer. No other text is allowed.
        """
        
        self.ClassificationsystemPrompt = """You are a classification model with 3 possible categories. Each category and it's description is given in the following JSON.
        {
        "fine": "Linear, thin shaped leaves and stems with no spikes or rough edges. Leaves are needle like shape and should not stand upwards.",
        "medium": "The most common texture in plants, medium size and shape. Often fleshy, rounded or oval, and not overly detailed. If unsure, pick this class.",
        "coarse": "Must be large, broad leaves with size larger than 15cm. Often rough or thick with prominent veins, lobes or edges that stand out visually."
        }
        You will be given a description of the plant leaf type. Categorise the description into one of the classes and return your answer in following JSON format. Always prioritise the size before the shape texture in your classification.
        {
        "answer": class
        }
        Only return the JSON with the classification. No description is accepted. No other text.
        """

        self.promptGenerator = PromptTemplate(system_prompt= self.QAsystemPrompt if mode == 'QnA' else self.ClassificationsystemPrompt)


    def _load_llama(self, llama_model_path:str, llama_tokenizer_path:str):
        """
        Load the Llama-2-7b chat model and tokenizer.

        If HG_ACCESS_TOKEN is set in the environment (even to an empty string)
        the model is loaded from Hugging Face with that token. Otherwise it is
        loaded from the given local directories.

        Returns:
            llama_model
            llama_tokenizer

        Raises:
            Exception: if loading fails, or if neither a token nor both paths are available.
        """

        hg_access = os.getenv('HG_ACCESS_TOKEN')
        if hg_access is not None:
            try:
                print("Loading Llama Models")
                llama_model = AutoModelForCausalLM.from_pretrained('meta-llama/Llama-2-7b-chat-hf', token=hg_access, torch_dtype=torch.bfloat16, device_map="auto")
                llama_tokenizer = AutoTokenizer.from_pretrained('meta-llama/Llama-2-7b-chat-hf', token=hg_access)
                print("Llama Loaded Successfully")
                return llama_model, llama_tokenizer
            
            except Exception as e:
                # Chain the cause so the original traceback is not lost.
                raise Exception(f"Unable to load Llama model from hugging face, reasons: {e}") from e

        elif llama_model_path is not None and llama_tokenizer_path is not None:
            try:
                print("Loading Llama Models")
                llama_model = AutoModelForCausalLM.from_pretrained(llama_model_path, torch_dtype=torch.bfloat16, device_map="auto")
                llama_tokenizer = AutoTokenizer.from_pretrained(llama_tokenizer_path)
                print("Llama Loaded Successfully")
                return llama_model, llama_tokenizer
            
            except Exception as e:
                raise Exception(f"Unable to load Llama model from directory, reasons: {e}") from e
        
        else:
            raise Exception("No Llama resources provided")


    def _run_single_turn(self, user_message):
        """
        Send one user message through the model and return its reply text.

        The reply is whatever follows the last "[/INST]" marker in the decoded
        output, stripped of surrounding whitespace. The chat history is always
        cleared afterwards -- including when tokenising or generating raises --
        so one failed call cannot leave a stale message that breaks the next.
        """
        try:
            llama_prompt = self.promptGenerator.add_user_message(user_message)
            encoded_input = self.llama_tokenizer.encode(llama_prompt, return_tensors='pt', add_special_tokens=False).to(self.device)
            results = self.llama_model.generate(encoded_input, generation_config=self.config)
            decoded_output = self.llama_tokenizer.decode(results[0], skip_special_tokens=True)
            return decoded_output.split("[/INST]")[-1].strip()
        finally:
            self.promptGenerator.clear_chat_history()


    def question_answer(self, question, context):
        """
        Ask ``question`` about ``context``.

        Both are sent as one JSON object with "question" and "context" keys.

        Returns:
            The raw reply text; callers parse it as JSON themselves.
        """
        return self._run_single_turn(
            json.dumps({
                "question": question,
                "context": context
            })
        )


    def classify(self, context):
        """
        Send ``context`` as the user message and return the raw reply text.
        """
        return self._run_single_turn(context)

Things to check:
1. Maximum Height -> QA; for ranges written as "xx to xx", extract the tallest height, convert everything to metres and keep only the number
2. Flower Colour -> QA, only needed when the value mentions flowers
3. Trunk Texture -> QA, only needed when the value mentions the trunk or bark
4. Trunk Colour -> QA, only needed when the value mentions the trunk or bark
5. Leaf Texture -> Classification

In [4]:
# 'QnA' selects the question-answering system prompt; the cleaning functions that call
# question_answer() expect this instance.
Llama_model = LlamaModel('QnA')

Loading Llama Models


  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



Llama Loaded Successfully


In [21]:
# Species table to be cleaned; the clean_* functions below overwrite its cells in place.
flora_data = pd.read_csv("../src/flora_data/flora_species_updated.csv")

In [23]:
import re
from tqdm import tqdm

def clean_maximum_height(data, llama_model):
    """
    Normalise the 'Maximum Height' column of ``data`` in place.

    Per cell:
      * '-' and non-string cells (e.g. NaN) are left untouched.
      * Text of the form "<n>[ cm|m]" or "<n>[ cm|m] to <n>[ cm|m]" is parsed
        directly: the upper bound is taken and converted to metres as a float.
        If the upper bound carries no unit the cell is left unchanged.
      * Any other text is sent to ``llama_model.question_answer``; the "answer"
        field of its JSON reply is stored, or '-' if the reply cannot be parsed.

    Args:
        data: DataFrame with a 'Maximum Height' column; modified in place.
        llama_model: object exposing ``promptGenerator.clear_chat_history()`` and
            ``question_answer(question, context)``.

    Returns:
        The same ``data`` object (not a copy).
    """
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    # Regex pattern for xx cm/m to xx cm/m
    pattern = re.compile(r'^\d+(\.\d+)?( ?(cm|m))?( to \d+(\.\d+)?( ?(cm|m))?)?$')

    llama_model.promptGenerator.clear_chat_history()
    for index, value in tqdm(data['Maximum Height'].items(), total=len(data['Maximum Height']), desc="Cleaning Maximum Heights"):
        # Missing cells arrive as float NaN; they cannot be regex-matched.
        if not isinstance(value, str) or value == '-':
            continue

        if pattern.match(value):
            # Keep only the upper bound of a "low to high" range.
            upper = value[value.index('to') + 2:] if 'to' in value else value
            upper = upper.strip()

            if upper.endswith('cm'):
                # Slice off exactly the unit. The old "index('cm') - 1" slice also
                # dropped the last digit whenever no space preceded the unit.
                new_height = float(upper[:-2]) / 100
            elif upper.endswith('m'):
                new_height = float(upper[:-1])
            else:
                # NOTE(review): unit-less values (including unit-less ranges such as
                # "3 to 5") are stored unchanged because their unit is unknown.
                new_height = value

        else:
            #Llama QA
            response = llama_model.question_answer("What is the max height of the plant in meters?", value)
            try:
                new_height = json.loads(response)['answer']
            except (ValueError, KeyError, TypeError):
                # Unparseable reply or no "answer" field: mark for manual retrieval.
                new_height = '-'

        data.at[index, 'Maximum Height'] = new_height

    return data

# clean_maximum_height edits flora_data in place and returns it, so updated_height_df
# is the same object as flora_data, not a copy.
updated_height_df = clean_maximum_height(flora_data, Llama_model)

Cleaning Maximum Heights: 100%|██████████| 4927/4927 [10:07<00:00,  8.11it/s]


In [28]:
from tqdm import tqdm

def clean_flower_colour(data, llama_model):
    """
    Clean the 'Flower Colour' column of ``data`` in place.

    Only string cells containing "flower" (case-insensitive) are sent to
    ``llama_model.question_answer``; the "answer" field of the JSON reply
    replaces the cell, or '-' if the reply cannot be parsed. All other cells,
    including missing ones, are left untouched.

    Args:
        data: DataFrame with a 'Flower Colour' column; modified in place.
        llama_model: object exposing ``promptGenerator.clear_chat_history()`` and
            ``question_answer(question, context)``.

    Returns:
        The same ``data`` object (not a copy).
    """
    llama_model.promptGenerator.clear_chat_history()
    for index, value in tqdm(data['Flower Colour'].items(), total=len(data['Flower Colour']), desc='Cleaning Flower Colour'):
        # Missing cells arrive as float NaN and have no .lower(); skip them.
        # Testing for 'flower' alone also covers 'flowers'.
        if not isinstance(value, str) or 'flower' not in value.lower():
            continue

        #Llama QA
        response = llama_model.question_answer("What are the colours of the flowers?", value)
        try:
            flower_colour = json.loads(response)['answer']
        except (ValueError, KeyError, TypeError):
            # Unparseable reply or no "answer" field: mark for manual retrieval.
            flower_colour = '-'

        data.at[index, 'Flower Colour'] = flower_colour

    return data

In [17]:
from tqdm import tqdm

def clean_trunk_texture(data, llama_model):
    """
    Clean the 'Trunk Texture' column of ``data`` in place.

    Per cell:
      * missing -> the string 'None';
      * text mentioning trunk, bark, or stem/girth (checked in that order,
        case-insensitive) -> ``llama_model.question_answer`` is asked for the
        texture of that part and the "answer" field of its JSON reply is stored,
        or '-' if the reply cannot be parsed;
      * anything else, including non-string values -> left unchanged.

    Args:
        data: DataFrame with a 'Trunk Texture' column; modified in place.
        llama_model: object exposing ``promptGenerator.clear_chat_history()`` and
            ``question_answer(question, context)``.

    Returns:
        The same ``data`` object (not a copy).
    """
    # (keywords, question) pairs in priority order; the first pair with a hit wins.
    # Singular keywords already match their plurals as substrings.
    questions = (
        (('trunk',), "What is the texture of the trunk?"),
        (('bark',), "What is the texture of the bark?"),
        (('stem', 'girth'), "What is the texture of the stem?"),
    )

    llama_model.promptGenerator.clear_chat_history()
    for index, value in tqdm(data['Trunk Texture'].items(), total=len(data['Trunk Texture']), desc='Cleaning Trunk Texture'):
        if pd.isna(value):
            data.at[index, 'Trunk Texture'] = 'None'
            continue

        # Non-string cells have no .lower(); leave them as they are.
        if not isinstance(value, str):
            continue

        lowered = value.lower()
        question = next(
            (text for keywords, text in questions if any(word in lowered for word in keywords)),
            None,
        )
        if question is None:
            continue

        #Llama QA
        response = llama_model.question_answer(question, value)
        if not response:
            # Empty reply: keep the original text.
            continue

        try:
            bark_texture = json.loads(response)['answer']
        except (ValueError, KeyError, TypeError):
            bark_texture = '-'

        data.at[index, 'Trunk Texture'] = bark_texture

    return data

In [11]:
from tqdm import tqdm

def clean_trunk_colour(data, llama_model):
    """
    Clean the 'Trunk Colour' column of ``data`` in place.

    Per cell:
      * missing -> the string 'None';
      * text mentioning trunk, bark, or stem/girth (checked in that order,
        case-insensitive) -> ``llama_model.question_answer`` is asked for the
        colour of that part and the "answer" field of its JSON reply is stored,
        or '-' if the reply cannot be parsed;
      * anything else, including non-string values -> left unchanged.

    Args:
        data: DataFrame with a 'Trunk Colour' column; modified in place.
        llama_model: object exposing ``promptGenerator.clear_chat_history()`` and
            ``question_answer(question, context)``.

    Returns:
        The same ``data`` object (not a copy).
    """
    # (keywords, question) pairs in priority order; the first pair with a hit wins.
    # Singular keywords already match their plurals as substrings.
    questions = (
        (('trunk',), "What is the colour of the trunk?"),
        (('bark',), "What is the colour of the bark?"),
        (('stem', 'girth'), "What is the colour of the stem?"),
    )

    llama_model.promptGenerator.clear_chat_history()
    for index, value in tqdm(data['Trunk Colour'].items(), total=len(data['Trunk Colour']), desc='Cleaning Trunk Colour'):
        if pd.isna(value):
            data.at[index, 'Trunk Colour'] = 'None'
            continue

        # Non-string cells have no .lower(); leave them as they are.
        if not isinstance(value, str):
            continue

        lowered = value.lower()
        question = next(
            (text for keywords, text in questions if any(word in lowered for word in keywords)),
            None,
        )
        if question is None:
            continue

        #Llama QA
        response = llama_model.question_answer(question, value)
        if not response:
            # Empty reply: keep the original text.
            continue

        try:
            bark_colour = json.loads(response)['answer']
        except (ValueError, KeyError, TypeError):
            bark_colour = '-'

        data.at[index, 'Trunk Colour'] = bark_colour

    return data

In [29]:
# clean_flower_colour returns the DataFrame it was given, so this name refers to the
# same in-place-edited object as updated_height_df.
updated_flower__df = clean_flower_colour(updated_height_df, Llama_model)

Cleaning Flower Colour: 100%|██████████| 4927/4927 [02:48<00:00, 29.21it/s] 


In [46]:
# Both functions edit their argument in place and return it, so
# updated_trunk_texture_df and updated_trunk_colour_df are the same DataFrame object;
# after these two lines either name shows both the texture and the colour cleaning.
updated_trunk_texture_df = clean_trunk_texture(updated_flower__df, Llama_model)
updated_trunk_colour_df = clean_trunk_colour(updated_trunk_texture_df, Llama_model)

Cleaning Trunk Colour: 100%|██████████| 4927/4927 [01:08<00:00, 71.97it/s] 


In [49]:
# Snapshot of the frame after the QnA cleaning steps, written to the current working directory.
updated_trunk_colour_df.to_csv('output.csv', index=False)

In [20]:
from tqdm import tqdm

def classify_leaf_texture(data, llama_model):
    """
    Classify the 'Leaf Texture' column of ``data`` in place.

    Per cell:
      * missing -> the string 'None';
      * '-' -> left unchanged;
      * anything else -> passed to ``llama_model.classify``; the "answer" field
        of its JSON reply is stored, or '-' if the reply cannot be parsed.

    Args:
        data: DataFrame with a 'Leaf Texture' column; modified in place.
        llama_model: object exposing ``promptGenerator.clear_chat_history()`` and
            ``classify(context)``.

    Returns:
        The same ``data`` object (not a copy).
    """
    llama_model.promptGenerator.clear_chat_history()
    for index, value in tqdm(data['Leaf Texture'].items(), total=len(data['Leaf Texture']), desc='Classifying Leaf Texture'):
        if pd.isna(value):
            data.at[index, 'Leaf Texture'] = 'None'
            continue

        if value == '-':
            continue

        response = llama_model.classify(value)
        try:
            leaf_texture = json.loads(response)['answer']
        # Only parse failures fall back to '-'. A bare except here also swallowed
        # KeyboardInterrupt, which made this long-running loop impossible to stop.
        except (ValueError, KeyError, TypeError):
            leaf_texture = '-'

        data.at[index, 'Leaf Texture'] = leaf_texture

    return data

In [19]:
# Drop the QnA instance and force a garbage-collection pass before constructing the
# classification instance -- presumably so two copies of the model are not held at once.
del Llama_model
import gc
gc.collect()
# Any mode other than 'QnA' selects the classification system prompt.
Llama_model = LlamaModel('Classification')

Loading Llama Models


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



Llama Loaded Successfully


In [21]:
# updated_trunk_texture_df is the same in-place-edited DataFrame that clean_trunk_colour
# returned, so the trunk colour cleaning is included in this input as well.
updated_leaf_texture = classify_leaf_texture(updated_trunk_texture_df, Llama_model)

Classifying Leaf Texture: 100%|██████████| 4927/4927 [6:18:56<00:00,  4.61s/it]    


In [22]:
# Replace every remaining null cell with the string 'None'; a real null would be
# written to the CSV as an empty field.
updated_leaf_texture = updated_leaf_texture.where(pd.notnull(updated_leaf_texture), 'None')
updated_leaf_texture.to_csv('../src/flora_data/cleaned_flora_species.csv', index=False)

### Data Analysis of Full Data

In [1]:
import pandas as pd

# Reload the cleaned table written by the data-cleaning section.
# NOTE(review): depending on the pandas version, the literal text 'None' may be parsed
# back as a missing value by read_csv's default NA handling -- confirm, and pass
# keep_default_na / na_values explicitly if the 'None' strings must survive.
flora_data = pd.read_csv('../src/flora_data/cleaned_flora_species.csv', sep=',', header=0)
flora_data.head()

Unnamed: 0,Scientific Name,Common Name,Species ID,Link,Plant Type,Light Preference,Water Preference,Drought Tolerant?,Native to SG?,Fruit Bearing?,...,Hazard,Attracted animals,Native habitat,Mature Leaf Colour,Young Flush Leaf Colour,Leaf Area Index,Growth rate,Trunk Texture,Trunk Colour,Leaf Texture
0,Abelia × grandiflora (Planch. & Linden) Traub,Glossy abelia,4055,https://www.nparks.gov.sg/florafaunaweb/flora/...,Shrub,Full Sun,Moderate Water,False,False,False,...,-,Butterfly-Attracting,Terrestrial,Green,-,-,Moderate,,,medium
1,Abelia × grandiflora 'Francis Mason',Golden Abelia,3463,https://www.nparks.gov.sg/florafaunaweb/flora/...,Shrub,"Full Sun, Semi Shade",Moderate Water,False,False,False,...,-,Butterfly-Attracting,-,"Green, Yellow / Golden",Red,4.5 (Shrub & Groundcover - Dicot),Moderate,,,medium
2,Abelia × grandiflora 'Kaleidoscope',Kaleidoscope Abelia,5038,https://www.nparks.gov.sg/florafaunaweb/flora/...,Shrub,Full Sun,Moderate Water,False,False,False,...,-,Butterfly-Attracting,Terrestrial,Green - Light Green,Red,-,Moderate,,,medium
3,Abelia × grandiflora 'Variegata',-,4056,https://www.nparks.gov.sg/florafaunaweb/flora/...,Shrub,"Full Sun, Semi Shade",Moderate Water,False,False,False,...,-,-,Terrestrial,-,-,-,Moderate,,,-
4,Abelmoschus esculentus (L.) Moench,Lady's Fingers,1581,https://www.nparks.gov.sg/florafaunaweb/flora/...,"Herbaceous Plant, Shrub",Full Sun,Moderate Water,False,False,False,...,"Spines/Thorns - Stem/Branch, Spines/Thorns - Leaf",-,-,Green,Green,-,Moderate,,,coarse


In [2]:
# Basic Data analysis
import matplotlib.pyplot as plt

# Plant Types
# Tally rows per category from the distinct 'Plant Type' strings. The categories
# overlap: a type string naming both Shrub and Tree is added to 'both', 'shrub' and
# 'tree'. Palm is counted under 'tree' but does not contribute to 'both'.
plant_counts = flora_data['Plant Type'].value_counts()
tree_shrub_count = {'tree':0, 'shrub': 0, 'both': 0}

for plant_types, counts in plant_counts.items():
    is_shrub = 'Shrub' in plant_types
    is_tree = 'Tree' in plant_types

    if is_shrub:
        tree_shrub_count['shrub'] += counts
        if is_tree:
            tree_shrub_count['both'] += counts

    if is_tree or 'Palm' in plant_types:
        tree_shrub_count['tree'] += counts

print(tree_shrub_count)

# fig, ax = plt.subplots(figsize=(8, 5))
# ax.bar(plant_counts.index, plant_counts.values)
# ax.set_xlabel('Plant Type')
# ax.set_ylabel('Count')
# ax.set_title('Plant Count')
# plt.xticks(rotation=90)
# plt.show()

{'tree': 1507, 'shrub': 1307, 'both': 197}


In [3]:
# Tree Analysis
# Rows whose 'Plant Type' mentions Tree or Palm (case-insensitive).
tree_data = flora_data[flora_data['Plant Type'].str.contains('Tree|Palm', case=False)]
print(tree_data.shape)

tree_data_characteristics = {}
labels_to_check = ["Light Preference", "Water Preference","Drought Tolerant?", "Growth rate", "Native to SG?", "Attracted animals", "Fruit Bearing?"]

for label in labels_to_check:
    # value_counts() yields each distinct cell value with its row count. A cell may
    # hold several comma-separated entries; each entry is credited with that row count.
    # The loop variables deliberately avoid the names `type` and `data`, which would
    # rebind the builtin type() (and a generic name) for the rest of the kernel session.
    for cell_value, counts in tree_data[label].value_counts().items():
        label_tally = tree_data_characteristics.setdefault(label, {})
        # NOTE(review): a plain split also breaks entries that contain a comma
        # inside parentheses into separate fragments.
        for entry in str(cell_value).split(","):
            entry = entry.strip()
            label_tally[entry] = label_tally.get(entry, 0) + counts


for label, value in tree_data_characteristics.items():
    print(label, value)

(1507, 23)
Light Preference {'Full Sun': 1393, 'Semi Shade': 452, 'Full Shade': 20, '-': 2}
Water Preference {'Moderate Water': 1434, 'Lots of Water': 149, 'Little Water': 51, 'Occasional Misting': 3, '-': 3}
Drought Tolerant? {'False': 1383, 'True': 124}
Growth rate {'Moderate': 1284, 'Fast to Moderate': 65, 'Slow': 57, 'Moderate to Slow': 42, 'Fast': 42, '-': 20}
Native to SG? {'False': 990, 'True': 517}
Attracted animals {'-': 1167, 'Bird-Attracting': 142, 'Bird-Attracting (Fruits)': 60, 'Butterfly Host Plant': 43, 'Butterfly-Attracting': 55, 'Bat Food': 22, 'Bee-Attracting': 30, 'Caterpillar Moth Food Plant': 29, 'Butterfly Host Plant (Leaves)': 17, 'Caterpillar Moth Food Plant (Leaves)': 8, 'Bird-Attracting (Flowers)': 12, 'Butterfly-Attracting (Flower Nectar)': 13, 'Moth Food Plant': 6, 'Bird-Attracting (Seeds)': 3, 'Bat Food (Fruits)': 3, 'Moth Food Plant (Flower Nectar)': 2, 'Butterfly Host Plant (Leaves': 12, 'Associated with: Hidari irava)': 1, 'Bird-Attracting (Fruits': 5, '

In [4]:
# Shrub Analysis
# Rows whose 'Plant Type' mentions Shrub (case-insensitive).
shrub_data = flora_data[flora_data['Plant Type'].str.contains('Shrub', case=False)]
print(shrub_data.shape)

shrub_data_characteristics = {}
labels_to_check = ["Light Preference", "Water Preference","Drought Tolerant?", "Growth rate", "Native to SG?", "Attracted animals", "Fruit Bearing?", "Fragrant Plant?", "Leaf Texture"]

for label in labels_to_check:
    # value_counts() yields each distinct cell value with its row count. A cell may
    # hold several comma-separated entries; each entry is credited with that row count.
    # The loop variables deliberately avoid the names `type` and `data`, which would
    # rebind the builtin type() (and a generic name) for the rest of the kernel session.
    for cell_value, counts in shrub_data[label].value_counts().items():
        label_tally = shrub_data_characteristics.setdefault(label, {})
        # NOTE(review): a plain split also breaks entries that contain a comma
        # inside parentheses into separate fragments.
        for entry in str(cell_value).split(","):
            entry = entry.strip()
            label_tally[entry] = label_tally.get(entry, 0) + counts


for label, value in shrub_data_characteristics.items():
    print(label, value)

(1307, 23)
Light Preference {'Full Sun': 1056, 'Semi Shade': 661, 'Full Shade': 48}
Water Preference {'Moderate Water': 1055, 'Little Water': 172, 'Lots of Water': 191, 'Occasional Misting': 67}
Drought Tolerant? {'False': 1110, 'True': 197}
Growth rate {'-': 724, 'Moderate': 367, 'Fast': 119, 'Fast to Moderate': 72, 'Slow': 21, 'Moderate to Slow': 4, 'Very Fast': 1}
Native to SG? {'False': 1155, 'True': 152}
Attracted animals {'-': 964, 'Butterfly-Attracting': 134, 'Bird-Attracting': 92, 'Butterfly-Attracting (Flower Nectar)': 47, 'Bird-Attracting (Fruits)': 34, 'Butterfly Host Plant': 43, 'Bee-Attracting': 57, 'Bird-Attracting (Flowers)': 21, 'Moth Food Plant': 9, 'Butterfly Host Plant (Leaves)': 10, 'Caterpillar Moth Food Plant (Leaves)': 2, 'Bird-Attracting (Seeds)': 1, 'Butterfly Host Plant (Leaves': 5, 'Associated with: Megisba malaya)': 1, 'Bird-Attracting (Fruits': 3, 'Associated with: Pycnonotus goiavier)': 1, 'Caterpillar Moth Food Plant': 8, 'Butterfly-Attracting (': 1, 'Flo

### Data Analysis of Filtered Data

In [5]:
# The values are different because there is tree,shrub data
tree_shrub_data = flora_data[flora_data['Plant Type'].str.contains('Tree|Shrub|Palm', case=False)]
print(tree_shrub_data.shape)

# Filter empty / incomplete data
# Note: For Attracted animals, Hazard, Fragrant Plant -> '-' means False 
invalid_data_list = ['Maximum Height', 'Flower Colour', 'Native habitat', 'Mature Leaf Colour', 'Leaf Area Index', 'Growth rate', 'Trunk Texture', 'Trunk Colour', 'Leaf Texture']
# A cell that is exactly '-' means "no data". Drop every row that has one in any of
# the columns above. The comparison is vectorized over the whole selection instead of
# running a Python lambda once per row.
has_placeholder = tree_shrub_data[invalid_data_list].astype(str).eq('-').any(axis=1)
filtered_tree_shrub_data = tree_shrub_data[~has_placeholder]

print(filtered_tree_shrub_data['Plant Type'].value_counts())
print(filtered_tree_shrub_data.shape)

(2616, 23)
Shrub                                75
Tree                                 72
Herbaceous Plant, Shrub              14
Shrub, Tree                          14
Creeper, Herbaceous Plant, Shrub      4
Climber, Shrub                        4
Palm                                  2
Epiphyte, Herbaceous Plant, Shrub     2
Epiphyte, Shrub                       1
Grass or Grass-like Plant, Shrub      1
Creeper, Shrub                        1
Name: Plant Type, dtype: int64
(190, 23)


In [6]:
# Tree Analysis
# Rows of the filtered set whose 'Plant Type' mentions Tree or Palm (case-insensitive).
tree_data = filtered_tree_shrub_data[filtered_tree_shrub_data['Plant Type'].str.contains('Tree|Palm', case=False)]
print(tree_data.shape)

tree_data_characteristics = {}
labels_to_check = ["Light Preference", "Water Preference","Drought Tolerant?", "Growth rate", "Native to SG?", "Attracted animals", "Fruit Bearing?"]

for label in labels_to_check:
    # value_counts() yields each distinct cell value with its row count. A cell may
    # hold several comma-separated entries; each entry is credited with that row count.
    # The loop variables deliberately avoid the names `type` and `data`, which would
    # rebind the builtin type() (and a generic name) for the rest of the kernel session.
    for cell_value, counts in tree_data[label].value_counts().items():
        label_tally = tree_data_characteristics.setdefault(label, {})
        # NOTE(review): a plain split also breaks entries that contain a comma
        # inside parentheses into separate fragments.
        for entry in str(cell_value).split(","):
            entry = entry.strip()
            label_tally[entry] = label_tally.get(entry, 0) + counts


for label, value in tree_data_characteristics.items():
    print(label, value)

(88, 23)
Light Preference {'Full Sun': 87, 'Semi Shade': 13}
Water Preference {'Moderate Water': 85, 'Lots of Water': 12, 'Little Water': 6}
Drought Tolerant? {'False': 69, 'True': 19}
Growth rate {'Moderate': 65, 'Fast': 11, 'Fast to Moderate': 6, 'Slow': 6, 'Moderate to Slow': 1}
Native to SG? {'False': 61, 'True': 27}
Attracted animals {'-': 51, 'Bird-Attracting': 17, 'Butterfly-Attracting': 11, 'Butterfly Host Plant': 6, 'Bird-Attracting (Fruits)': 5, 'Caterpillar Moth Food Plant': 2, 'Bee-Attracting': 7, 'Butterfly-Attracting (Flower Nectar)': 1, 'Bird-Attracting (Seeds)': 1, 'Bird-Attracting (Flowers)': 2, 'Caterpillar Moth Food Plant (Leaves)': 3, 'Bat Food': 2, 'Caterpillar Moth Food Plant (Leaves': 1, 'Associated with (Attacus atlas': 1, 'Clethrogyna turbata': 1, 'and Strepsicrates rhothia.)': 1, 'Butterfly Host Plant (Leaves': 2, 'Associated with: Odontoptilum angulatum': 1, 'Rapala pheretima': 1, 'Rapala suffusa)': 1, 'Butterfly Host Plant (Associated with: Polyura hebe plau

In [7]:
# Shrub Analysis
# Rows of the filtered set whose 'Plant Type' mentions Shrub (case-insensitive).
shrub_data = filtered_tree_shrub_data[filtered_tree_shrub_data['Plant Type'].str.contains('Shrub', case=False)]
print(shrub_data.shape)

shrub_data_characteristics = {}
labels_to_check = ["Light Preference", "Water Preference","Drought Tolerant?", "Growth rate", "Native to SG?", "Attracted animals", "Fruit Bearing?", "Fragrant Plant?", "Leaf Texture"]

for label in labels_to_check:
    # value_counts() yields each distinct cell value with its row count. A cell may
    # hold several comma-separated entries; each entry is credited with that row count.
    # The loop variables deliberately avoid the names `type` and `data`, which would
    # rebind the builtin type() (and a generic name) for the rest of the kernel session.
    for cell_value, counts in shrub_data[label].value_counts().items():
        label_tally = shrub_data_characteristics.setdefault(label, {})
        # NOTE(review): a plain split also breaks entries that contain a comma
        # inside parentheses into separate fragments.
        for entry in str(cell_value).split(","):
            entry = entry.strip()
            label_tally[entry] = label_tally.get(entry, 0) + counts


for label, value in shrub_data_characteristics.items():
    print(label, value)

(116, 23)
Light Preference {'Full Sun': 100, 'Semi Shade': 66, 'Full Shade': 4}
Water Preference {'Moderate Water': 97, 'Lots of Water': 22, 'Little Water': 16, 'Occasional Misting': 3}
Drought Tolerant? {'False': 92, 'True': 24}
Growth rate {'Moderate': 67, 'Fast': 32, 'Fast to Moderate': 11, 'Slow': 5, 'Moderate to Slow': 2}
Native to SG? {'False': 105, 'True': 11}
Attracted animals {'-': 73, 'Butterfly-Attracting': 19, 'Bird-Attracting': 11, 'Bee-Attracting': 9, 'Butterfly Host Plant': 9, 'Bird-Attracting (Flowers)': 3, 'Butterfly-Attracting (Flower Nectar)': 9, 'Moth Food Plant (Flower Nectar)': 1, 'Moth Food Plant': 2, 'Bird-Attracting (Fruits)': 1, 'Butterfly Host Plant (Associated with: Junonia atlites)': 1, 'Butterfly Host Plant (Leaves)': 1, 'Butterfly Host Plant (Leaves': 2, 'Associated with: Eurema  hecabe contubernalis (Moore': 1, '1886)': 1, 'Lexias pardalis)': 1, 'Butterfly Host Plant (Associated with: )': 1, 'Associated with: Hypolimnas bolina jacintha': 1, 'Doleschallia

### Picking Data

In [8]:
def analyse_data_attributes(selected_dataset, attributes_list):
    """
    Count rows and attribute values per tracked plant type.

    The tracked types are "Tree", "Herbaceous Plant", "Shrub" and "Palm". A row's
    'Plant Type' is a comma-separated list; the row contributes to every tracked
    type it names. A row that names Palm but not Tree is also counted under Tree.

    For each tracked type the result holds:
      * 'Count': number of contributing rows;
      * one dict per attribute in ``attributes_list`` mapping each comma-separated
        entry of the cell (converted with str() and stripped) to its occurrence count.

    NOTE(review): cells are split on every comma, so an entry that contains a comma
    inside parentheses is counted as separate fragments.

    Args:
        selected_dataset: DataFrame with a 'Plant Type' column and the listed attributes.
        attributes_list: column names to tally.

    Returns:
        dict keyed by tracked plant type; a type with no rows maps to an empty dict.
    """
    data_attributes = {"Tree": {}, "Herbaceous Plant":{}, "Shrub": {}, "Palm": {}}

    for _, row in selected_dataset.iterrows():
        plant_type = row['Plant Type']
        # Append Palm under tree category
        if 'Palm' in plant_type and 'Tree' not in plant_type:
            plant_type += ", Tree"

        # Only the tracked categories matter from here on; resolve them once per row.
        names = (part.strip() for part in plant_type.split(','))
        tracked = [name for name in names if name in data_attributes]

        for name in tracked:
            bucket = data_attributes[name]
            bucket['Count'] = bucket.get('Count', 0) + 1

        for attribute in attributes_list:
            entries = [piece.strip() for piece in str(row[attribute]).split(",")]

            for name in tracked:
                tally = data_attributes[name].setdefault(attribute, {})
                for entry in entries:
                    tally[entry] = tally.get(entry, 0) + 1

    return data_attributes

In [9]:
# Draw 30 rows from the filtered set and summarise them per plant type.
# NOTE(review): sample() is called without random_state, so every run draws a
# different subset and the printed summary is not reproducible.
selected_dataset = filtered_tree_shrub_data.sample(n=30)
# Every column from position 5 onward is analysed.
attributes_list = selected_dataset.columns[5:]

selected_data_attributes = analyse_data_attributes(selected_dataset, attributes_list)

# Print each plant type followed by its 'Count' and per-attribute tallies.
for label, value in selected_data_attributes.items():
    print(label)
    for attribute, attribute_value in value.items():
        print(attribute, attribute_value)

Tree
Count 12
Light Preference {'Full Sun': 12, 'Semi Shade': 2}
Water Preference {'Little Water': 1, 'Moderate Water': 11, 'Lots of Water': 2}
Drought Tolerant? {'True': 1, 'False': 11}
Native to SG? {'False': 8, 'True': 4}
Fruit Bearing? {'False': 12}
Fragrant Plant? {'-': 1, 'True': 4, 'None': 7}
Maximum Height {'15.0': 1, '30': 1, '30.0': 2, '40.0': 1, '40': 1, '35.0': 1, '20.0': 2, '25.0': 1, '45.0': 1, '13.0': 1}
Flower Colour {'Green': 2, 'Purple': 3, 'White': 6, 'Yellow / Golden': 4, 'Brown': 1, 'Cream / Off-White': 2, 'Pink': 4, 'Red': 1, "['yellowish white']": 1}
Hazard {'Low Crown / Clearance': 1, '-': 11}
Attracted animals {'-': 9, 'Bat Food (Flower Nectar)': 1, 'Bird-Attracting (Flowers)': 1, 'Caterpillar Moth Food Plant (Leaves)': 1, 'Bird-Attracting': 1, 'Butterfly Host Plant': 1, 'Bat Food': 1}
Native habitat {'Terrestrial (Coastal Forest)': 1, 'Terrestrial': 4, 'Terrestrial (Primary Rainforest': 2, 'Monsoon Forest)': 2, 'Terrestrial (Monsoon Forest)': 1, 'Shoreline (Ma

In [10]:
def valid_dataset(dataset_attributes, n):
    """
    Decide whether a sampled dataset satisfies the selection ruleset.

    Args:
        dataset_attributes (dict): mapping of plant type ('Tree', 'Shrub', 'Palm',
            'Herbaceous Plant') to a dict holding a 'Count' entry plus one
            ``{attribute value: count}`` dict per attribute.
        n (int): number of rows in the sampled dataset.

    Returns:
        bool: True when every enforced rule passes, False otherwise. A missing
        plant type, count or attribute is treated as 0 / empty, so the sample
        is rejected instead of raising.

    Enforced rules:
    - At least 1 Palm and at least 5 Herbaceous Plant entries.
    - tree_count (Tree + Palm counts) plus the Shrub count may exceed n by at most 5.
    - tree_count / shrub_count must lie strictly between 0.85 and 1.15.
    - Shrub 'Light Preference' has exactly 3 distinct values, each seen at least twice.
    - Shrub 'Leaf Texture' has exactly 3 distinct values and none of them is 'grassy'.

    Rules that were drafted but are currently NOT enforced:
    - Tree: exactly 2 'Light Preference' values (each >= 4), exactly 3
      'Water Preference' values (each >= 3), both 'Drought Tolerant?' values
      (each >= 5), both 'Native to SG?' values (each >= 4).
    - Shrub: at least 3 'Water Preference' values (each >= 1), both
      'Drought Tolerant?' values (each >= 5), both 'Native to SG?' values
      (each >= 2), both 'Fragrant Plant?' values (each >= 5).
    """
    palm = dataset_attributes.get('Palm', {})
    herbaceous = dataset_attributes.get('Herbaceous Plant', {})
    tree = dataset_attributes.get('Tree', {})
    shrub = dataset_attributes.get('Shrub', {})

    # At least 1 Palm and 5 Herbaceous plants
    palm_count = palm.get('Count', 0)
    if palm_count < 1:
        return False

    if herbaceous.get('Count', 0) < 5:
        return False

    # Palms are counted together with trees for the balance checks below.
    tree_count = tree.get('Count', 0) + palm_count
    shrub_count = shrub.get('Count', 0)

    # Maximum 5 overlap of tree & shrubs
    if tree_count + shrub_count - n > 5:
        return False

    # A sample without shrubs can never be balanced; rejecting it here also
    # keeps the ratio below from dividing by zero.
    if shrub_count <= 0:
        return False

    tree_shrub_ratio = tree_count / shrub_count
    if tree_shrub_ratio >= 1.15 or tree_shrub_ratio <= 0.85:
        return False

    # SHRUB
    # Light Preference: exactly 3 distinct values, each with a count of at least 2
    light_preference = shrub.get('Light Preference', {})
    if len(light_preference) != 3 or not all(value >= 2 for value in light_preference.values()):
        return False

    # Leaf Texture: exactly 3 distinct values, 'grassy' must not be one of them
    leaf_texture = shrub.get('Leaf Texture', {})
    if 'grassy' in leaf_texture or len(leaf_texture) != 3:
        return False

    return True

In [None]:
# NOTE(review): unexecuted cell of bare integer literals with no effect when run.
# Presumably seeds noted down while searching for a valid sample; the last one
# (4625) equals the starting seed of the search cell below. TODO confirm, and
# consider recording these in a markdown note instead of a code cell.
56659
145
235
367
2137
4580
4625

In [11]:
# Search for a reproducible sample that satisfies valid_dataset().
# Seeds are tried in increasing order from `first_seed`; the first seed whose
# sample passes validation is kept, so re-running this cell selects the same rows.
first_seed = 4625
sample_size = 30
# Upper bound on the search so an unsatisfiable ruleset fails loudly instead of
# spinning the kernel forever.
max_seed_attempts = 100_000

# sample() only picks rows, so the attribute columns (position 5 onward) are the
# same for every draw and can be computed once outside the loop.
attributes_list = filtered_tree_shrub_data.columns[5:]

for seed in range(first_seed, first_seed + max_seed_attempts):
    selected_dataset = filtered_tree_shrub_data.sample(n=sample_size, random_state=seed)
    selected_data_attributes = analyse_data_attributes(selected_dataset, attributes_list)

    if valid_dataset(selected_data_attributes, sample_size):
        break
else:
    # for/else: reached only when no seed in the range produced a valid sample.
    raise RuntimeError(
        f"No valid dataset found for seeds {first_seed}..{first_seed + max_seed_attempts - 1}"
    )

# Report the winning seed followed by the attribute tallies of the chosen sample.
print(seed)
for label, value in selected_data_attributes.items():
    print(label)
    for attribute, attribute_value in value.items():
        print(attribute, attribute_value)

4625
Tree
Count 15
Light Preference {'Full Sun': 15, 'Semi Shade': 1}
Water Preference {'Lots of Water': 5, 'Moderate Water': 14, 'Little Water': 1}
Drought Tolerant? {'False': 13, 'True': 2}
Native to SG? {'True': 7, 'False': 8}
Fruit Bearing? {'False': 15}
Fragrant Plant? {'True': 5, 'None': 9, '-': 1}
Maximum Height {'30.0': 4, '20.0': 3, '45.0': 1, '25.0': 2, '35.0': 1, '15.0': 2, '40.0': 2}
Flower Colour {'Orange': 4, 'Yellow / Golden': 5, 'Pink': 6, 'Red': 5, 'White': 5, 'Cream / Off-White': 3, 'Purple': 2, 'Green': 1}
Hazard {'-': 12, 'Spines/Thorns - Stem/Branch': 1, 'Spines/Thorns - Trunk': 2, 'Toxic Upon Ingestion': 1, 'Low Crown / Clearance': 1}
Attracted animals {'Bird-Attracting (Fruits)': 2, 'Bird-Attracting (Flowers)': 1, 'Caterpillar Moth Food Plant (Leaves)': 1, 'Bird-Attracting': 4, 'Butterfly Host Plant': 1, 'Bat Food': 1, '-': 6, 'Butterfly Host Plant (Leaves': 1, 'Associated with: Eurema  hecabe contubernalis (Moore': 1, '1886)': 1, 'Lexias pardalis)': 1, 'Bee-Attr

In [161]:
# Swap every missing value for the literal string 'None' before exporting
# (otherwise those cells would come out empty in the CSV).
not_missing = selected_dataset.notna()
selected_dataset = selected_dataset.where(not_missing, 'None')
selected_dataset.to_csv('../src/flora_data/dataset.csv', index=False)