In [1]:
import dspy
from dspy.evaluate import Evaluate
from dspy.evaluate.metrics import answer_exact_match
from dspy.teleprompt import *
import pandas as pd
from huggingface_hub import login
from typing import Literal
from tqdm import tqdm
import os
import json
import ollama


In [2]:
#Load model and configure
# Endpoint and key are read from the environment so real credentials never have
# to be committed with the notebook; the fallbacks are the original local
# dev-server values, so the cell behaves as before when nothing is set.
lm = dspy.LM(
    'openai/unsloth/Llama-3.2-3B-Instruct',
    api_base=os.environ.get("LLM_API_BASE", "http://0.0.0.0:8000/v1"),
    api_key=os.environ.get("LLM_API_KEY", "token-abc@123"),
    model_type='text',
)
# ollama_lm = dspy.LM('ollama_chat/llama3.2:1b', api_base="http://localhost:11434", api_key="")
# hf_lm = dspy.LM('huggingface/meta-llama/Llama-3.2-3B-Instruct')

# Make this LM the default for every DSPy predictor created below.
dspy.configure(lm=lm)

In [3]:
#Example
# Toy signature used to smoke-test the LM connection: one free-text input and
# one output restricted to two literal labels.
# The class docstring is not just documentation: it shows up as the task
# objective in the rendered system prompt (see the history printout further
# down), so changing it changes what the model is asked to do.
class Emotion(dspy.Signature):
    """Classify emotion."""

    # Text to classify.
    sentence: str = dspy.InputField()
    # The Literal annotation limits the answer to exactly these two labels.
    sentiment: Literal['very funny', 'so love'] = dspy.OutputField()

# Sample input for the smoke test.
sentence = "i love you "

# Plain (non chain-of-thought) predictor built from the Emotion signature.
classify = dspy.Predict(Emotion)
# Left as the last expression so the notebook displays the returned Prediction.
classify(sentence=sentence)

Prediction(
    sentiment='so love'
)

In [4]:
#To get history
def get_history(lm, n):
    """Return the system and user prompt of the n-th most recent LM call.

    Args:
        lm: Object exposing a ``history`` list whose entries are dicts with a
            ``'messages'`` list of ``{'role': ..., 'content': ...}`` dicts.
        n: 1-based position counted from the end of the history
            (1 = latest call).

    Returns:
        A dict holding the first ``'system'`` and first ``'user'`` message
        contents that are present in the selected call. The dict is empty when
        ``n`` is smaller than 1, when fewer than ``n`` calls were recorded, or
        when the call carries no messages.
    """
    history = lm.history
    last_history = {}
    # n < 1 used to fall through to history[0] (the oldest call); treat it as
    # "nothing to show" instead.
    if n < 1 or len(history) < n:
        return last_history

    # Look messages up by role instead of by position so a call without a
    # system message (or without any messages) cannot raise.
    messages = history[-n].get('messages') or []
    for message in messages:
        role = message.get('role')
        if role in ('system', 'user') and role not in last_history:
            last_history[role] = message['content']
    return last_history

In [5]:
#Inspect history
# Fetch the prompts of the most recent LM call and print each part under a banner.
history = get_history(lm, 1)
for role in history:
    print('----', role.upper(), '----')
    print(history[role])

---- SYSTEM ----
Your input fields are:
1. `sentence` (str)

Your output fields are:
1. `sentiment` (typing.Literal[very funny, so love])

All interactions will be structured in the following way, with the appropriate values filled in.

[[ ## sentence ## ]]
{sentence}

[[ ## sentiment ## ]]
{sentiment}        # note: the value you produce must be one of: very funny; so love

[[ ## completed ## ]]

In adhering to this structure, your objective is: 
        Classify emotion.
---- USER ----
[[ ## sentence ## ]]
i love you 

Respond with the corresponding output fields, starting with the field `[[ ## sentiment ## ]]` (must be formatted as a valid Python typing.Literal[very funny, so love]), and then ending with the marker for `[[ ## completed ## ]]`.


In [6]:
#Define Signature
# Signature for the forest-health task: one text input describing a site's
# measurements, one output restricted to four health labels.
# As with the Emotion example, the docstring below is presumably sent to the
# model as the task instruction, so it is prompt text rather than a comment.
# NOTE(review): the instructions mention a "column description", but the only
# input declared here is `parameters` — confirm whether a description field
# was intended.
class ForestHealth(dspy.Signature):
    """Classify the health status of the forest, based on the given parameters. Also use column description for reference.
       Output should be in lowercase."""
    # Formatted measurement text for one row; no type annotation is given here.
    parameters = dspy.InputField(desc="comprehensive collection of ecological and environmental measurements focused on tree characteristics and site conditions.")
    # Allowed labels are lowercase, matching the "Output should be in lowercase" instruction.
    answer: Literal['very healthy', 'healthy', 'sub-healthy', 'unhealthy'] = dspy.OutputField(desc="labels matching the health status of forest.")

In [7]:
#Define Module
class Classification(dspy.Module):
    """DSPy module that predicts forest health with a chain-of-thought predictor."""

    def __init__(self):
        # Run dspy.Module's own initialisation before attaching sub-modules;
        # the original skipped this, leaving base-class state unset.
        super().__init__()
        self.cot = dspy.ChainOfThought(ForestHealth)

    def forward(self, parameters):
        """Classify one formatted measurement string.

        Args:
            parameters: Text passed through to the `parameters` input field of
                the ForestHealth signature.

        Returns:
            Whatever the chain-of-thought predictor returns for this input.
        """
        return self.cot(parameters=parameters)

In [8]:
#Process dataframe
def _format_measurement(val):
    """Round float measurements to the nearest integer; pass everything else through."""
    if not isinstance(val, float):
        return val
    try:
        return round(val)
    except (ValueError, OverflowError):
        # round() cannot convert NaN / +-inf to an int; keep the raw value.
        return val


def proess_df(df, input_columns, output_column):
    """Turn a raw measurement frame into prompt text plus label.

    Each row becomes one string of the form ``"<col> is <value>, ..."`` built
    from ``input_columns`` (floats rounded to the nearest integer), and the
    label column is copied alongside it. All text is lowercased.

    Args:
        df: Source DataFrame holding the feature columns and the label column.
        input_columns: Column names to render, in the order they should appear.
        output_column: Label column, given either as a column name or as a
            one-element list of names.

    Returns:
        A new DataFrame with a default integer index and two columns:
        ``parameters`` (formatted text) and ``health_status`` (label).

    Raises:
        KeyError: If a requested column is missing from ``df``.
        ValueError: If ``output_column`` selects more than one column.
    """
    formatted_rows = []
    for _, row in df.iterrows():
        # Look values up by column name. Pairing the names positionally with
        # the row would mislabel values whenever the frame's column order
        # differs from input_columns (e.g. the label column comes first).
        text = ', '.join(
            f"{col} is {_format_measurement(row[col])}" for col in input_columns
        )
        formatted_rows.append(text.lower())

    labels = df[output_column]
    if isinstance(labels, pd.DataFrame):
        if labels.shape[1] != 1:
            raise ValueError("output_column must select exactly one column")
        labels = labels.iloc[:, 0]

    # Labels are copied by position, not aligned by index, so a frame with a
    # non-default index does not end up with NaN labels.
    return pd.DataFrame({
        'parameters': formatted_rows,
        'health_status': [
            label.lower() if isinstance(label, str) else label for label in labels
        ],
    })


# Correctly spelled alias; the misspelled name is kept for existing callers.
process_df = proess_df

In [9]:
#load dataset
# Raw splits as stored on disk.
test_df = pd.read_csv('dataset/test.csv')
train_df = pd.read_csv('dataset/train.csv')

# Feature columns rendered into the prompt text, and the label column.
input_columns = [
    'latitude', 'longitude', 'diameter_at_breast_height', 'tree_height',
    'crown_width_north_south', 'crown_width_east_west', 'slope', 'elevation',
    'temperature', 'humidity', 'soil_total_nitrogen', 'soil_total_phosphorus',
    'soil_available_phosphorus', 'soil_available_nitrogen', 'menhinick_index',
    'gleason_index', 'disturbance_level', 'fire_risk_index',
]
output_column = ['health_status']

processed_test_df = proess_df(test_df, input_columns, output_column)
processed_train_df = proess_df(train_df, input_columns, output_column)


def _to_examples(frame):
    """Wrap every row of a processed frame in a dspy.Example with `parameters` as its input."""
    return [
        dspy.Example(parameters=params, answer=label).with_inputs("parameters")
        for params, label in zip(frame['parameters'], frame['health_status'])
    ]


test_set = _to_examples(processed_test_df)
train_set = _to_examples(processed_train_df)

# Quick sanity check on split sizes and example fields.
print('Length of train set: ', len(train_set))
print('Train set columns: ', list(train_set[0].keys()))
print('Length of test set: ', len(test_set))
print('Test set columns: ', list(test_set[0].keys()))

Length of train set:  600
Train set columns:  ['parameters', 'answer']
Length of test set:  200
Test set columns:  ['parameters', 'answer']


In [10]:
#Define classifier 
# Unoptimised baseline program; it is evaluated first and later handed to the optimiser.
classifier = Classification()

In [11]:
#Single prediction
# Run the baseline on one test row (index label 25) to eyeball the prediction it returns.
output = classifier(parameters=processed_test_df.loc[25, 'parameters'])
print(output)

Prediction(
    reasoning='the forest is classified as sub-healthy due to the presence of a disturbance level of 2, indicating some level of stress or damage to the forest ecosystem. However, the menhinick index is 0, suggesting no visible signs of disease or insect infestation. The gleason index is 1, indicating a relatively low level of competition among trees. The soil parameters are relatively balanced, with adequate nitrogen and phosphorus levels. The temperature and humidity are within normal ranges. The slope is moderate, but not extreme. Overall, the forest appears to be experiencing some stress, but it is not severely impacted.',
    answer='sub-healthy'
)


In [12]:
#Define evaluator and evaluate program
# Reusable evaluator: scores a program on test_set with the exact-match metric,
# using 8 threads, a progress bar and tracebacks enabled.
evaluate_program = Evaluate(devset=test_set, metric=answer_exact_match, 
                            num_threads=8, display_progress=True, 
                            provide_traceback=True)
# Baseline score of the unoptimised classifier.
eval_result=evaluate_program(classifier)

Average Metric: 32.00 / 200 (16.0%): 100%|███████████████████| 200/200 [06:40<00:00,  2.00s/it]

2024/12/24 08:52:58 INFO dspy.evaluate.evaluate: Average Metric: 32 / 200 (16.0%)





In [1]:
# Define optimiser (adding few-shot samples)
# Optimiser configured with the exact-match metric and at most 5 labelled demos.
teleprompter_fsrs = BootstrapFewShotWithRandomSearch(
    metric=answer_exact_match,
    max_labeled_demos=5,
)

# Compile the baseline classifier against the training examples.
optimised_classifier = teleprompter_fsrs.compile(classifier, trainset=train_set)

In [None]:
#Evaluate optimised program
# Score the optimised program with the same evaluator (same test set and metric) as the baseline.
new_eval_result=evaluate_program(optimised_classifier)

In [None]:
#Save
save_path = 'forest_health_classification.json'
# The compiled program is bound to `optimised_classifier` (British spelling) in
# the optimiser cell; the previous `optimized_classifier` raised a NameError.
optimised_classifier.save(save_path)