# Notebook to demonstrate Zero shot and Few shot Learning

In [1]:
import pandas as pd 
from langchain_groq.chat_models import ChatGroq
from dotenv import load_dotenv
import os
from typing import Union
import sys
sys.path.append(os.path.abspath('..'))
from GenerateDataset import generate_dataset
import numpy as np
import re
import asyncio

In [7]:
activity_classes = ["LAYING", "SITTING", "STANDING", "WALKING", "WALKING_DOWNSTAIRS", "WALKING_UPSTAIRS"]

def get_sample_data(path, files_per_activity=-1):
    sample_data = []
    for activity_class in activity_classes:
        subpath = os.path.join(os.path.abspath('..'), path, activity_class)
        files = os.listdir(subpath)
        num_files = len(files) if files_per_activity == -1 else files_per_activity
        for file in range(num_files):
            sample_data.append(os.path.join(subpath, files[file]))
    return sample_data

def load_csv_files(directory, label, add_timestamp=True, add_total_acc=True, trim_data=True):
    df_list = []
    for filename in os.listdir(directory):
        if filename.endswith('.csv'):
            file_path = os.path.join(directory, filename)
            df = pd.read_csv(file_path)
            df['subject'] = filename.replace('.csv', '')
            # 50 samples per second
            # 1 sample takes 1/50 seconds
            if add_timestamp:
                df['timestamp'] = (df.index + 1) / 50
            if add_total_acc:
                df['total_acc'] = df['accx'] ** 2 + df['accy'] ** 2 + df['accz'] ** 2
            if trim_data:
                df_list.append(df.iloc[100:600, :])
            else:
                df_list.append(df)
    combined_df = pd.concat(df_list, ignore_index=True)
    combined_df['y'] = label
    return combined_df

def prepare_dataset(path):
    train_path = os.path.join(path, "Train")
    test_path = os.path.join(path, "Test")
    
    train_dfs = pd.concat([load_csv_files(os.path.join(train_path, activity_class), label) for label, activity_class in enumerate(activity_classes)])
    test_dfs = pd.concat([load_csv_files(os.path.join(test_path, activity_class), label) for label, activity_class in enumerate(activity_classes)])
    
    return train_dfs, test_dfs

def get_tsfel_features(tsfel_features_path):   
    data = pd.DataFrame()
    for label, activity_class in enumerate(activity_classes):
        folder_path = os.path.join(tsfel_features_path, activity_class)
        dfs = load_csv_files(folder_path, label, False, False, False)
        data = pd.concat([data, dfs], axis=0, ignore_index=True)
    return data

In [18]:
sample_data = get_sample_data(os.path.join("Datasets", 'Combined', 'Train'))
sample_data.extend(get_sample_data(os.path.join("Datasets", 'Combined', 'Test')))

In [19]:
class Example:

    def __init__(self, data: Union[pd.DataFrame, None], explanation: Union[str, None], classification: Union[str, None], modify=False, precision=None):
        self.data = self.modify_data(data, precision) if modify else data
        self.explanation = explanation
        self.classification = classification
        self.text = re.sub(r'[\[\]]', '', np.array2string(self.data.to_numpy(), separator=',')).replace(' ', '')

    @staticmethod
    def modify_data(data: Union[pd.DataFrame, None], precision=None):
        data = data.round(precision if precision is not None else 0) if data is not None else data
        return data

In [20]:
dfs = [pd.read_csv(sample_data[i]).iloc[100:600, :] for i in range(len(sample_data))]
titles = [sample.split('\\')[-2].title().replace('_', ' ') for sample in sample_data]

In [21]:
load_dotenv() # load environment variables
Groq_Token = os.environ['API_KEY']

groq_models = {"llama3-70b": "llama3-70b-8192", "mixtral": "mixtral-8x7b-32768", "gemma-7b": "gemma-7b-it","llama3.1-70b":"llama-3.1-70b-versatile","llama3-8b":"llama3-8b-8192","llama3.1-8b":"llama-3.1-8b-instant","gemma-9b":"gemma2-9b-it"}

**NOTE : DO NOT SHARE THE API KEY WITH ANYONE. DO NOT COMMIT THE API KEY TO GITHUB.**

Always do a sanity check before committing the code to github. If the key is found in the code, you will be penalized with a 0.5 marks deduction.

In [24]:
len(sample_data)

180

# Zero Shot 

In [22]:
examples = []

for i, df in enumerate(dfs):
    examples.append(Example(df, None, titles[i], True, 3))

query = """
You are given the x, y and z values of acceleration of a person. You need to classify the activities into one of the following classes: Laying, Sitting, Standing, Walking, Walking downstairs, Walking Upstairs.
Analyze the data carefully. Of course, in stationary activities like laying, sitting, and standing, there is not much change in their values. However, for dynamic activities like walking, some of their values change a lot.
Think in the following process:
- First determine whether it is a stationary activity or a dynamic activity. You can do this by observing if the change in each coordinate is minimal or a lot.
- If the activity is stationary: for sitting, the y and z components are nearly identical. For laying, the x component is lower than the y and z components, and for standing, the x component is higher than the y and z components. 
- If the activity is dynamic (walking): determine the activity based on observation.
Each activity consists of 100 rows equivalent to 2 seconds of that activity. Output your best guess without any explanation. Your response for each activity will therefore consist of at most two words.
"""

# for i, example in enumerate(examples):
#     query += f"\n\n## Activity {i + 1}\n\n{example.text}"

model_name = "llama3-70b"
llm = ChatGroq(model=groq_models[model_name], api_key=Groq_Token, temperature=0)
print(llm.invoke(query).content)

total = 0
correct = 0

for i, example in enumerate(examples):
    resposne = llm.invoke(f'{query}\n\n{example.text}').content
    print(resposne)
    print(example.classification)
    if resposne.lower() == example.classification.lower():
        correct += 1
    total += 1

accuracy = correct / total
accuracy *= 100

print(f'Accuracy is {accuracy:.2f}%')

I'm ready to classify the activities. Please provide the x, y, and z values of acceleration for each activity, and I'll respond with my best guess for each activity.
Sitting
Laying
Standing
Laying
Standing
Laying
Sitting
Laying
Standing
Laying
Standing
Laying
Sitting
Laying
Standing
Laying
Sitting
Laying
Standing
Laying
Standing
Laying
Sitting
Laying
Sitting
Laying
Laying
Laying
Sitting
Laying
Sitting
Laying
Standing
Laying
Standing
Laying
Sitting
Laying
Standing
Laying
Standing
Laying
Sitting
Sitting
Standing
Sitting
Standing
Sitting
Standing
Sitting
Standing
Sitting
Standing
Sitting
Standing
Sitting
Standing
Sitting
Standing
Sitting
Standing
Sitting
Sitting
Sitting
Sitting
Sitting
Standing
Sitting
Standing
Sitting
Standing
Sitting
Sitting
Sitting
Standing
Sitting
Standing
Sitting
Standing
Sitting
Standing
Sitting
Standing
Sitting
Standing
Standing
Standing
Standing
Standing
Standing
Standing
Standing
Standing
Standing
Standing
Standing
Standing
Standing
Standing
Standing
Standing
Sta

In [9]:
examples = []

for i, df in enumerate(dfs):
    examples.append(Example(df.iloc[100:400], None, titles[i], True, None))

query = """
You are given the x, y and z values of acceleration of a person. You need to classify the activity into one of the following classes: Laying, Sitting, Standing, Walking, Walking Downstairs, Walking Upstairs.
Analyze the data carefully. Of course, in stationary activities like laying, sitting, and standing, there is not much change in their values. However, for dynamic activities like walking, some of their values change a lot.

Think in the following process:
- First determine whether it is a stationary activity or a dynamic activity. You can do this by observing if the change in each coordinate is minimal (if the maximum value minus the minimal value is less than 0.1 for that component, then it is very little, it means it is most probable that there was no movement involved) or a lot.
- If the activity is stationary: for sitting, the y and z components are nearly identical (if the maximum value minus the minimal value is less than 0.1 for each of these components, then they are identical). For laying, the maximum x component is lower than the maximum y component by at least 0.1. For standing, the maximum x component is greater than the maximum y component by at least 0.1
- If the activity is dynamic (walking): determine the activity based on observation.

The activity consists of 300 rows equivalent to 6 seconds of that activity. Output your best guess without any explanation. Your response the activity will therefore consist of at most two words.

{text}
"""

model_name = "gemma-9b"
llm = ChatGroq(model=groq_models[model_name], api_key=Groq_Token, temperature=0)

correct = 0
total = 0

for example in examples:
    response = llm.invoke(query.format(text=example.text)).content.strip()
    print(response, "|" ,example.classification)
    if response == example.classification:
        correct += 1
    total += 1

accuracy = correct / total
accuracy *= 100
print(f'Accuracy: {accuracy:.2f}%')

Laying | Laying
Laying | Laying
Laying | Laying
Laying | Laying
Standing | Laying
Laying | Laying
Sitting | Laying
Laying | Laying
Laying | Laying
Standing | Laying
Standing | Laying
Sitting | Laying
Sitting | Laying
Laying | Laying
Laying | Laying
Laying | Laying
Laying | Laying
Sitting | Laying
Laying | Laying
Laying | Laying
Sitting | Laying
Laying | Sitting
Laying | Sitting
Laying | Sitting
Laying | Sitting
Laying | Sitting
Laying | Sitting
Standing | Sitting
Laying | Sitting
Laying | Sitting
Laying | Sitting
Standing | Sitting
Laying | Sitting
Laying | Sitting
Laying | Sitting
Laying | Sitting
Standing | Sitting
Laying | Sitting
Laying | Sitting
Laying | Sitting
Sitting | Sitting
Standing | Sitting
Laying | Standing
Laying | Standing
Laying | Standing
Laying | Standing
Laying | Standing
Laying | Standing
Laying | Standing
Laying | Standing
Laying | Standing
Laying | Standing
Laying | Standing
Laying | Standing
Laying | Standing
Laying | Standing
Laying | Standing
Laying | Standing

# Few Shot

To pass examples in the prompt, we need a high context window. For this task, we will use mixtral (mixtral-8x7b-32768). For each activity, if we were to take 1 subject's data with 500 time stamps, the 

In [18]:
# Statement 
sentence = "The product quality is amazing but the delivery was delayed. However I am happy with the customer service."

# System Prompts 
query = f"""
* You are a sentiment analysis model. 
* Your task is to analyze the sentiment expressed in the given text and classify it as 'positive', 'negative', or 'neutral'. 
* Provide the sentiment label and, if necessary, a brief explanation of your reasoning.

Here are few examples:
1. Sentence: 'The customer service was excellent, and I received my order quickly.'
Sentiment: Positive

2. Sentence: 'The food was bland and the service was slow.'
Sentiment: Negative

3. Sentence: 'The product is okay, but it's not worth the price.'
Sentiment: Neutral

Sentence: {sentence}
""" 

# To use Groq LLMs 
model_name = "llama3-70b" # We can choose any model from the groq_models dictionary
llm = ChatGroq(model=groq_models[model_name], api_key=Groq_Token, temperature=0)
answer = llm.invoke(query)

print(answer.content)

Sentiment: Positive

Explanation: Although the sentence mentions a negative aspect ("the delivery was delayed"), the positive sentiments expressed in the sentence ("The product quality is amazing" and "I am happy with the customer service") outweigh the negative one, resulting in an overall positive sentiment.
