In [11]:
import os
import json
import numpy as np
import pandas as pd
import cohere
from cohere import ClassifyExample
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [2]:
# Never commit API keys in a notebook: read the key from the environment.
# CO_API_KEY is the variable the Cohere SDK itself looks up by default.
# NOTE(review): the key that was previously hard-coded here should be revoked.
co = cohere.ClientV2(api_key=os.environ["CO_API_KEY"])

In [3]:
# Few-shot training examples for the Classify endpoint: five texts per label,
# listed in the order positive -> negative -> neutral.
examples = [
    ClassifyExample(text=sample_text, label=sample_label)
    for sample_label, sample_texts in (
        ("positive", (
            "I’m so proud of you",
            "What a great time to be alive",
            "That’s awesome work",
            "The service was amazing",
            "I love my family",
        )),
        ("negative", (
            "They don't care about me",
            "I hate this place",
            "The most ridiculous thing I've ever heard",
            "I am really frustrated",
            "This is so unfair",
        )),
        ("neutral", (
            "This made me think",
            "The good old days",
            "What's the difference",
            "You can't ignore this",
            "That's how I see it",
        )),
    )
    for sample_text in sample_texts
]

In [4]:
# Texts to classify with the few-shot examples.
inputs = [
    # first four
    "Hello, world! What a beautiful day",
    "It was a great time with great people",
    "Great place to work",
    "That was a wonderful evening",
    # middle four
    "Maybe this is why",
    "Let's start again",
    "That's how I see it",
    "These are all facts",
    # last four
    "This is the worst thing",
    "I cannot stand this any longer",
    "This is really annoying",
    "I am just plain fed up",
]

In [5]:
def classify_text(inputs, examples, model='embed-english-v3.0'):
    """
    Classifies a list of input texts given the examples.

    Uses the module-level Cohere client `co` to call the Classify endpoint.

    Arguments:
        inputs (list[str]): a list of input texts to be classified
        examples (list[ClassifyExample]): a list of example texts and class labels
        model (str): identifier of the model; defaults to 'embed-english-v3.0'
    Returns:
        classifications (list): the `classifications` of the API response; each
            item exposes, among others, `.input`, `.prediction` and `.confidence`.
            An empty list is returned when `inputs` is empty.
    """
    # Nothing to classify: skip the API round-trip entirely
    if len(inputs) == 0:
        return []

    # Classify text by calling the Classify endpoint
    response = co.classify(
        model=model,
        inputs=inputs,
        examples=examples)

    return response.classifications

# Run the few-shot classifier over the sample inputs
predictions = classify_text(inputs=inputs, examples=examples)

In [8]:
# Show the raw result objects returned by classify_text
predictions

[ClassifyResponseClassificationsItem(id='79c72411-d52a-43b2-b823-8f7539d11081', input='hello, world! what a beautiful day', prediction='positive', predictions=['positive'], confidence=0.40137812, confidences=[0.40137812], labels={'negative': ClassifyResponseClassificationsItemLabelsValue(confidence=0.23582731), 'neutral': ClassifyResponseClassificationsItemLabelsValue(confidence=0.36279458), 'positive': ClassifyResponseClassificationsItemLabelsValue(confidence=0.40137812)}, classification_type='single-label'),
 ClassifyResponseClassificationsItem(id='a699b539-f8d9-40bf-859b-3f3fbb3a1d8c', input='it was a great time with great people', prediction='positive', predictions=['positive'], confidence=0.49054274, confidences=[0.49054274], labels={'negative': ClassifyResponseClassificationsItemLabelsValue(confidence=0.19989403), 'neutral': ClassifyResponseClassificationsItemLabelsValue(confidence=0.30956325), 'positive': ClassifyResponseClassificationsItemLabelsValue(confidence=0.49054274)}, cl

In [10]:
# Print input text, predicted label and confidence for every result,
# one field per line, followed by a separator line.
for classification in predictions:
    print(
        classification.input,
        classification.prediction,
        classification.confidence,
        "--------",
        sep="\n",
    )

hello, world! what a beautiful day
positive
0.40137812
--------
it was a great time with great people
positive
0.49054274
--------
great place to work
positive
0.50350463
--------
that was a wonderful evening
positive
0.48206237
--------
maybe this is why
neutral
0.45420286
--------
let ' s start again
neutral
0.41799143
--------
that ' s how i see it
neutral
0.5275604
--------
these are all facts
neutral
0.41150308
--------
this is the worst thing
negative
0.5155172
--------
i cannot stand this any longer
negative
0.51797503
--------
this is really annoying
negative
0.5610805
--------
i am just plain fed up
negative
0.5657494
--------


### Fine-tuning the classification model using a custom dataset

In [12]:
# Read the ATIS subset CSV from the Cohere notebooks repository into a dataframe,
# naming the columns 'query' and 'intent'
df = pd.read_csv(
    'https://raw.githubusercontent.com/cohere-ai/notebooks/main/notebooks/data/atis_subset.csv',
    names=['query', 'intent'],
)

In [13]:
# Preview the loaded dataframe
df

Unnamed: 0,query,intent
0,i want to fly from boston at 838 am and arriv...,atis_flight
1,what flights are available from pittsburgh to...,atis_flight
2,what is the arrival time in san francisco for...,atis_flight_time
3,cheapest airfare from tacoma to orlando,atis_airfare
4,round trip fares from pittsburgh to philadelp...,atis_airfare
...,...,...
995,flights from baltimore to dallas,atis_flight
996,what do you have tomorrow morning from pittsb...,atis_flight
997,i would like some information on the earliest...,atis_flight
998,what flights are there from cleveland to miam...,atis_flight


In [15]:
# Hold out 200 rows as the test portion; the fixed random_state keeps the
# split reproducible across runs
df_train, df_test = train_test_split(
    df,
    test_size=200,
    random_state=21,
)

In [17]:
def create_classification_data(text, label):
    """Return one training record as a dict with the keys "text" and "label"."""
    return dict(text=text, label=label)

# Write the training split to data.jsonl, one JSON object per line.
# An existing data.jsonl is reused as-is, so delete it to regenerate the file
# after changing the dataset or the split.
if not os.path.isfile("data.jsonl"):
    print("Creating jsonl file ...")
    # Write-only mode is enough; the with-block closes the file on exit,
    # so no explicit close() is needed.
    with open("data.jsonl", 'w', encoding='utf-8') as file:
        for row in df_train.itertuples():
            formatted_data = create_classification_data(row.query, row.intent)
            file.write(json.dumps(formatted_data) + '\n')
    # Report completion only after the file has been flushed and closed
    print("Done")
else:
    print("data.jsonl file already exists")


Creating jsonl file ...
Done


#### Navigate to the Cohere dashboard and upload the JSONL data to fine-tune a classifier model: https://dashboard.cohere.com/fine-tuning

In [None]:
# Generate classification predictions on the test dataset using the finetuned model

# Classification function
def classify_text_finetune(texts, model='b2c94ac3-7a74-4de7-a11a-9808a3b8ef59-ft'):
    """
    Classify a batch of texts with a fine-tuned classification model.

    Uses the module-level Cohere client `co` to call the Classify endpoint.

    Arguments:
        texts (list[str]): the texts to classify in one request
        model (str): identifier of the fine-tuned model; defaults to the model
            this notebook was written against
    Returns:
        list: for each returned classification, the first entry of its
            `predictions`; an empty list when `texts` is empty
    """
    # Nothing to classify: skip the API round-trip entirely
    if len(texts) == 0:
        return []

    classifications = co.classify(
        model=model,
        inputs=texts,
        examples=None  # no few-shot examples are passed to the fine-tuned model
    ).classifications
    return [c.predictions[0] for c in classifications]

# Create batches of texts and classify them
BATCH_SIZE = 90 # The API accepts a maximum of 96 inputs
y_pred = []
for start in range(0, len(df_test), BATCH_SIZE):
    # df_test keeps the shuffled integer index from the split, so slice by
    # position explicitly with .iloc rather than relying on how Series[...]
    # interprets an integer slice.
    batch_texts = df_test["query"].iloc[start:start + BATCH_SIZE].tolist()
    y_pred.extend(classify_text_finetune(batch_texts))

In [None]:
# Score the fine-tuned model's predictions against the true intents of the test split
accuracy = accuracy_score(y_true=df_test["intent"], y_pred=y_pred)
f1 = f1_score(y_true=df_test["intent"], y_pred=y_pred, average='weighted')

# Report both metrics as percentages with two decimals
print(
    f'Accuracy: {100*accuracy:.2f}',
    f'F1-score: {100*f1:.2f}',
    sep='\n',
)
