In [1]:
import pandas as pd
import cohere
from sklearn.model_selection import train_test_split

import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [3]:
import sys
sys.path.insert(0,'../')
import config

In [4]:
import cohere
co = cohere.Client(config.api_key)

In [5]:
df = pd.read_csv('../data/news.csv')
df.shape

(10, 9)

In [6]:
df['Analyst_Rank'] = df['Analyst_Rank'].apply(lambda x: 0 if x<4 else 1)
df['Analyst_Rank'].value_counts()

1    7
0    3
Name: Analyst_Rank, dtype: int64

In [7]:
# Split the dataset into training and test portions
# Training = training the LLM
# Test = For evaluating the classifier performance
X, y = df["Title"], df["Analyst_Rank"].astype(str)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.10, random_state=21)

In [8]:
# View the list of all available categories
intents = y_train.unique().tolist()
print(intents)

['1', '0']


### Few-shot classification

#### sample data

In [9]:
# Set the number of examples per category
EX_PER_CAT = 6

# Create list of examples containing texts and labels - sample from the dataset
ex_texts, ex_labels = [], []
for intent in intents:
  y_temp = y_train[y_train == intent]
  sample_indexes = y_temp.index#.sample(n=EX_PER_CAT, random_state=42).index
  ex_texts += X_train[sample_indexes].tolist()
  ex_labels += y_train[sample_indexes].tolist()

# print(f'Number of examples per class: {EX_PER_CAT}')
print(f'Number of classes: {len(intents)}')
print(f'Total number of examples: {len(ex_texts)}')

Number of classes: 2
Total number of examples: 9


#### classification using cohene classify endpoint

In [10]:
# Collate the examples via the Example module
from cohere.classify import Example

examples = list()
for txt, lbl in zip(X_train,y_train):
  examples.append(Example(txt,lbl))

In [11]:
def classify_text(text, examples):
  classifications = co.classify(
      model='medium',  # model version - medium-22020720
      inputs=[text],
      examples=examples
  )
  return classifications.classifications[0].prediction

In [12]:
examples

[cohere.Example {
 	text: Marché Résines dans les peintures et revêtements 2021 avec les données des meilleurs pays et l analyse Covid 19  la portée future  l estimation de la taille  les revenus  les tendances des prix et les prévisions d ici 2026
 	label: 1
 },
 cohere.Example {
 	text: The construction sector is expected to be boosted by riots and looting repairs
 	label: 0
 },
 cohere.Example {
 	text: AI drives data analytics surge  study finds
 	label: 1
 },
 cohere.Example {
 	text: Male arrested for the murder of an elderly female in Cofimvaba   SAPS Crime Report  2021 09 09 13 22 58
 	label: 0
 },
 cohere.Example {
 	text: 7th Anniversary of SCOAN Collapse in Nigeria SABC News
 	label: 1
 },
 cohere.Example {
 	text: Boris Johnson using a taxpayer funded jet for an election campaign fits a long history of taking things he didn t pay for
 	label: 1
 },
 cohere.Example {
 	text: Triacetin Vertrieb Markt 2021  Globale Unternehmensanalyse  Merkmale  Marktplatzlänge und  prognosen 

In [13]:
# Generate classification predictions on the test dataset (this will take a few minutes)
y_pred = X_test.apply(classify_text, args=(examples,)).tolist()

In [14]:
# Compute metrics on the test dataset
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print(f'Accuracy: {100*accuracy:.2f}')
print(f'F1-score: {100*f1:.2f}')

Accuracy: 100.00
F1-score: 100.00


#### embedding using the embed endpoint from cohere

In [15]:
# Embed the training set
embeddings_train = co.embed(texts=X_train.tolist(),
                            model="large",
                            truncate="LEFT").embeddings
# Embed the testing set
embeddings_test = co.embed(texts=X_test.tolist(),
                           model="large",
                           truncate="LEFT").embeddings

In [16]:
print(f"Review text: {X_train[0]}")
print(f"Embedding vector: {embeddings_train[0][:10]}")

Review text: Boris Johnson using a taxpayer funded jet for an election campaign fits a long history of taking things he didn t pay for
Embedding vector: [1.5737689, 1.0490721, -1.079248, 0.16202086, 0.72694844, -0.6338423, 2.262563, -0.5368688, -0.46243724, 1.3330556]


In [18]:
# Collate the examples via the Example module
from cohere.classify import Example

examples = list()
for txt, lbl in zip(X_train, y_train):
  examples.append(Example(txt,lbl))