Because our dataset is small, we will try to classify the documents with existing LLMs as they are, without fine-tuning them.

# Libraries and utilities

In [17]:
import CBTTextProcessing as cbt_text
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [2]:
from ipywidgets import IntProgress
from IPython.display import display
from langchain_community.llms import Ollama
from transformers import pipeline

In [18]:
def get_highest_score_and_label(scores, labels):
    """Pick the best-scoring entry from two parallel sequences.

    Parameters
    ----------
    scores : sequence of numbers
        One score per candidate label.
    labels : sequence
        Labels aligned position-by-position with ``scores``.

    Returns
    -------
    tuple
        ``(score, label)`` taken at the position of the largest score, as
        located by ``np.argmax`` (the first position wins on ties).
    """
    best_position = np.argmax(scores)
    top_score = scores[best_position]
    top_label = labels[best_position]
    return top_score, top_label

# Getting data

In [10]:
# Number of document ids to load (ids 0 .. N_DOCUMENTS - 1).
N_DOCUMENTS = 1010

documents = []  # preprocessed "about + readme" text, one entry per document id
headers = []    # header of each document, at the same index as in `documents`

progress = IntProgress(min=0, max=N_DOCUMENTS)
display(progress)

# The loop variable is `doc_id` rather than `id` so that the builtin `id()`
# is not shadowed for the rest of the kernel session.
for doc_id in range(N_DOCUMENTS):
    readme_file_name = cbt_text.id_to_file_name(doc_id, 'readme')
    about_file_name = cbt_text.id_to_file_name(doc_id, 'about')
    # Both files return a header; only the one read from the "about" file is
    # used below, so the readme's header gets its own (unused) name instead of
    # being silently overwritten.
    _readme_header, readme = cbt_text.get_text_from_file(readme_file_name)
    header, about = cbt_text.get_text_from_file(about_file_name)
    content = 'about: ' + about + ' readme: ' + readme
    content = cbt_text.preprocess_text(header, content)
    headers.append(header)
    documents.append(content)
    progress.value += 1

IntProgress(value=0, max=1010)

# Classification

In [15]:
# Categories the classifier chooses between for every document.
candidate_labels = [
    "tool",
    "article",
    "guide",
    "other",
]

In [28]:
# One result bucket per candidate label; each one starts as its own empty list.
tools, other, articles, guides = [], [], [], []

## Classification using `Bart`

In [12]:
# Zero-shot classification pipeline using the facebook/bart-large-mnli
# checkpoint; the revision is pinned so that later runs load the same weights.
classifier = pipeline(
    task="zero-shot-classification",
    model="facebook/bart-large-mnli",
    revision="c626438",
)

In [30]:
# Number of documents classified in this trial run (the first N_SAMPLES ones).
N_SAMPLES = 5

# Rebuild the result buckets in this cell so that re-running it starts from
# empty lists instead of appending duplicates to the previous run's results.
tools, other, articles, guides = [], [], [], []
buckets = {
    'tool': tools,
    'other': other,
    'article': articles,
    'guide': guides,
}

progress = IntProgress(min=0, max=N_SAMPLES)
display(progress)

for i in range(N_SAMPLES):
    result = classifier(
        documents[i],
        candidate_labels=candidate_labels
    )
    score, label = get_highest_score_and_label(result['scores'], result['labels'])
    # Named `record` rather than `object` so the builtin is not shadowed.
    record = {'id': i, 'label': label, 'score': score}

    # Explicit lookup: a label without a bucket raises KeyError instead of
    # being silently filed under `guides`.
    buckets[label].append(record)

    progress.value += 1

IntProgress(value=0, max=5)

In [31]:
# Show the records (id, label, score) collected in the `tools` list.
print(tools)

[{'id': 3, 'label': 'tool', 'score': 0.4068570137023926}]


In [32]:
# Show the records (id, label, score) collected in the `articles` list.
print(articles)

[{'id': 2, 'label': 'article', 'score': 0.7431250214576721}, {'id': 4, 'label': 'article', 'score': 0.36776354908943176}]


In [33]:
# Show the records (id, label, score) collected in the `guides` list.
print(guides)

[]


In [34]:
# Show the records (id, label, score) collected in the `other` list.
print(other)

[{'id': 0, 'label': 'other', 'score': 0.393204003572464}, {'id': 1, 'label': 'other', 'score': 0.37961238622665405}]


In [35]:
# Print the header of every document whose record is in `tools`.
print("Tools: ")
for entry in tools:
    doc_index = entry['id']
    print(headers[doc_index])

Tools: 
CBT010


In [36]:
# Print the header of every document whose record is in `articles`.
print("Articles: ")
for entry in articles:
    doc_index = entry['id']
    print(headers[doc_index])

Articles: 
CBT009
CBT011


In [37]:
# Print the header of every document whose record is in `guides`.
print("Guides: ")
for entry in guides:
    doc_index = entry["id"]
    print(headers[doc_index])

Guides: 


In [38]:
# Print the header of every document whose record is in `other`.
print("Other: ")
for entry in other:
    doc_index = entry["id"]
    print(headers[doc_index])

Other: 
CBT006
CBT008
