# Text Classification Exercise - Topic Detection

In [None]:
# Candidate topics the classifier may choose from.
labels = ["Technology", "Sports", "Language", "Food"]

# Each row is [Topic, Text to classify].
# Exercise: replace every empty string below with the ground-truth topic
# (one of the values in `labels`) that you think the statement belongs to.
dataset = [
    ["", "Technology shapes our lives, from smartphones to algorithms. It drives innovation and connects us in ways we couldn't imagine"],
    ["", "AI ethics is a critical consideration in developing responsible algorithms."],
    ["", "Language is the expression of ideas through speech-sounds and words."],
    ["", "Words are combined into sentences, answering to ideas into thoughts."],
    ["", "Content moderation on social media platforms detect and filter out inappropriate language and harmful content to maintain a respectful and safe online environment"],
    ["", "Speech-to-text software has become crucial for accessibility, allowing users to transcribe spoken language into written text efficiently"],
    ["", "Language is a dynamic system of communication that evolves over time, reflecting cultural, social, and historical changes in society."],
    ["", "Golden State Warriors seek a second star alongside Stephen Curry."],
    ["", "San Francisco 49ers maintain a successful offensive strategy."],
    ["", "In the case of food establishments, like most sports, the first line of defense are the players in the game, which are the industry that produces the products."],
    ["", "After a thrilling soccer match, fans celebrate with stadium hot dogs and cold beverages."],
    ["", "Athletes know that proper nutrition is as crucial as their training regimen."],
    ["", "In the culinary Olympics, the gold medal goes to the chef who masters flavor balance."],
    ["", "Basketball players fuel up with protein-packed meals before hitting the court."],
    ["", "The marathon of cooking competitions leaves chefs both exhausted and exhilarated."],
]

import pandas as pd
# Tabulate the [Topic, Text] rows so later cells can address them by column name.
column_names = ['Topic', 'Text']
df = pd.DataFrame(data=dataset, columns=column_names)
display(df)

Run the topic detection test with a prompt that labels each text with one of your topics.

In [None]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import ChatOpenAI

# Instruction text used for every passage; `{input}` is the placeholder that
# receives the text to classify when the chain is invoked.
tagging_template = """
Extract the desired information from the following passage.

Only extract the properties mentioned in the 'Classification' function.

Passage:
{input}
"""

tagging_prompt = ChatPromptTemplate.from_template(tagging_template)

class Classification(BaseModel):
    # Schema for the structured answer: a single string field holding the chosen topic.
    # NOTE(review): documented with `#` comments rather than a class docstring, because a
    # docstring on a schema class may be forwarded to the model as its description —
    # confirm before adding one.
    #
    # The candidate topics are joined with ", " so the description lists them as
    # separate items ("Technology, Sports, ...") instead of one fused word.
    Topic: str = Field(description="Choose at most one topic from this list: " + ', '.join(labels) + " that are related to the content")

# Chat model (temperature set to 0) wrapped so that it answers with a `Classification` object
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0).with_structured_output(Classification)

# Prompt piped into the model
tagging_chain = tagging_prompt | llm

# Classify every row, keeping the hand-assigned label and the model's label in step
sample = df
expected_result = []
results = []
for _, row in sample.iterrows():
    expected_result.append(row['Topic'])
    prediction = tagging_chain.invoke({"input": row['Text']})
    results.append(prediction.Topic)


View the results here.

In [None]:
# Show each text in full instead of a truncated preview
pd.set_option('display.max_colwidth', None)

# One row per statement: the label you assigned, the label the model chose, and the text
combined_data = [
    (expected, actual, text)
    for expected, actual, text in zip(expected_result, results, df['Text'])
]
df2 = pd.DataFrame(combined_data, columns=['expected', 'actual', 'text'])
display(df2)

Review the confusion matrix and identify which kinds of texts are not being classified well.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report

# Hand-assigned labels and model predictions as arrays, in dataset order
y_true = np.asarray(expected_result)
y_pred = np.asarray(results)

print('expected value: ', y_true)
print('actual value: ', y_pred)

# Counts of (expected topic, predicted topic) pairs, restricted to and ordered by `labels`
cm = confusion_matrix(y_true, y_pred, labels=labels)

# Per-topic text summary; ratios with a zero denominator are reported as 0
report = classification_report(
    y_true,
    y_pred,
    labels=labels,
    target_names=labels,
    zero_division=0,
)
print(report)

# Plot the matrix with the topic names on both axes
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
disp.plot()
plt.show()

# Code below this cell has more detail on TP, FP, TN, FN

In [None]:
from sklearn.metrics import multilabel_confusion_matrix

def calculate_score(TN, FP, FN, TP):
    """Compute accuracy, precision and recall from one binary confusion matrix.

    Args:
        TN: Number of true negatives.
        FP: Number of false positives.
        FN: Number of false negatives.
        TP: Number of true positives.

    Returns:
        A tuple ``(ACC, PPV, TPR)``:

        * ``ACC`` - overall accuracy, ``(TP+TN)/(TP+FP+FN+TN)``
        * ``PPV`` - precision (positive predictive value), ``TP/(TP+FP)``
        * ``TPR`` - recall (sensitivity, hit rate, true positive rate), ``TP/(TP+FN)``

        Any ratio whose denominator is zero (for example precision for a topic
        that was never predicted) is returned as ``0.0`` instead of raising
        ``ZeroDivisionError`` or producing ``nan``.
    """
    def _ratio(numerator, denominator):
        # Guarded division: an undefined ratio is reported as 0.0.
        return numerator / denominator if denominator else 0.0

    # Other rates that can be derived from the same four counts, for reference:
    #   specificity / true negative rate  TNR = TN/(TN+FP)
    #   negative predictive value         NPV = TN/(TN+FN)
    #   fall-out / false positive rate    FPR = FP/(FP+TN)
    #   false negative rate               FNR = FN/(TP+FN)
    #   false discovery rate              FDR = FP/(TP+FP)

    # Sensitivity, hit rate, recall, or true positive rate
    TPR = _ratio(TP, TP + FN)
    # Precision or positive predictive value
    PPV = _ratio(TP, TP + FP)
    # Overall accuracy
    ACC = _ratio(TP + TN, TP + FP + FN + TN)

    return ACC, PPV, TPR

# One "topic vs. everything else" matrix per entry of `labels`
mcm = multilabel_confusion_matrix(y_true, y_pred, labels=labels)

calculation_result = []
for topic, topic_cm in zip(labels, mcm):
    # Plot this topic's matrix
    topic_display = ConfusionMatrixDisplay(
        confusion_matrix=topic_cm,
        display_labels=['not ' + topic, topic],
    )
    topic_display.plot()

    # Flatten the matrix into its four counts and derive the scores
    tn, fp, fn, tp = topic_cm.ravel()
    acc, prec, rec = calculate_score(tn, fp, fn, tp)
    calculation_result.append([topic, tn, fp, fn, tp, acc, prec, rec])

# Summary table: one row per topic
summary_columns = ['Topic', 'TN', 'FP', 'FN', 'TP', 'Accuracy', 'Precision', 'Recall']
df2 = pd.DataFrame(calculation_result, columns=summary_columns)
pd.set_option('display.max_colwidth', None)
display(df2)

plt.show()

Explore reasons for poor performance by modifying your prompt or modifying the texts to see what changes.