In [1]:
# Execute the shared utilities notebook in this kernel's namespace.
# NOTE(review): `pd`, `sheets_to_df`, `df_to_sheets` and `SHEET_URL` are used further
# down but never imported in this notebook — presumably they are defined here; confirm.
%run 00_utils.ipynb

In [2]:
import litellm
from litellm import completion
import instructor
from tenacity import retry, stop_after_attempt, wait_exponential
from langsmith import traceable
from pydantic import BaseModel, Field
from instructor.utils import disable_pydantic_error_url
from typing import Literal
from textwrap import dedent
from tqdm.notebook import tqdm

In [3]:
# Uncomment to switch on litellm's debug output while troubleshooting.
#litellm._turn_on_debug()
# NOTE(review): presumably tells litellm to drop request params that the target
# provider does not support instead of raising — confirm against the litellm docs.
litellm.drop_params = True
# One-off setup call; going by its name it removes the docs URL from pydantic
# validation error messages — TODO confirm.
disable_pydantic_error_url()

# Initialize the instructor client by wrapping litellm's `completion` function.
# get_llm_response uses it for calls that pass a `response_model`.
client = instructor.from_litellm(completion)

In [4]:
def sanitise_inputs(inputs: dict) -> dict:
    """
    Strip the ``response_model`` entry from traced inputs before they are logged.

    Registered as the ``process_inputs`` hook of the ``LLMRun`` trace.

    Args:
        inputs: Mapping of argument names to values captured for the traced call.

    Returns:
        A new dict with every entry of ``inputs`` except ``response_model``.
        The dict passed in is not modified, and a missing ``response_model``
        key is tolerated instead of raising ``KeyError``.
    """
    return {key: value for key, value in inputs.items() if key != 'response_model'}

@traceable(name='LLMRun', run_type='llm', process_inputs=sanitise_inputs)
@retry(stop=stop_after_attempt(2), wait=wait_exponential(multiplier=10, max=60))
def get_llm_response(
        messages: list[dict[str, str]],
        ls_provider="openai",
        ls_model_name="gpt-4.1",
        temperature=0,
        seed=None,
        response_model=None,
        max_retries=2,
):
    """
    Send a chat completion request through litellm, optionally with instructor
    structured output, traced with langsmith as an ``LLMRun``.

    The whole call is retried by tenacity: at most two attempts with an
    exponential wait between them. No exception filter is configured on the
    decorator, so the retry is not limited to rate-limit errors.

    Args:
        messages: Chat messages as ``{"role": ..., "content": ...}`` dicts.
        ls_provider: Provider prefix of the litellm model string.
        ls_model_name: Model name; combined as ``"{ls_provider}/{ls_model_name}"``.
        temperature: Sampling temperature, always sent.
        seed: Sent as ``seed`` only when it is not ``None``.
        response_model: When given, the request goes through the instructor
            client with this class as the structured-output schema.
        max_retries: Forwarded to instructor as ``max_retries``; only used
            together with ``response_model``.

    Returns:
        The result of ``client.chat.completions.create`` when ``response_model``
        is given, otherwise the result of litellm's ``completion``.
    """
    params = {
        "messages": messages,
        "model": f"{ls_provider}/{ls_model_name}",
        "temperature": temperature,
    }

    if seed is not None:
        params["seed"] = seed

    if response_model is None:
        # `completion` is the litellm completion function itself (the same object
        # that is handed to instructor.from_litellm), so it is called directly;
        # it has no `.create` attribute.
        return completion(**params)

    # Instructor structured outputs
    params["response_model"] = response_model

    # Set number of retries in case the output does not match the response_model
    params["max_retries"] = max_retries
    return client.chat.completions.create(**params)

In [6]:
# System prompt for the news-category classifier. The text starts and ends with a
# newline, exactly as the triple-quoted form produced it.
classifier_prompt = "\n".join([
    "",
    "You will be provided with a short summary of a news article.",
    "Your task is to classify the news article into one of the following categories: <Finance>, <Music>, <Lifestyle>, <Sports>, <Other>.",
    "Use <Other> if the category cannot be confidently determined as either <Finance>, <Music>, <Lifestyle>, <Sports>.",
    "",
])

# Structured-output schema passed to get_llm_response as `response_model`.
# NOTE(review): the class docstring and the field description below are presumably
# forwarded to the model as part of the schema (pydantic/instructor) — treat any
# wording change as a prompt change; confirm before editing.
class Classifier(BaseModel):
    """
    Classify the news article into one of the following categories: Finance, Music, Lifestyle, Sports, Other
    Use Other if the category cannot be confidently determined as either Finance, Music, Lifestyle, Sports.
    """
    # Required field (no default). The Literal restricts the value to the five
    # labels; callers read it as `res.Category`, so the capitalised name is part
    # of the interface.
    Category: Literal["Finance", "Music", "Lifestyle", "Sports", "Other"] = Field(
        ...,
        description="The predicted category of the news article"
    )

In [14]:
# Create a tool to help classify the news articles
@traceable(name='Classifier', run_type='tool')
def run_classification(summary: str) -> Classifier:
    """
    Classify one article summary with the LLM and return the structured result.

    Sends ``classifier_prompt`` as the system message and ``summary`` as the user
    message to ``get_llm_response`` with ``seed=42`` and ``Classifier`` as the
    response model. The call is traced with langsmith as a ``Classifier`` tool run.

    Args:
        summary: Short summary text of the news article.

    Returns:
        Whatever ``get_llm_response`` returns for ``response_model=Classifier``;
        callers read the predicted label from its ``Category`` attribute.
    """
    model_params = {
        "ls_provider": "openai",
        "ls_model_name": "gpt-4.1",
    }
    messages = [
        {"role": "system", "content": classifier_prompt},
        {"role": "user", "content": summary},
    ]
    return get_llm_response(
        messages=messages,
        **model_params,
        seed=42,
        response_model=Classifier,
        # Attach the provider and model name as metadata on the traced LLM run.
        langsmith_extra={'metadata': dict(model_params)},
    )

In [7]:
# Load the "data_feed" sheet and parse `published` as timezone-aware UTC
# timestamps; values that cannot be parsed are coerced instead of raising.
df = sheets_to_df("data_feed", SHEET_URL).assign(
    published=lambda frame: pd.to_datetime(frame['published'], utc=True, errors='coerce')
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307 entries, 0 to 306
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype              
---  ------       --------------  -----              
 0   url          307 non-null    object             
 1   source       307 non-null    object             
 2   title        307 non-null    object             
 3   description  307 non-null    object             
 4   author       307 non-null    object             
 5   published    307 non-null    datetime64[ns, UTC]
 6   summary      307 non-null    object             
dtypes: datetime64[ns, UTC](1), object(6)
memory usage: 16.9+ KB


In [17]:
# Results frame: a copy of the feed with an empty `category` column to be
# filled in by the classification loop (the source frame `df` stays unchanged).
classify_df = df.assign(category=None)

In [20]:
# Run classification job over the entire dataset.
# Best-effort: a row that fails keeps `category` as None and the loop moves on.
for idx, summary in tqdm(classify_df['summary'].items(), total=len(classify_df)):
    try:
        res = run_classification(summary)
        classify_df.at[idx, 'category'] = res.Category
    except Exception as exc:
        # Catch Exception rather than using a bare `except:` so that
        # KeyboardInterrupt / SystemExit can still stop a long run, and report
        # the cause so failures can be diagnosed.
        print(f"Classification failed for row: {idx}. ({type(exc).__name__}: {exc})")

  0%|          | 0/307 [00:00<?, ?it/s]

In [21]:
# Distribution of predicted categories. dropna=False keeps rows whose
# classification failed (category still None) visible in the counts.
classify_df['category'].value_counts(dropna=False)

category
Other        106
Lifestyle     84
Sports        66
Finance       33
Music         18
Name: count, dtype: int64

In [22]:
# Export the classified frame (feed columns plus `category`) to the
# "data_classify" sheet; the helper's return value is displayed as the cell output.
df_to_sheets(classify_df, "data_classify", SHEET_URL)

"Data uploaded successfully to sheet: 'data_classify'"