# Document Classification

1. Look at NYT Data
2. Create prompts 
3. Run tests

## Load Data

In [15]:
import pandas as pd

def load_data(file_path):
    """
    Load data from a text file and return it as a list of non-empty lines.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.readlines()
    content = [line.strip() for line in content if line.strip()]
    return content

def load_nyt_data(dataset_path, labels_path, classes_path, output_path=None):
    """
    Load and clean NYT data, including articles, labels, and label names.
    
    Args:
        dataset_path (str): Path to the dataset file containing articles.
        labels_path (str): Path to the labels file.
        classes_path (str): Path to the classes file containing label names.
        output_path (str, optional): Path to save the cleaned DataFrame as a CSV file.

    Returns:
        pd.DataFrame: Cleaned DataFrame with articles, integer labels, and label names.
    """
    # Load data into a DataFrame
    nyt_data = pd.DataFrame().assign(
        articles=load_data(dataset_path),
        labels=load_data(labels_path),
    )
    nyt_data['labels'] = nyt_data['labels'].astype(int)

    # Load label names and map them
    classes = load_data(classes_path)
    nyt_data = nyt_data.assign(
        labels_named=nyt_data['labels'].apply(lambda x: classes[x])
    )
    
    # Save to CSV if output path is provided
    if output_path is not None:
        nyt_data.to_csv(output_path, index=False)
        
    return nyt_data

In [16]:
nyt_data = load_nyt_data(
    dataset_path='data/raw/NYT-Small/dataset.txt', 
    labels_path='data/raw/NYT-Small/labels.txt', 
    classes_path='data/raw/NYT-Small/classes.txt',
    output_path='data/clean/nyt_data.csv'
)

nyt_data

Unnamed: 0,articles,labels,labels_named
0,(Reuters) - Carlos Tevez sealed his move to Ju...,4,sport
1,"NASA, in preparation for a spacewalk on Saturd...",3,science
2,If professional pride and strong defiance can ...,4,sport
3,"Admittedly, the language is reconstructed and ...",1,art
4,"PALERMO, Sicily — Roberta Vinci beat top-seede...",4,sport
...,...,...,...
13076,"MELBOURNE, Australia — After the tears and the...",4,sport
13077,"PERTH, Australia — Wimbledon semifinalist Jerz...",4,sport
13078,While the Nets are hoisting No. 5 to the Barcl...,4,sport
13079,SYDNEY — Tennis Australia chief executive Stev...,4,sport


## Build Classifier

In [17]:
classes = load_data('data/NYT-Small/classes.txt')
classes

FileNotFoundError: [Errno 2] No such file or directory: 'data/NYT-Small/classes.txt'

In [20]:
from dotenv import load_dotenv
import os 
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

load_dotenv()
# api_key = os.environ.get("SAMBANOVA_API_KEY")
api_key = os.environ.get("OPENAI_API_KEY")

# llm = ChatOpenAI(
#     base_url="https://api.sambanova.ai/v1/",  
#     api_key=api_key,
#     streaming=True,
#     model="Meta-Llama-3.1-70B-Instruct",
# )

llm = ChatOpenAI(
    base_url="https://api.openai.com/v1/",  
    api_key=api_key,
    streaming=True,
    model="gpt-4o-mini",
)

prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are an expert in document classification. "
            "Your task is to classify an article into one of the following categories. \n"
            "0 - politics \n"
            "1 - art \n"
            "2 - business \n"
            "3 - science \n"
            "4 - sport \n"
        ),
        ("human", "Classify the article. Respond with 0 for politics, 1 for art, 2 for business, 3 for science, 4 for sport. RESPOND WITH AN INTEGER ONLY. \n\n{article}"),
    ]
)

classifier = prompt | llm

def classify_article(article):
    result = classifier.invoke({'article': article})
    try:
        return int(result.content)
    except ValueError: 
        print(f'Cannot represent {result.content} as an integer!')
        return None


## Run Tests

In [21]:
import pandas as pd

data = nyt_data
run_all = False
class_wise_sample_size = 20
truncate_length = 200

if not run_all:   
    # Sample the first 100 from each class in the 'labels' column
    data = data.groupby('labels').head(class_wise_sample_size)
    
# Truncate articles before classification
articles = data['articles'].str[0:truncate_length]

pred = data['articles'].apply(classify_article)

In [22]:
from sklearn.metrics import precision_recall_fscore_support

# Assuming 'pred' and 'labels' are your input series
precision, recall, f1, _ = precision_recall_fscore_support(data['labels'], pred, average='weighted')

print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')

Precision: 0.96
Recall: 0.96
F1 Score: 0.96


In [17]:
sampled_data.iloc[:10]

Unnamed: 0,articles,labels,labels_named
0,(Reuters) - Carlos Tevez sealed his move to Ju...,4,sport
1,"NASA, in preparation for a spacewalk on Saturd...",3,science
2,If professional pride and strong defiance can ...,4,sport
3,"Admittedly, the language is reconstructed and ...",1,art
4,"PALERMO, Sicily — Roberta Vinci beat top-seede...",4,sport
5,Spain's big two soccer teams face a pair of It...,4,sport
6,The Argentine soccer club San Lorenzo complete...,4,sport
7,Bryce Petty had to wait behind a Heisman Troph...,4,sport
8,ROME — The work was by Verdi. Three fine young...,1,art
9,PARIS — Wild card Gael Monfils confirmed there...,4,sport
