In [None]:
import os
import openai
import pandas as pd
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# The OpenAI key is taken from the OPENAI_API_KEY environment variable rather
# than being written into the notebook; the lookup yields None when it is unset.
openai.api_key = os.environ.get("OPENAI_API_KEY")

In [None]:
# Load the labeled emails. Later cells read the 'text' column as model input
# and the 'label' column as ground truth.
df = pd.read_csv('labeled.csv')

# Fail fast with a clear message if a column used downstream is absent.
missing_cols = {'text', 'label'} - set(df.columns)
assert not missing_cols, f"labeled.csv is missing required columns: {sorted(missing_cols)}"

# Preview only the first rows instead of rendering the whole frame.
df.head()

In [None]:
# Instruction text sent as the system message with every classification request.
# A plain string literal is enough: the text has no placeholders to interpolate.
SYSTEM_PROMPT = """Categorize the email below into exactly one of the following categories:

    Internship — only if it contains an update about the status of my internship applications (e.g., interview invitations, rejections, offers). Application confirmation or acknowledgment emails such as "Thank you for applying", "Thank you for your interest", "Thank you for your application", or "Your application is under review" is considered Irrelevant.
    
    Canvas — only if it announces a grade release for an assignment, quiz, exam, or important information regarding the courses.
    
    Personal — only if the message is personally written to me and is NOT spam, automated, or promotional.
    
    Irrelevant — anything else.
    
    Respond with ONLY the category name."""

def predict_label(text: str, model: str) -> str:
    """Classify one email with a chat-completion model and return its answer.

    Args:
        text: Email content. It is coerced with ``str()`` so pandas or numpy
            scalars are accepted as well as plain strings.
        model: Name of the chat model to query.

    Returns:
        The model's reply with surrounding whitespace stripped, or an empty
        string when the reply carries no text content.
    """
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": str(text)},
    ]
    # temperature=0.0 is passed to reduce sampling randomness between runs.
    resp = openai.chat.completions.create(
        model=model,
        messages=messages,
        temperature=0.0,
    )
    content = resp.choices[0].message.content
    # The SDK types `content` as optional; calling .strip() on None would raise
    # AttributeError, so a missing reply is reported as an empty label instead.
    return content.strip() if content is not None else ""


In [None]:
# Smoke test: classify a single email before running the full batch.
# The model name is assigned in this cell so it runs on a fresh kernel
# (Restart & Run All); no earlier cell defines FINE_TUNED_MODEL.
# Keep this value identical to the one in the batch-prediction cell.
FINE_TUNED_MODEL = "FINE_TUNE_MODEL_NAME"

# .iloc[0] selects the first row by position, whatever the index labels are.
print(predict_label(df['text'].iloc[0], FINE_TUNED_MODEL))

In [None]:
# Batch predictions with the fine-tuned model
FINE_TUNED_MODEL = "FINE_TUNE_MODEL_NAME"

# One predict_label call per row of df['text'], in row order.
preds = [predict_label(email, FINE_TUNED_MODEL) for email in df['text']]
df['prediction'] = preds

In [None]:
# Give the fine-tuned predictions a model-specific column name.
df = df.rename({'prediction': 'prediction-fine-tune'}, axis='columns')

In [None]:
# Compute metrics for the fine-tuned model
acc = accuracy_score(df['label'], df['prediction-fine-tune'])
print(f"Overall Accuracy: {acc:.3f}\n")

# output_dict=True returns the report as a dict so it can be shown as a table;
# zero_division=0 is passed through to classification_report unchanged.
report = classification_report(df['label'], df['prediction-fine-tune'], output_dict=True, zero_division=0)
report_df = pd.DataFrame(report).transpose()
print("Classification Report:")
# Jupyter renders only the last expression of a cell, so the report has to be
# displayed explicitly; a bare `report_df` here would produce no output.
display(report_df)

# Display confusion matrix
# NOTE(review): "Irrelavent" presumably matches the spelling used in df['label']
# (the baseline cell remaps 'Irrelevant' to it) — confirm before correcting it.
labels = ["Internship", "Canvas", "Personal", "Irrelavent"]
cm = confusion_matrix(df['label'], df['prediction-fine-tune'], labels=labels)
cm_df = pd.DataFrame(cm, index=labels, columns=labels)
print("\nConfusion Matrix:")
cm_df

In [None]:
# Baseline: run the same prompt through the base model for comparison
BASE_MODEL = 'gpt-4o-mini'

# One predict_label call per row of df['text'], in row order.
preds = [predict_label(email, BASE_MODEL) for email in df['text']]
df['prediction-baseline'] = preds

In [None]:
# Rewrite baseline answers of exactly 'Irrelevant' to 'Irrelavent', the
# spelling used in the `labels` lists of the metrics cells.
df['prediction-baseline'] = df['prediction-baseline'].replace('Irrelevant', 'Irrelavent')

In [None]:
# Compute metrics for the baseline model
acc_base = accuracy_score(df['label'], df['prediction-baseline'])
# Report the baseline accuracy itself; `acc` holds the fine-tuned model's score.
print(f"Overall Accuracy: {acc_base:.3f}\n")

# Display confusion matrix
labels = ["Internship", "Canvas", "Personal", "Irrelavent"]
cm = confusion_matrix(df['label'], df['prediction-baseline'], labels=labels)
cm_df = pd.DataFrame(cm, index=labels, columns=labels)
print("\nConfusion Matrix for base model (4o-mini):")
cm_df

In [None]:
# Relative gain of the fine-tuned accuracy over the baseline accuracy, in percent.
print('Overall accuracy increase: {:.3f}%'.format((acc - acc_base) / acc_base * 100))

In [None]:
# Count misclassified rows directly from the predictions instead of copying
# numbers by hand from the confusion matrices, so the figure follows the data.
errors_base = int((df['label'] != df['prediction-baseline']).sum())
errors_fine_tuned = int((df['label'] != df['prediction-fine-tune']).sum())

if errors_base == 0:
    # A relative decrease is undefined when the baseline made no mistakes.
    print('Decrease in misclassified data: n/a (baseline has no misclassified rows)')
else:
    print(f'Decrease in misclassified data: {((errors_base - errors_fine_tuned) / errors_base * 100):.3f}%')

## Conclusions: 

Fine-tuning is effective, as it:
- Increased overall accuracy by 16.774% relative to the `gpt-4o-mini` baseline
- Decreased the number of misclassified emails by 86.667%

The results also confirmed my intuition that many emails were being falsely classified as Internship updates.