In [1]:
## IMPORTS
import openai
import pandas as pd
import numpy as np
import json
from prompts import PRODUCT_PROMPT, ISSUE_TYPE_PROMPT, SERVICES_PROMPT, RELATIONSHIP_PROMPT, RESOLUTION_COMPLETION_PROMPT
import os 
from dotenv import load_dotenv
from tqdm import tqdm

# Pull variables defined in a .env file into the process environment.
load_dotenv()

# Read the OpenAI API key from the environment (None when the variable is unset).
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Client object used by the extraction helpers defined in this notebook.
client = openai.OpenAI(api_key=OPENAI_API_KEY)

In [2]:
# Base name (without extension) of the workbook to process; the load and
# save cells interpolate it into their file paths.
data_name = "twcs_structured_UniqueCount-4000_time-20250420-1907_extracted"

In [3]:
# Build the input path from its components instead of a backslash literal:
# the old literal contained the invalid escape sequences "\p" and "\e"
# (a SyntaxWarning on current Python) and only resolved on Windows.
# On Windows os.path.join yields exactly the same path string as before.
data = pd.read_excel(
    os.path.join('..', '..', 'data', 'processed', 'extraction_output', f'{data_name}.xlsx')
)

In [4]:
## Product Extraction
def extract_product(text):
    """Run the product-extraction prompt on ``text``.

    Sends PRODUCT_PROMPT as the system message and ``text`` as the user
    message to the "gpt-4o" chat model.

    Returns:
        The message content of the first choice in the completion.
    """
    chat_messages = [
        {"role": "system", "content": PRODUCT_PROMPT},
        {"role": "user", "content": text},
    ]
    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=chat_messages,
        temperature=0,
        top_p=0.95,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

## Issue Type Extraction
def extract_issue_type(text):
    """Run the issue-type prompt on ``text``.

    Sends ISSUE_TYPE_PROMPT as the system message and ``text`` as the user
    message to the "gpt-4o-mini" chat model.

    Returns:
        The message content of the first choice in the completion.
    """
    chat_messages = [
        {"role": "system", "content": ISSUE_TYPE_PROMPT},
        {"role": "user", "content": text},
    ]
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=chat_messages,
        temperature=0,
        top_p=0.95,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

## Services Extraction
def extract_services(text):
    """Run the services prompt on ``text``.

    Sends SERVICES_PROMPT as the system message and ``text`` as the user
    message to the "gpt-4o-mini" chat model.

    Returns:
        The message content of the first choice in the completion.
    """
    chat_messages = [
        {"role": "system", "content": SERVICES_PROMPT},
        {"role": "user", "content": text},
    ]
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=chat_messages,
        temperature=0,
        top_p=0.95,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

## Relationship Extraction
def extract_relationship(conversation_text, entities):
    """Run the relationship prompt on a conversation and its extracted entities.

    Both arguments are interpolated into a single user message; the system
    message is RELATIONSHIP_PROMPT and the model is "gpt-4o-mini".

    Returns:
        The message content of the first choice in the completion.
    """
    # The interior indentation of this literal is part of the text sent to the model.
    user_message = f"""
                Here is the conversation text: \n'{conversation_text}'.
                Extracted entities: \n{entities}.
                Identify relationships between these elements and provide RDF triples.
                """
    chat_messages = [
        {"role": "system", "content": RELATIONSHIP_PROMPT},
        {"role": "user", "content": user_message},
    ]
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=chat_messages,
        temperature=0,
        top_p=0.95,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

def resolution_completion(conversation_text, relationship_text):
    """Run the resolution-completion prompt on a conversation and its triples.

    Both arguments are interpolated into a single user message; the system
    message is RESOLUTION_COMPLETION_PROMPT and the model is "gpt-4o-mini".

    Returns:
        The message content of the first choice in the completion.
    """
    # The interior indentation of this literal is part of the text sent to the model.
    user_message = f"""
                Here is the conversation text: \n'{conversation_text}'.
                Triples:: \n{relationship_text}.
                """
    chat_messages = [
        {"role": "system", "content": RESOLUTION_COMPLETION_PROMPT},
        {"role": "user", "content": user_message},
    ]
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=chat_messages,
        temperature=0,
        top_p=0.95,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [5]:
def safe_json_load(value):
    """Best-effort conversion of a cell value into a dict.

    Args:
        value: A dict, a JSON string, or anything else (NaN, None, numbers,
            lists, ...).

    Returns:
        ``value`` itself when it is already a dict; the decoded object when
        ``value`` is a string holding a JSON object; ``{}`` in every other
        case (missing values, blank or malformed strings, JSON that decodes
        to something other than an object, unsupported types).
    """
    if isinstance(value, dict):
        return value

    # Type checks come first on purpose: calling pd.isna() on a list/array
    # returns an array, and using that in an `if` raises "truth value is
    # ambiguous". NaN/None/NA are neither dict nor str, so they fall through
    # to the final `{}` without needing an explicit missing-value test.
    if isinstance(value, str) and value.strip():
        try:
            parsed = json.loads(value)
        except json.JSONDecodeError:
            return {}
        # Valid JSON is not necessarily an object ("[]", "null", "3");
        # callers use .get() on the result, so only hand back dicts.
        return parsed if isinstance(parsed, dict) else {}

    return {}

def _json_safe(value):
    """Recursively replace non-finite floats (NaN, +/-Infinity) with None."""
    if isinstance(value, dict):
        return {key: _json_safe(item) for key, item in value.items()}
    if isinstance(value, (list, tuple)):
        return [_json_safe(item) for item in value]
    if isinstance(value, float) and not np.isfinite(value):
        return None
    return value


def _entity_values(payload, key):
    """Return ``payload[key]`` if payload is a dict and the value is truthy, else ``[]``."""
    if not isinstance(payload, dict):
        return []
    return payload.get(key) or []


def process_dataframe(df):
    """Build a one-column frame of JSON-encoded entities, one row per row of ``df``.

    For every row, the "Product", "Services" and "Issue Type" cells are
    decoded with ``safe_json_load`` and their "product", "service" and
    "issue_type" entries are collected under the keys "products",
    "services" and "issue_types".

    Args:
        df: DataFrame to read; the three source columns are optional
            (a missing column is treated as an empty cell).

    Returns:
        A DataFrame with a single "entities" column of JSON strings, indexed
        like ``df`` so it lines up with ``df`` when joined column-wise.
    """
    records = []

    for _, row in df.iterrows():
        entities = {
            "products": _entity_values(safe_json_load(row.get("Product", "")), "product"),
            "services": _entity_values(safe_json_load(row.get("Services", "")), "service"),
            "issue_types": _entity_values(safe_json_load(row.get("Issue Type", "")), "issue_type"),
        }

        # Actually replace NaN/Infinity with None. json.dumps(allow_nan=False)
        # on its own does not replace anything: it raises ValueError when it
        # meets such a float. allow_nan=False is kept as a strictness check.
        records.append(json.dumps(_json_safe(entities), allow_nan=False))

    # Reuse the caller's index so a later column-wise concat/assign aligns
    # row-for-row even when ``df`` does not carry a default RangeIndex.
    return pd.DataFrame({"entities": records}, index=df.index)


In [6]:
# Register tqdm's progress_apply on pandas objects
tqdm.pandas(desc="Processing structured conversations")

# Build the JSON `entities` column with process_dataframe. The previous
# `processed_df = data` made the concat glue the frame to itself, so every
# column label was duplicated and row['cleaned_conversations'] yielded a
# two-element Series instead of the conversation text.
if "entities" in data.columns:
    # Already present (e.g. this cell was re-run): reuse it rather than
    # adding a second, duplicate `entities` column.
    processed_df = data[["entities"]]
else:
    processed_df = process_dataframe(data)
    # Attach by position: process_dataframe emits one row per input row, in order.
    data = data.assign(entities=processed_df["entities"].to_numpy())

# Extract relationships with progress tracking (one API call per row)
data['relationship'] = data.progress_apply(
    lambda row: extract_relationship(row['cleaned_conversations'], row['entities']), axis=1
)

# Extract resolution completion with progress tracking (one API call per row)
data['resolution'] = data.progress_apply(
    lambda row: resolution_completion(row['cleaned_conversations'], row['relationship']), axis=1
)

Processing structured conversations: 100%|██████████| 5041/5041 [1:09:43<00:00,  1.21it/s]
Processing structured conversations: 100%|██████████| 5041/5041 [7:20:26<00:00,  5.24s/it]  


In [8]:
# Build the output path from its components instead of a backslash literal:
# the old literal contained the invalid escape sequences "\p" and "\e"
# (a SyntaxWarning on current Python) and only resolved on Windows.
# The file name itself is unchanged: "<data_name>_extracted.xlsx".
data.to_excel(
    os.path.join('..', '..', 'data', 'processed', 'extraction_output', f'{data_name}_extracted.xlsx'),
    index=False,
)