In [None]:
import pandas as pd
import time
from datetime import datetime
import time
from openai import OpenAI
from tqdm import tqdm
from sklearn.metrics import classification_report, accuracy_score

In [None]:
# Names of the input text/date columns (as spelled in the CSV header —
# NOTE(review): confirm these against the actual file).
DESCRIPTION_COL, SYNOPSIS_COL, DATE_COL = (
    "targetbusinessdescription",
    "dealsynopsis",
    "dateannounced",
)

# Load the deal records and attach empty-string placeholders for the three
# result columns that the classification step fills in.
df = pd.read_csv("infoevents.csv").assign(
    category="",
    explanation="",
    confidence="",
)


In [None]:
# Few-shot examples injected into every prompt via the {example_block} placeholder.
# Each record is (target description, deal synopsis, category, hashtag rationale).
# Judging by the tags, 1 = combustion/ICE, 2 = EV, 3 = neither —
# NOTE(review): confirm the category meanings against the prompt templates.
_EXAMPLE_CASES = (
    ("Engine component supplier for combustion powertrain",
     "Mitsubishi increases stake for ICE part security",
     1, "#ice_supplier #combustion_parts #oem_support"),
    ("Diesel truck axle manufacturer",
     "Tata acquires full control for fleet integration",
     1, "#truck_axle #diesel #ice_parts"),
    ("Gasoline sedan assembly facility",
     "GM purchases legacy ICE manufacturing plant",
     1, "#gasoline_sedan #ice_manufacturing #assembly_plant"),
    ("Smart electronics supplier for ICE and hybrid systems",
     "Used for combustion-focused integration",
     1, "#hybrid_support #ice_systems #not_ev"),
    ("Engine control software company",
     "Acquired to optimize diesel fuel efficiency",
     1, "#engine_software #combustion #not_ev"),
    ("EV battery and powertrain integration firm",
     "Geely acquires to scale EV architecture",
     2, "#battery #ev_architecture #powertrain"),
    ("Charging infrastructure network",
     "OEM invests in national EV rollout",
     2, "#charging_network #ev_charging #infrastructure"),
    ("Electric bus and chassis manufacturer",
     "OEM acquisition to expand EV fleet",
     2, "#ev_chassis #electric_bus #fleet_mobility"),
    ("Lithium-ion cell recycling startup",
     "VW acquires to secure battery supply chain",
     2, "#battery_recycling #sustainability #ev_supply"),
    ("EV motor controller and inverter firm",
     "Acquired for electrification roadmap",
     2, "#motor_control #ev_components #inverter"),
    ("ICE engine and fuel system supplier",
     "Mistaken as EV due to electronics, but diesel-focused",
     1, "#combustion_fuel #diesel_system #not_ev"),
    ("Smart mobility software for ride coordination",
     "Platform not involved in vehicle production",
     3, "#mobility_software #non_vehicle #logistics"),
    ("CRM and website tech for dealerships",
     "Digital service platform without vehicle components",
     3, "#dealer_crm #digital_services #no_powertrain"),
    ("HVAC systems supplier for auto interiors",
     "Comfort tech used in both ICE and EVs",
     3, "#hvac #interior_parts #climate_control"),
    ("Lithium battery production for EVs",
     "Acquired by OEM to secure battery supply chain",
     2, "#lithium #battery #ev_supply"),
    ("Fleet management system with EV charging support",
     "Includes smart charging APIs, not just logistics",
     2, "#charging #smart_ev #ev_software"),
    ("AI-based vehicle route planner for delivery optimization",
     "Software used by logistics firms, with no link to EV systems or ICE hardware",
     3, "#route_planning #logistics_ai #no_powertrain"),
    ("Automotive ad-tech firm",
     "Enables car dealers to run targeted ads; not involved in manufacturing or electrification",
     3, "#dealer_adtech #non_technical #not_ice_not_ev"),
    ("Combustion system analytics platform",
     "Analytics software often mistaken for general mobility tool; focused on optimizing diesel engine efficiency",
     1, "#combustion_analytics #diesel_software #misleading_ev_like"),
)

# Render the records as numbered blocks separated by one blank line
# (no trailing newline after the last example).
example_block = "\n\n".join(
    f"Example {number}:\n- Desc: {desc}\n- Synop: {synop}\n→ Category: {category} {tags}"
    for number, (desc, synop, category, tags) in enumerate(_EXAMPLE_CASES, start=1)
)


In [None]:
from tqdm import tqdm
import math
import os
import re
import time
import openai
import tiktoken
from collections import Counter

# Labels a prompt may vote for, and the minimum confidence (0-100) a vote
# needs in order to count.
VALID_CATEGORIES = {"1", "2", "3"}
MIN_CONFIDENCE = 50

# Initialize tokenizer (used only to estimate the size of each prompt)
enc = tiktoken.encoding_for_model("gpt-4")

# Initialize LiteLLM proxy client.
# The key is read from the environment so it never has to be pasted into the
# notebook; with the variable unset this is the same empty string as before.
client = openai.OpenAI(
    api_key=os.environ.get("LITELLM_API_KEY", ""),
    base_url="https://litellmproxy.osu-ai.org/"
)

# Ensure all necessary columns are initialized
df["category"] = None
df["explanation"] = ""
df["confidence"] = 0
df["tokens_used"] = 0

# Add columns for voting
df["votes"] = ""
df["final_prompt"] = ""

# Define your three templates.
# NOTE(review): prompt_template_a/b/c are not defined in the cells visible
# here; they must be defined before this cell runs.
prompt_templates = [prompt_template_a, prompt_template_b, prompt_template_c]

# "Category: 2", "→ Category: 2 #tags" and "**Category:** 2" are all accepted:
# leading/separating punctuation is skipped and only the digit is captured.
_CATEGORY_LINE = re.compile(r"^\W*category\W*([123])\b", re.IGNORECASE)
_CONFIDENCE_LINE = re.compile(r"^\W*confidence\W*(\d+)", re.IGNORECASE)


def _cell_text(row, primary_col, legacy_col):
    """Return a row cell as text, mapping missing values to "".

    ``primary_col`` is the column name declared next to the CSV load;
    ``legacy_col`` is the name this loop used previously and is consulted
    only when the primary column does not exist in the frame.
    """
    value = row.get(primary_col, row.get(legacy_col, ""))
    # A NaN cell would otherwise be sent to the model as the text "nan".
    return "" if pd.isna(value) else str(value)


def _parse_reply(content):
    """Extract ``(category, confidence, explanation)`` from a model reply.

    ``category`` is "1"/"2"/"3" or None when no usable category line exists.
    ``confidence`` is an int, or None when the reply states none.
    ``explanation`` concatenates every line that starts with "explanation:"
    or contains "because". When a label occurs more than once, the last
    occurrence wins.
    """
    category, confidence, explanation = None, None, ""
    for raw_line in (content or "").splitlines():
        line = raw_line.strip()
        lowered = line.lower()
        category_match = _CATEGORY_LINE.match(line)
        confidence_match = _CONFIDENCE_LINE.match(line)
        if category_match:
            category = category_match.group(1)
        elif confidence_match:
            confidence = int(confidence_match.group(1))
        elif lowered.startswith("explanation:") or "because" in lowered:
            explanation += line + " "
    return category, confidence, explanation


def _confidence_from_logprobs(logprobs_data, category):
    """Derive a 0-100 confidence from the log-probability of the category token.

    Walks the per-token entries in ``logprobs_data.content`` and returns the
    probability (as a percentage) of the first token equal to ``category``
    that appears after the word "category" in the reply. Returns None when
    log-probabilities are unavailable or no such token is found.
    """
    entries = getattr(logprobs_data, "content", None) or []
    text_so_far = ""
    for entry in entries:
        token = entry.token
        if token.strip() == category and "category" in text_so_far.lower():
            # Log-probabilities are natural logs, so exp() recovers the probability.
            return min(int(round(math.exp(entry.logprob) * 100)), 100)
        text_so_far += token
    return None


for i, row in tqdm(df.iterrows(), total=len(df)):
    try:
        description = _cell_text(row, DESCRIPTION_COL, "description")
        synopsis = _cell_text(row, SYNOPSIS_COL, "synopsis")
        date = _cell_text(row, DATE_COL, "date_announced")

        # One entry per prompt template, kept index-aligned so that position k
        # in all three lists always refers to prompt_templates[k].
        votes = []
        confidences = []
        explanations = []
        tokens_total = 0

        for prompt_template in prompt_templates:
            user_prompt = prompt_template.format(
                example_block=example_block,
                description=description,
                synopsis=synopsis,
                date=date
            )

            # Token count (user prompt only; system message and reply are not counted)
            tokens_total += len(enc.encode(user_prompt))

            response = client.chat.completions.create(
                model="GPT-4o",
                messages=[
                    {"role": "system", "content": "You are an expert consultant in the automobile industry."},
                    {"role": "user", "content": user_prompt}
                ],
                temperature=0,
                logprobs=True,
                top_logprobs=3
            )

            choice = response.choices[0]
            pred_category, pred_confidence, pred_explanation = _parse_reply(choice.message.content)

            # Backup confidence from logprobs: used only when the reply did
            # not state a confidence itself.
            if pred_confidence is None:
                pred_confidence = 0
                if pred_category in VALID_CATEGORIES:
                    backup = _confidence_from_logprobs(choice.logprobs, pred_category)
                    if backup is not None:
                        pred_confidence = backup

            accepted = pred_category in VALID_CATEGORIES and pred_confidence >= MIN_CONFIDENCE
            votes.append(pred_category if accepted else "NA")
            confidences.append(pred_confidence)
            explanations.append(pred_explanation)

        # Decide final prediction by majority vote among the valid votes only;
        # "NA" votes never win. Ties go to the earliest prompt's category.
        valid_votes = [v for v in votes if v in VALID_CATEGORIES]
        if not valid_votes:
            df.at[i, "category"] = "NA"
            df.at[i, "confidence"] = 0
            df.at[i, "explanation"] = "All prompts returned invalid or low-confidence answers."
            df.at[i, "final_prompt"] = "none"
        else:
            most_common = Counter(valid_votes).most_common(1)[0][0]
            # First prompt that voted for the winner supplies confidence/explanation.
            final_idx = votes.index(most_common)
            df.at[i, "category"] = int(most_common)
            df.at[i, "confidence"] = confidences[final_idx]
            df.at[i, "explanation"] = explanations[final_idx]
            df.at[i, "final_prompt"] = f"Prompt {chr(65 + final_idx)}"

        df.at[i, "votes"] = ",".join(votes)
        df.at[i, "tokens_used"] = tokens_total

        # Pause between rows to stay gentle on the proxy.
        time.sleep(1.0)

    except Exception as e:
        # Best-effort run: record the failure on the row and continue.
        print(f"Error on row {i}: {e}")
        df.at[i, "category"] = "error"
        df.at[i, "explanation"] = str(e)
        df.at[i, "confidence"] = 0
        df.at[i, "tokens_used"] = 0
        df.at[i, "votes"] = "error"
        df.at[i, "final_prompt"] = "error"


In [None]:
# Write the annotated frame to a timestamped CSV so earlier runs are never overwritten.
timestamp = f"{datetime.now():%Y%m%d_%H%M%S}"
filename = "infoevents_with_Predictions_" + timestamp + ".csv"
df.to_csv(filename, index=False)
print("Saved to " + filename)

# Reload from disk so later cells work with exactly what was persisted.
# NOTE(review): with read_csv's default NA handling the literal "NA" written
# to the category column is parsed back as a missing value — confirm that is
# what downstream cells expect.
df = pd.read_csv(filename)