In [None]:
import os
import re
import time
from datetime import datetime

import pandas as pd
from openai import OpenAI
from sklearn.metrics import classification_report, accuracy_score
from tqdm import tqdm

In [None]:
# Names of the input columns holding the target's business description,
# the deal synopsis and the announcement date.
DESCRIPTION_COL, SYNOPSIS_COL, DATE_COL = (
    "targetbusinessdescription",
    "dealsynopsis",
    "dateannounced",
)

# Load the deals table and add the three (initially empty) result columns
# that the classification step fills in.
df = pd.read_csv("infoevents.csv").assign(
    category="",
    explanation="",
    confidence="",
)


In [None]:
# Few-shot examples embedded verbatim into every classification prompt.
# Each example gives a description, a synopsis and the expected category
# (1, 2 or 3) followed by hashtag-style keywords.
# The arrows were previously stored as mojibake ("→" decoded with the wrong
# codec); they are restored here so the model sees the intended text.
example_block = '''Example 1:
- Desc: Engine component supplier for combustion powertrain
- Synop: Mitsubishi increases stake for ICE part security
→ Category: 1 #ice_supplier #combustion_parts #oem_support

Example 2:
- Desc: Diesel truck axle manufacturer
- Synop: Tata acquires full control for fleet integration
→ Category: 1 #truck_axle #diesel #ice_parts

Example 3:
- Desc: Gasoline sedan assembly facility
- Synop: GM purchases legacy ICE manufacturing plant
→ Category: 1 #gasoline_sedan #ice_manufacturing #assembly_plant

Example 4:
- Desc: Smart electronics supplier for ICE and hybrid systems
- Synop: Used for combustion-focused integration
→ Category: 1 #hybrid_support #ice_systems #not_ev

Example 5:
- Desc: Engine control software company
- Synop: Acquired to optimize diesel fuel efficiency
→ Category: 1 #engine_software #combustion #not_ev

Example 6:
- Desc: EV battery and powertrain integration firm
- Synop: Geely acquires to scale EV architecture
→ Category: 2 #battery #ev_architecture #powertrain

Example 7:
- Desc: Charging infrastructure network
- Synop: OEM invests in national EV rollout
→ Category: 2 #charging_network #ev_charging #infrastructure

Example 8:
- Desc: Electric bus and chassis manufacturer
- Synop: OEM acquisition to expand EV fleet
→ Category: 2 #ev_chassis #electric_bus #fleet_mobility

Example 9:
- Desc: Lithium-ion cell recycling startup
- Synop: VW acquires to secure battery supply chain
→ Category: 2 #battery_recycling #sustainability #ev_supply

Example 10:
- Desc: EV motor controller and inverter firm
- Synop: Acquired for electrification roadmap
→ Category: 2 #motor_control #ev_components #inverter

Example 11:
- Desc: ICE engine and fuel system supplier
- Synop: Mistaken as EV due to electronics, but diesel-focused
→ Category: 1 #combustion_fuel #diesel_system #not_ev

Example 12:
- Desc: Smart mobility software for ride coordination
- Synop: Platform not involved in vehicle production
→ Category: 3 #mobility_software #non_vehicle #logistics

Example 13:
- Desc: CRM and website tech for dealerships
- Synop: Digital service platform without vehicle components
→ Category: 3 #dealer_crm #digital_services #no_powertrain

Example 14:
- Desc: HVAC systems supplier for auto interiors
- Synop: Comfort tech used in both ICE and EVs
→ Category: 3 #hvac #interior_parts #climate_control

Example 15:
- Desc: Lithium battery production for EVs
- Synop: Acquired by OEM to secure battery supply chain
→ Category: 2 #lithium #battery #ev_supply

Example 16:
- Desc: Fleet management system with EV charging support
- Synop: Includes smart charging APIs, not just logistics
→ Category: 2 #charging #smart_ev #ev_software

Example 17:
- Desc: AI-based vehicle route planner for delivery optimization
- Synop: Software used by logistics firms, with no link to EV systems or ICE hardware
→ Category: 3 #route_planning #logistics_ai #no_powertrain

Example 18:
- Desc: Automotive ad-tech firm
- Synop: Enables car dealers to run targeted ads; not involved in manufacturing or electrification
→ Category: 3 #dealer_adtech #non_technical #not_ice_not_ev

Example 19:
- Desc: Combustion system analytics platform
- Synop: Analytics software often mistaken for general mobility tool; focused on optimizing diesel engine efficiency
→ Category: 1 #combustion_analytics #diesel_software #misleading_ev_like'''


In [None]:
from tqdm import tqdm
import time
import openai
import tiktoken

# Classify every row of `df` with the LLM: build a few-shot prompt from the
# row's description / synopsis / date, send it through the proxy, parse the
# "Category:" and "Confidence:" answer lines and write the result back into
# the `category`, `explanation`, `confidence` and `tokens_used` columns.

# Setup tokenizer (only used to record the size of each user prompt).
# Requests are sent to GPT-4o, so prefer its encoding; tiktoken raises KeyError
# for model names it does not know, in which case fall back to GPT-4's.
try:
    enc = tiktoken.encoding_for_model("gpt-4o")
except KeyError:
    enc = tiktoken.encoding_for_model("gpt-4")

# Setup LiteLLM proxy.
# The key is read from the environment instead of being pasted into the
# notebook; an unset variable yields "" exactly as the previous placeholder did.
client = openai.OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY", ""),
    base_url="https://litellmproxy.osu-ai.org/"
)

# Track token usage
df["tokens_used"] = 0

# Config
max_rows_to_run = len(df)
na_counter = 0
early_stop_threshold = 30  # % of processed rows allowed to end up as "NA"
min_confidence = 30        # answers below this confidence are recorded as "NA"
request_pause_s = 1.2      # pause between requests, in seconds

# Resolve the input columns case-insensitively from the notebook's column
# constants. A mismatched key used with row.get(..., "") would silently put an
# empty field into every prompt, so fail before any request is made instead.
_columns_by_lower = {str(col).lower(): col for col in df.columns}
_missing_cols = [
    name for name in (DESCRIPTION_COL, SYNOPSIS_COL, DATE_COL)
    if name.lower() not in _columns_by_lower
]
if _missing_cols:
    raise KeyError(f"Input columns not found in df: {_missing_cols}")
description_col = _columns_by_lower[DESCRIPTION_COL.lower()]
synopsis_col = _columns_by_lower[SYNOPSIS_COL.lower()]
date_col = _columns_by_lower[DATE_COL.lower()]

# Matches an answer line such as "Category: 2" or "Confidence: [85]".
_LABEL_RE = re.compile(r"(category|confidence)\s*:\s*(.*)", re.IGNORECASE)


def _cell_text(value):
    """Return a cell value as text, mapping missing values (NaN/None) to ''."""
    return "" if pd.isna(value) else str(value)


def _build_prompt(description, synopsis, date):
    """Return the user prompt (instructions, few-shot examples, one entry)."""
    return f"""
You are an expert consultant in the automobile industry. Classify each OEM investment into one of these categories:

1. ICEV support – combustion, diesel, gasoline, engine, powertrain.
2. EV support – electric, battery, lithium, charging, electrification.
3. Other – software, AI, fintech, cloud, logistics, CRM, non-mobility.

Guidelines:
- If manufacturing: always Class 1 or 2.
- If software not controlling vehicles: Class 3.
- If "electric" is mentioned, make sure it's related to EVs, not misleading.
- CRM, logistics, fintech = Class 3.
- - If the description or synopsis clearly references EV-related terms like *electric*, *charging*, *battery*, etc., lean toward Class 2 — unless context clearly contradicts.

{example_block}

Now classify this entry using reasoning.

Description: '''{description}'''
Deal Synopsis: '''{synopsis}'''
Date Announced: {date}

First explain your reasoning in 30–60 words.
Then respond in this format:
Category: [1/2/3]
Confidence: [1–100]

Do not use bold formatting (like **Category:**). Just output plain text exactly as shown above.
"""


def _parse_response(content):
    """Split a model reply into (category, confidence, explanation).

    Returns:
        category: "1", "2" or "3", or None when the last "Category:" line has
            no valid value (or no such line exists).
        confidence: int taken from the last "Confidence:" line, 0 if that
            line has no number (or no such line exists).
        explanation: every other non-empty line, joined with single spaces.
    """
    category, confidence, explanation_lines = None, 0, []

    for raw_line in (content or "").splitlines():
        # Drop markdown emphasis so "**Category:** 2" parses like "Category: 2".
        line = raw_line.replace("*", "").strip()
        if not line:
            continue

        label_match = _LABEL_RE.match(line)
        if label_match is None:
            # The prompt asks for free-form reasoning first, so anything that
            # is not an answer line belongs to the explanation.
            explanation_lines.append(line)
            continue

        label, value = label_match.group(1).lower(), label_match.group(2)
        if label == "category":
            # Accept "2", "[2]" or "2 (EV support)"; reject values like "12".
            found = re.match(r"\[?\s*([123])\b", value)
            category = found.group(1) if found else None
        else:
            # Accept "85", "[85]", "85%" or "85/100".
            found = re.match(r"\[?\s*(\d+)", value)
            confidence = int(found.group(1)) if found else 0

    return category, confidence, " ".join(explanation_lines)


# `processed` counts rows by position so the row limit and the NA ratio do not
# depend on the DataFrame's index labels.
for processed, (i, row) in enumerate(tqdm(df.iterrows(), total=len(df)), start=1):
    if processed > max_rows_to_run:
        break

    try:
        user_prompt = _build_prompt(
            description=_cell_text(row.get(description_col, "")),
            synopsis=_cell_text(row.get(synopsis_col, "")),
            date=_cell_text(row.get(date_col, "")),
        )

        df.at[i, "tokens_used"] = len(enc.encode(user_prompt))

        response = client.chat.completions.create(
            model="GPT-4o",
            messages=[
                {"role": "system", "content": "You are an expert consultant in the automobile industry."},
                {"role": "user", "content": user_prompt}
            ],
            temperature=0
        )

        content = response.choices[0].message.content
        pred_category, pred_confidence, pred_explanation = _parse_response(content)

        # Final classification logic
        if pred_confidence < min_confidence or pred_category is None:
            df.at[i, "category"] = "NA"
            na_counter += 1

            # Show the raw reply for the first few failures to ease debugging.
            if na_counter <= 3:
                print(f"\n⚠️ ROW {i} RAW OUTPUT:\n{content}\n")
        else:
            df.at[i, "category"] = int(pred_category)

        df.at[i, "explanation"] = pred_explanation
        df.at[i, "confidence"] = pred_confidence

    except Exception as e:
        # Record the failure on the row and keep going with the next one.
        print(f"Error on row {i}: {e}")
        df.at[i, "category"] = "error"
        df.at[i, "explanation"] = str(e)
        df.at[i, "confidence"] = 0
        df.at[i, "tokens_used"] = 0

    # Early stop condition, evaluated every 10 processed rows
    if processed % 10 == 0:
        na_ratio = (na_counter / processed) * 100
        print(f"🔍 After {processed} rows → NA Ratio: {na_ratio:.2f}%")
        if na_ratio > early_stop_threshold:
            print(f"❌ Too many NAs ({na_ratio:.2f}%) — stopping early to save tokens.")
            break

    # Pause after every request, including failed ones, to pace calls.
    time.sleep(request_pause_s)


In [None]:
# Write the annotated frame to a timestamped CSV (index omitted) and report
# the file name.
timestamp = f"{datetime.now():%Y%m%d_%H%M%S}"
filename = "infoevents_with_Predictions_" + timestamp + ".csv"
df.to_csv(filename, index=False)
print(f"Saved to {filename}")

# Reload the file that was just written.
# NOTE(review): with pandas' default NA parsing, literal "NA" categories and
# empty strings come back as NaN after this round trip — confirm that is the
# intended input for the cells that follow.
df = pd.read_csv(filename)