# In [None]:
import os
import time

import openai
import pandas as pd

# Set your OpenAI API key via the OPENAI_API_KEY environment variable so the
# secret never has to live in the source file. The placeholder below is only
# used as a fallback when the variable is not set.
openai.api_key = os.environ.get(
    'OPENAI_API_KEY',
    'OPEN API KEY / DELETED OUR KEY FOR SECURITY CONCER',
)

# Define whether to run the full file or just a small leading sample of rows
run_full_file = False  # Set to True for full file, False for the first `sample_rows` rows
sample_rows = 20  # Number of leading rows classified during a trial run

# Load the Excel file
file_path = '/Users/excel_files/classified_startups_with_LLM_notranslation.xlsx'
df = pd.read_excel(file_path)

# If run_full_file is False, restrict to the first `sample_rows` rows
if not run_full_file:
    df = df.head(sample_rows)

# Display the first few rows to understand the structure
print(df.head())

def classify_startup(description, web_content):
    """Ask GPT-4o to assign a startup to one of nine top-level categories.

    The description and website content are embedded in a prompt that lists
    the full category/subcategory taxonomy and asks for the main category
    name only.

    Args:
        description: Text describing the startup.
        web_content: Text taken from the startup's website (may be empty).

    Returns:
        The model's reply with surrounding whitespace stripped. The reply is
        not validated against the category list. Returns "API Error" when a
        transient OpenAI error persists through every retry, and "Error" when
        any other exception occurs.
    """
    # Combine the description and web content
    combined_content = f"Description: {description}\nWebsite Content: {web_content}"

    # Define the prompt with full categories and subcategories
    prompt = f"""
    Based on the following description and website content, classify the startup into one of the following categories and subcategories. However, respond only with the category name:
    
    Clinical Decision Support:
    - Patient Safety
        - Drug Control
            - Medication administration errors
            - Drug-drug Interactions (DDI)
        - Hygiene Compliance
        - Sepsis Prediction
    - Clinical Management
        - Clinical Trial Technology Personalized
            - Finding eligible patients
        - Clinician guideline adherence
        - Patient Management
            - Research/Treatment protocols
        - Healthcare Logistics
            - Tracking and Placing Orders
        - Follow-up referrals
        - Preventative Care
    - Cost Containment
        - Decreasing patient length-of-stay
        - Clinical Interventions
        - Payment Operations
        - CPOE-integrated systems for suggesting cheaper medication alternatives
        - Reducing test duplication
    - Administrative Functions
        - Clinical Workflow Tools
        - Documentation Templates
        - Patient Triage
    - Diagnostics Support
        - Imaging
        - Laboratory and Pathology
        - Virtual Assistant
    - Health Data Interoperability and Integration
        - Secure Data Exchange
        - Health Data Platforms

    Imaging and Diagnostics:
    - Diagnostic Devices
        - ML Enhanced Hardware, e.g. stethoscope
        - ECG
        - EEG
    - Diagnostic Imaging Software
        - Ultrasound
        - Magnetic Resonance Imaging (MRI)
        - Computed Tomography (CT)
    - Biological Diagnostics
        - Rapid Diagnostics
        - Laboratory Diagnostics
            - PCR
            - Bacterial cultures
    - Digital Biomarkers
        - Speech-based
        - Passive Sensing
        - Active Participation based Digital Biomarkers

    Drug Discovery:
    - Biotechnology Platform
    - Bioinformatics
    - Market Analysis (Application based)

    Mental Health:
    - Digital Health Apps
        - Mood Tracking
        - Psychoeducation
        - Mindfulness
        - Digital Diaries
        - Peer Support
        - Social Networking
        - Patient management

    Surgery:
    - Surgical Navigation
    - Acute Decision Support
    - Workflow Optimization

    Medical Treatment and Personalized Medicine:
    - Genomics
        - Genomics
        - Proteomics
        - Epigenomics
        - Multi-omics
    - Biomedical Devices
        - Implantable Devices
        - Non-invasive stimulation therapeutic devices
    - Regenerative Medicine
        - Tissue Engineering
    - Reproductive Medicine
        - In vitro fertilization technologies

    Public Health and Epidemiology:
    - Disease Forecasting
    - Risk Prediction
    - Health Diagnosis
    - Spatial Modeling
    - Public Health Surveillance
    - Epidemic Modeling
    - Infection Prevention and Control

    Medical Monitoring:
    - Telemedicine
        - General Telemedicine Services
            - Primary Care and General Health Monitoring
            - Remote Monitoring for General Health
        - Specialized Treatment in Telemedicine
            - Medical Specialties
                - Tele-diabetes Care
                - Telecardiology
                - Telepulmonary
                - Telepsychiatry
                - Teledermatology
                - Teleneurology
                - Tele-oncology
                - Telegeriatrics
                - Tele-palliative Care
                - Teleaudiology
                - Teledentistry
                - Telepharmacy
                - Teleophthalmology
                - Telegastrointestinal
                - Telerheumatology
                - Teleobstetrics
                - Telenephrology
                - Telesurgery
                - Tele-ICU
                - Tele-emergency
        - Telehealth Platforms and Tools
            - Tele-education
            - Broad Telehealth Platforms
        - Telerehabilitation
            - Cardiac Telerehabilitation
            - Neurological Telerehabilitation
            - Physiotherapy Telerehabilitation
    - Digital Biomarkers Monitoring
        - Biomarkers extracted through passive sensing from mobile phones or smartwatches
    - Monitoring Devices

    Rehabilitation and Assistive Technologies:
    - Survival
    - Communication
    - Low Vision Aid
    - Environmental Interaction
    - Mobility
    - Physical Education
    - Positioning
    
    {combined_content}

    Respond with only the main category name; don't give the subcategory name, only the big title one of those Nine categories should be assigned:
    - Clinical Decision Support
    - Imaging and Diagnostics
    - Drug Discovery
    - Mental Health
    - Surgery
    - Medical Treatment and Personalized Medicine
    - Public Health and Epidemiology
    - Medical Monitoring
    - Rehabilitation and Assistive Technologies
    """

    # Errors worth retrying: in the legacy (0.x) SDK, rate-limit, connection,
    # timeout and service-unavailable errors are siblings of APIError rather
    # than subclasses, so catching APIError alone never retries them.
    transient_errors = (
        openai.error.APIError,
        openai.error.APIConnectionError,
        openai.error.RateLimitError,
        openai.error.ServiceUnavailableError,
        openai.error.Timeout,
    )

    retries = 3
    for attempt in range(retries):
        try:
            # Call GPT-4o API
            response = openai.ChatCompletion.create(
                model="gpt-4o",
                messages=[
                    {"role": "system", "content": "You are a helpful assistant that categorizes startups based on provided descriptions."},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=10,  # Keep the response short
                temperature=0  # Favor the most likely label so reruns are reproducible
            )

            # Extract the category from the response
            return response.choices[0]['message']['content'].strip()

        except transient_errors:
            if attempt < retries - 1:
                print(f"API error, retrying... ({attempt+1}/{retries})")
                time.sleep(2 ** attempt)  # Exponential backoff: 1s, 2s, ...
            else:
                print(f"Failed after {retries} attempts.")
                return "API Error"
        except Exception as e:
            # Batch boundary: report and return a sentinel so one bad row
            # does not abort the classification of the remaining rows.
            print(f"An error occurred: {e}")
            return "Error"

def _cell_to_text(value):
    """Return a cell value as text, mapping missing values (NaN/None) to ''."""
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return ''
    return str(value)


def apply_classification(row):
    """Classify a single DataFrame row and return the category string.

    Args:
        row: A row (pandas Series) with a 'Full Description' entry and,
            optionally, a 'Website Content' entry.

    Returns:
        Whatever ``classify_startup`` returns for the row's description and
        website content.
    """
    # Normalize both cells to text: an empty cell would otherwise reach the
    # prompt as the literal "nan", and a non-string website cell would break
    # the `.lower()` check below.
    description = _cell_to_text(row['Full Description'])
    web_content = _cell_to_text(row.get('Website Content', ''))

    # If web content indicates inactivity, rely only on the description
    if "not active" in web_content.lower():
        web_content = ''

    return classify_startup(description, web_content)

# Destination workbook for the annotated table (relative to the working directory).
# NOTE(review): this file name is identical to the input workbook's base name;
# if the script runs from the input's directory the source file is overwritten
# (with only the sampled rows when run_full_file is False) - confirm intended.
output_file_path = 'classified_startups_with_LLM_notranslation.xlsx'

# Classify the DataFrame row by row and keep the labels in a new column
predicted_categories = df.apply(apply_classification, axis=1)
df['Category_byGPT'] = predicted_categories

# Persist the result without the index column
df.to_excel(output_file_path, index=False)

print("Classification completed and saved to 'classified_startups_with_LLM_notranslation.xlsx'")
