# Classifying AI Complexity

In [8]:
import pandas as pd
import os
import openai
import time
import asyncio
import swifter
from tqdm import tqdm
import concurrent.futures

In [9]:
from openai import OpenAI

# Build the shared client from the OPENAI_API_KEY environment variable.
# The lookup raises KeyError when the variable is not set. The original note
# says this is also the SDK default, so the argument could be omitted.
client = OpenAI(api_key=os.environ['OPENAI_API_KEY'])

### Test OpenAI API Key

In [None]:



# Replace this with your API key temporarily to manually check it
openai.api_key = "HERE OPENAI API / WE DELETED OURS FOR SECURITY"

try:
    # Test the API key by making a simple API request.
    # `openai.ChatCompletion` was removed in the v1 SDK (the version that
    # provides `from openai import OpenAI`); the module-level client below
    # still honours `openai.api_key`.
    response = openai.chat.completions.create(
        model="gpt-4o",  # Or "gpt-4" depending on your access
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Say hello!"}
        ]
    )
    # v1 responses are objects, not dicts, so use attribute access.
    print("API key is valid. Response:", response.choices[0].message.content.strip())

# Referenced through the module because `OpenAIError` is not imported by name
# at this point in the notebook; a bare name here would raise NameError.
except openai.OpenAIError as e:
    print(f"OpenAI API error: {e}")
except Exception as e:
    print(f"An unexpected error occurred: {e}")


### Load Data

In [None]:
# Source workbook and the Parquet file it gets converted to, kept as
# variables so either location can be changed in one place.
input_excel_file = 'classification_excel_files/only_content_25.xlsx'
output_parquet_file = 'only_content_25.parquet'

# Read the workbook into a DataFrame, then persist it as Parquet (pyarrow
# engine, index not written) for faster reading/writing later on.
df = pd.read_excel(input_excel_file)
df.to_parquet(output_parquet_file, index=False, engine='pyarrow')

print(f"Data successfully saved to Parquet format: {output_parquet_file}")



Data successfully saved to Parquet format: only_content_25.parquet


In [None]:
# Reload the Parquet copy and show its first five rows as a sanity check.
df_parquet = pd.read_parquet(output_parquet_file)
print(df_parquet.head(n=5))

### Functions and Main Code

In [None]:
def build_prompt_ai_complexity(description, web_content):
    """
    Return the classification prompt for one company.

    The company's description and website content are embedded in a fixed
    instruction text that asks for exactly one of the categories
    Low, Moderate, High, Advanced, Pioneering. Both arguments are interpolated
    as-is. No token counting is performed; only the prompt string is returned.
    """
    company_text = "\n".join(
        (f"Description: {description}", f"Website Content: {web_content}")
    )

    # The instruction text is runtime data sent to the model; its wording and
    # whitespace are intentionally left untouched.
    prompt = f"""
    Analyze the following text for descriptions of AI technology used by healthcare startups. 
    Based on the extracted information, classify the AI technology into one of the following categories: 
    Low, Moderate, High, Advanced, Pioneering.

    For each company description:

    **Algorithm Complexity**:
    - Low Complexity: Look for references to simple rule-based systems, decision trees, or basic regression models. 
      These systems will often rely on structured data and predefined rules.
    - Moderate Complexity: Identify mentions of basic machine learning models like supervised learning, 
      support vector machines, or decision trees. These systems should indicate some training and adaptation 
      but will mostly rely on structured data.
    - High Complexity: Focus on terms like deep learning, convolutional neural networks (CNNs), or 
      recurrent neural networks (RNNs). These systems are used for tasks like image analysis, 
      medical diagnostics, and personalized medicine. They will likely handle unstructured data such as medical images.
    - Advanced Complexity: Search for references to transfer learning, multimodal AI, or generative adversarial networks (GANs). 
      These systems will be more autonomous and able to integrate multiple data sources, such as genomics and clinical data.
    - Pioneering Complexity: Identify terms like federated learning, neurosymbolic AI, or edge AI. 
      These systems are fully autonomous, real-time, and capable of operating in distributed environments 
      like hospitals or global healthcare networks.

    **Integration into Healthcare Workflows**:
    - Look for mentions of integration with EHR systems, real-time clinical decision support, 
      or automation of clinical workflows.
    - Evaluate if the technology is described as seamlessly fitting into existing hospital or clinical environments.

    **Clinical Validation**:
    - Identify references to clinical trials, pilot studies, or regulatory approvals like FDA clearance or CE marking.
    - Look for partnerships with hospitals or medical institutions that validate the AI system's effectiveness.

    Use this information to classify each company's AI technology into the appropriate category and 
    evaluate their technological sophistication.

    {company_text}

    Respond with only one category: Low, Moderate, High, Advanced, Pioneering.
    """

    return prompt


import openai
from openai import OpenAIError

# Replace this with your API key. The assignment sets the key on the `openai`
# module itself; the string below is only a placeholder, not a working key.
# NOTE(review): the `client` object created near the top of the notebook reads
# OPENAI_API_KEY from the environment instead — confirm which credential is
# meant to be used, and avoid committing a real key here.
openai.api_key = "HERE OPENAI API / WE DELETED OURS FOR SECURITY"

def categorize_ai_complexity(description, web_content):
    """
    Ask the chat model to classify one company's AI complexity.

    The request uses the full instruction prompt from
    `build_prompt_ai_complexity`, so the model is told which categories to
    choose from (previously only the raw description/website text was sent).

    Args:
        description: Company description text.
        web_content: Website text for the company; may be an empty string.

    Returns:
        The model's reply with surrounding whitespace stripped, or
        "API Error" when the OpenAI call fails (immediately for
        non-transient errors, after all retries for transient ones), or
        "Error" for any other exception.
    """
    prompt = build_prompt_ai_complexity(description, web_content)
    messages = [
        {"role": "system", "content": "You are a helpful assistant classifying AI complexity."},
        {"role": "user", "content": prompt}
    ]

    # Failures that can succeed on a later attempt. Timeouts are a subclass of
    # APIConnectionError in the v1 SDK, so they are covered as well.
    transient_errors = (
        openai.RateLimitError,
        openai.APIConnectionError,
        openai.InternalServerError,
    )

    retries = 3  # Limit retry attempts to avoid blocking long processes
    for attempt in range(retries):
        try:
            # `openai.ChatCompletion` was removed in the v1 SDK; the
            # module-level client keeps using `openai.api_key`.
            response = openai.chat.completions.create(
                model="gpt-4",  # Use "gpt-3.5-turbo" if you want
                messages=messages,
            )

            # v1 responses are objects, so use attribute access.
            return response.choices[0].message.content.strip()

        except transient_errors as e:
            print(f"An OpenAI API error occurred: {e}")
            # Exponential backoff (1s, 2s, ...); no sleep after the last try.
            if attempt < retries - 1:
                time.sleep(2 ** attempt)

        except OpenAIError as e:
            # Authentication, bad request, etc.: retrying cannot help.
            print(f"An OpenAI API error occurred: {e}")
            return "API Error"

        except Exception as e:
            print(f"An unexpected error occurred: {e}")
            return "Error"

    # Every attempt hit a transient API failure.
    return "API Error"

def apply_ai_complexity_classification(row):
    """
    Classify the AI complexity for a single DataFrame row.

    Args:
        row: Mapping-like row (e.g. a pandas Series) with a
            'Full Description' entry and an optional 'Website Content' entry.

    Returns:
        Whatever `categorize_ai_complexity` returns for the cleaned inputs.

    Raises:
        KeyError: If the row has no 'Full Description' entry.
    """
    description = row['Full Description']
    web_content = row.get('Website Content', '')  # Default to empty string if no web content

    # A missing description would otherwise be sent to the model as the
    # literal text "nan"/"None".
    description = '' if pd.isna(description) else str(description)

    # If web content is missing or indicates inactivity, use only the description.
    # Non-string cells (e.g. numbers read from Excel) are converted first so
    # the `.lower()` check cannot raise AttributeError.
    if pd.isna(web_content):
        web_content = ''
    else:
        web_content = str(web_content)
        if "not active" in web_content.lower():
            web_content = ''

    # Categorize the AI complexity based on the description and web content
    return categorize_ai_complexity(description, web_content)

# Example usage with parallel processing
import pandas as pd
import concurrent.futures
from tqdm import tqdm

# Reload the workbook to classify, rebinding the module-level `df`.
# NOTE(review): this is an absolute, machine-specific path; the "Load Data"
# cell reads a workbook with the same file name through the relative
# `input_excel_file` path — confirm which location is intended.
df = pd.read_excel('/Users/daniel/RA_MTEC/classification_excel_files/only_content_25.xlsx')

def parallel_apply_classification(df, max_workers=10):
    """
    Classify every row of `df` concurrently on a thread pool.

    Each row is passed to `apply_ai_complexity_classification`; a tqdm bar
    tracks progress as results are collected.

    Args:
        df: DataFrame whose rows are classified.
        max_workers: Size of the thread pool.

    Returns:
        A list with one classification per row, in row order.
    """
    rows = [record for _, record in df.iterrows()]
    pool = concurrent.futures.ThreadPoolExecutor(max_workers=max_workers)
    with pool as executor:
        # executor.map yields results in input order, so the list lines up
        # with the DataFrame rows.
        outcomes = executor.map(apply_ai_complexity_classification, rows)
        progress = tqdm(outcomes, total=len(df))
        return list(progress)

# Apply the classification in parallel and store results in a new column.
# parallel_apply_classification builds its list from executor.map over
# df.iterrows(), i.e. one entry per row in row order, so it can be assigned
# directly as a column. Each row triggers an OpenAI request.
df['AI_Complexity_Category'] = parallel_apply_classification(df)

# Save the updated DataFrame to a file (parquet or Excel).
# The file is written relative to the current working directory, without the index.
df.to_parquet('classified_startups_ai_complexity.parquet', index=False)
print("AI complexity classification completed and saved.")
