In [63]:
import numpy as np

# Names kept when real-time predictions are required.
_REAL_TIME_MODELS = frozenset(
    {"YOLO", "LSTM", "Transformer", "CNN", "GRU", "Fast R-CNN", "MobileNet"}
)

# Names dropped when computation resources are "low".
_HEAVY_MODELS = frozenset(
    {"Transformer", "BERT", "XLNet", "RoBERTa", "ResNet", "DenseNet"}
)

# Names kept when data quality is "low".
_ROBUST_MODELS = frozenset(
    {"Random Forest", "Decision Tree", "KNN", "Logistic Regression", "Naive Bayes", "SVM"}
)


def _model_aliases(model):
    """Return the names a suggestion can be matched by in the filters.

    Suggestions may carry a trailing parenthesised part, either an annotation
    ("BERT (API / Login)") or an acronym ("K-Nearest Neighbors (KNN)").  Both
    the text before the parenthesis and the text inside it are returned so
    the filters can match on the bare model name.
    """
    base, sep, rest = model.partition(" (")
    aliases = {base.strip()}
    if sep and rest.endswith(")"):
        aliases.add(rest[:-1].strip())
    return aliases


def select_model(data_type, num_samples, num_features=None, task_type="classification", is_structured=True,
                 has_labels=True, is_real_time=False, computation_resources="high", data_quality="high"):
    """
    Suggest ML/DL models based on input data characteristics.

    Parameters:
        data_type (str): Type of data ("tabular", "text", "image", "time-series", "audio", "graph").
            Any other value yields no suggestions.
        num_samples (int): Number of samples in the dataset. Used as a size threshold for
            tabular (1000), text classification (5000) and image classification (1000).
        num_features (int, optional): Number of features; only used for labelled tabular data.
            Default is None.
        task_type (str): Task type (e.g., "classification", "regression", "clustering", "detection",
            "segmentation"). The accepted values depend on data_type; unknown combinations add nothing.
        is_structured (bool): When False, extra deep-learning suggestions are added for
            "text", "image" and "audio" data.
        has_labels (bool): Only used for tabular data. When False, clustering models are
            suggested regardless of task_type.
        is_real_time (bool): When True, keep only models from the real-time list.
        computation_resources (str): Available resources ("low", "medium", "high").
            "low" removes the heavy models; other values have no effect.
        data_quality (str): Quality of data ("low", "medium", "high").
            "low" keeps only the robust classical models; other values have no effect.

    Returns:
        set: Suggested model names for the given input (possibly empty, unordered).
    """
    models = []

    if data_type == "tabular":
        if has_labels:
            # Suggestions driven by the feature count (independent of task_type).
            if num_features and num_features > 100:
                models.extend(["XGBoost", "LightGBM", "Random Forest"])
            elif num_features and num_features <= 100:
                models.extend(["Logistic Regression", "Decision Tree"])
            else:
                models.extend(["Random Forest", "SVM"])

            # Suggestions driven by dataset size and task type.
            if num_samples < 1000:
                if task_type == "classification":
                    models.extend(["Logistic Regression", "Decision Tree", "SVM", "K-Nearest Neighbors (KNN)", "Naive Bayes", "Perceptron"])
                elif task_type == "regression":
                    models.extend(["Linear Regression", "Decision Tree Regressor", "Ridge Regression", "Lasso Regression"])
            else:
                if task_type == "classification":
                    models.extend(["Random Forest", "XGBoost", "LightGBM", "CatBoost", "Gradient Boosting", "Extra Trees"])
                elif task_type == "regression":
                    models.extend(["Random Forest Regressor", "XGBoost Regressor", "CatBoost Regressor", "LightGBM Regressor", "ElasticNet"])
        else:
            models.extend(["K-Means", "DBSCAN", "Gaussian Mixture Model", "Hierarchical Clustering", "Affinity Propagation"])

    elif data_type == "text":
        if task_type == "classification":
            if num_samples < 5000:
                models.extend(["Naive Bayes", "Logistic Regression", "SVM (with TF-IDF features)", "CRF (Conditional Random Fields)"])
            else:
                models.extend(["BERT (API / Login)", "RoBERTa (API / Login)", "DistilBERT (API / Login)", "ALBERT (API / Login)", "XLNet (API / Login)", "T5 (API / Login)"])
        elif task_type == "clustering":
            models.extend(["KMeans", "Agglomerative Clustering", "Spectral Clustering", "Latent Dirichlet Allocation (LDA)"])
        elif task_type == "generation":
            models.extend(["GPT-3 (API / Login / $)", "T5 (API / Login)", "BART (API / Login)", "CTRL (API / Login)", "XLNet (API / Login)", "Transformer (API)"])
        elif task_type == "translation":
            models.extend(["MarianMT (API)", "BERT-based Translation Models (API)", "T5 (API / Login)", "mBART (API)", "Transformer (API)"])
        elif task_type == "summarization":
            models.extend(["BART (API / Login)", "T5 (API / Login)", "GPT-3 (API / Login / $)", "PEGASUS (API / Login)", "DistilBERT (API / Login)"])

    elif data_type == "image":
        if task_type == "classification":
            if num_samples < 1000:
                models.extend(["SVM (with HOG features)", "KNN", "Random Forest", "Logistic Regression", "YOLO"])
            else:
                models.extend(["Convolutional Neural Network (CNN)", "ResNet", "EfficientNet", "DenseNet", "Inception", "VGGNet", "YOLO"])
        elif task_type == "detection":
            models.extend(["YOLO", "Faster R-CNN", "SSD", "RetinaNet", "CenterNet"])
        elif task_type == "segmentation":
            models.extend(["YOLO", "UNet", "DeepLab", "Mask R-CNN", "SegNet"])
        elif task_type == "generation":
            models.extend(["Generative Adversarial Networks (GANs)", "DCGAN", "CycleGAN", "StyleGAN", "BigGAN"])
        elif task_type == "style-transfer":
            models.extend(["Fast Neural Style Transfer", "AdaIN", "CycleGAN", "Deep Photo Style Transfer"])
        elif task_type == "super-resolution":
            models.extend(["SRCNN", "VDSR", "ESRGAN", "FSRCNN", "EDSR"])

    elif data_type == "time-series":
        if task_type == "classification":
            models.extend(["LSTM", "GRU", "Random Forest", "XGBoost", "Time Series Forest"])
        elif task_type == "regression":
            models.extend(["ARIMA", "Prophet", "SARIMA", "LSTM", "N-BEATS"])
        elif task_type == "forecasting":
            models.extend(["Prophet (API)", "SARIMA", "N-BEATS", "Transformer", "ETS"])
        elif task_type == "anomaly-detection":
            models.extend(["LSTM Autoencoder", "Isolation Forest", "One-Class SVM", "AutoRegressive Integrated Moving Average (ARIMA)", "DBSCAN"])
        elif task_type == "imputation":
            models.extend(["KNN Imputer", "Multivariate Imputation by Chained Equations (MICE)", "Deep Imputer", "Matrix Factorization", "LSTM"])
        elif task_type == "generation":
            models.extend(["GANs for Time Series", "Variational Autoencoders (VAE)", "LSTM", "N-BEATS"])

    elif data_type == "audio":
        if task_type == "classification":
            models.extend(["CNN (on spectrogram features)", "LSTM", "GRU", "MFCC-based SVM"])
        elif task_type == "speech-to-text":
            models.extend(["Kaldi (API)", "DeepSpeech (API)", "Wav2Vec 2.0 (API)", "Jasper (API)", "Conformer (API)"])
        elif task_type == "generation":
            models.extend(["Tacotron", "WaveNet", "MelGAN", "HiFi-GAN", "FastSpeech", "Vocoder"])
        elif task_type == "denoising":
            models.extend(["DeepDenoisingAutoencoder", "Denoising Convolutional Autoencoder", "Wave-U-Net"])
        elif task_type == "speaker-recognition":
            models.extend(["VGG-SpeakerNet", "ResNet-based Speaker Recognition", "DeepSpeaker"])

    elif data_type == "graph":
        if task_type == "classification":
            models.extend(["Graph Neural Networks (GNN)", "GraphSAGE", "Node2Vec", "GCN", "Graph Attention Networks (GAT)"])
        elif task_type == "link-prediction":
            models.extend(["Graph Autoencoders", "DeepWalk", "GraphSAGE", "LINE", "Struc2Vec", "Node2Vec"])
        elif task_type == "community-detection":
            models.extend(["Louvain Method", "Girvan-Newman Algorithm", "Spectral Clustering", "Modularity Optimization"])
        elif task_type == "graph-generation":
            models.extend(["GraphGAN", "Variational Graph Autoencoder", "GraphVAE", "MolGAN"])

    # Extra suggestions for data explicitly flagged as unstructured.
    if not is_structured:
        if data_type == "text":
            models.extend(["BERT (API / Login)", "RoBERTa (API / Login)", "DistilBERT (API / Login)", "T5 (API / Login)"])
        elif data_type == "image":
            models.extend(["CNN", "ResNet", "YOLO", "EfficientNet", "DenseNet"])
        elif data_type == "audio":
            models.extend(["CNN", "LSTM", "GRU", "WaveNet", "Tacotron"])

    # The filters match on the bare model name (see _model_aliases): an exact
    # comparison against the display string would never match annotated
    # entries such as "BERT (API / Login)" or "SVM (with TF-IDF features)".
    if is_real_time:
        models = [model for model in models if not _model_aliases(model).isdisjoint(_REAL_TIME_MODELS)]

    if computation_resources == "low":
        models = [model for model in models if _model_aliases(model).isdisjoint(_HEAVY_MODELS)]

    if data_quality == "low":
        models = [model for model in models if not _model_aliases(model).isdisjoint(_ROBUST_MODELS)]

    # A set removes the duplicates produced by overlapping suggestion rules.
    return set(models)


# พารามิเตอร์การกำหนดค่าชุดข้อมูล 📊

## ลักษณะข้อมูล 📝
- **`data_type`** (str): ประเภทข้อมูล 🔍
     - ตัวอย่าง: "tabular", "text", "image", "time-series", "audio", "graph"

- **`num_samples`** (int): จำนวนตัวอย่างทั้งหมดในชุดข้อมูล 🔢

- **`num_features`** (int, optional): 
    - จำนวนคุณลักษณะสำหรับข้อมูลที่มีโครงสร้าง 🧩
    - ค่าเริ่มต้น: `None`

## คุณลักษณะการเรียนรู้ 🤖
- **`task_type`** (str): ประเภทงานการเรียนรู้ของเครื่อง 🎯
     - ตัวอย่าง: "classification", "regression", "clustering", "detection", "segmentation"

- **`is_structured`** (bool): 
     - `True`: ข้อมูลที่มีโครงสร้าง (เช่น ตาราง) 📑
     - `False`: ข้อมูลที่ไม่มีโครงสร้าง (เช่น ข้อความ, รูปภาพ) 🖼️

- **`has_labels`** (bool):
     - `True`: การเรียนรู้แบบมีผู้สอน (มีข้อมูลป้ายกำกับ) 🏷️
     - `False`: งานแบบไม่มีผู้สอน 🕵️

## ข้อกำหนดระบบ 💻
- **`is_real_time`** (bool): 
     - `True`: ระบบต้องการการทำนายแบบเรียลไทม์ ⚡
     - `False`: สามารถประมวลผลแบบเป็นชุดได้ 📦

- **`computation_resources`** (str): ทรัพยากรการคำนวณที่มี 🖥️
     - ตัวเลือก: "low", "medium", "high"

- **`data_quality`** (str): การประเมินคุณภาพชุดข้อมูล 💎
     - ตัวเลือก: "low", "medium", "high"

In [None]:
# Supported task_type values for each data_type:
# tabular     : classification, regression (with has_labels=False, clustering models are returned instead)
# text        : classification, clustering, generation, translation, summarization
# image       : classification, detection, segmentation, generation, style-transfer, super-resolution
# time-series : classification, forecasting, anomaly-detection, imputation, generation, regression
# audio       : classification, speech-to-text, generation, denoising, speaker-recognition
# graph       : classification, link-prediction, community-detection, graph-generation

In [73]:
select_model(
   data_type="image",            # data type: "tabular", "text", "image", "time-series", "audio", "graph"
   num_samples=60,               # number of samples (rows) in the dataset; integer
   num_features=10,                 # number of features (columns) in the data; integer
   
   task_type="classification",         # task type: "classification", "regression", "clustering", "detection", "segmentation", "generation", "link-prediction", "speech-to-text", "forecasting"
   is_structured=True,             # whether the data is structured: True/False
   has_labels=True,                # whether labels are available: True/False
   
   is_real_time=False,             # whether real-time predictions are required: True/False
   computation_resources="high",   # available computation resources: "low", "medium", "high"
   data_quality="high"             # data quality: "low", "medium", "high"
)

{'KNN',
 'Logistic Regression',
 'Random Forest',
 'SVM (with HOG features)',
 'YOLO'}