In [None]:
# Input question and data
#
# Defines the natural-language task description and loads the spectral array
# that the rest of the notebook operates on. The commented-out alternatives
# switch the notebook to the waste-water COD example.

from pathlib import Path

import numpy as np

input_question = "I'm going to classify the following chinese medicine spectral data."
# input_question = "I'm going to predict the waste water spectral data quality of COD."

# Build the path from components instead of embedding a backslash: on
# Linux/macOS '\' is not a separator, so 'data\cn_medicine.npy' would be
# looked up as a single file name in the working directory.
data = np.load(Path('data') / 'cn_medicine.npy')
# data = np.load(Path('data') / 'H2Olabel_cs.npy')

print(data.shape)

(3, 50, 228)


In [None]:
# Entity extraction
#
# Run the extractor on the question text and pull out the two fields the
# later cells read: `research_object` (passed as the retrieval query) and
# `task_type` (compared against 'classification' / 'regression' /
# 'anomaly detection' to choose a branch).

import Entity_extraction

extracted = Entity_extraction.extract_entities_and_task(input_question)

# Unpack both fields in one step; the key order matches the target order.
research_object, task_type = (
    extracted[field] for field in ('research_object', 'task_type')
)

print("Research object:", research_object)
print("Task type:", task_type)

Research object: chinese medicine spectral data
Task type: classification


In [None]:
# Knowledge Retrieval
#
# Load the structured paper records from JSON, build the search index with
# Retrieval.build_bm25_index, and query it with the extracted research object.
# `matched_info` holds whatever Retrieval.search_papers_with_bm25 returns for
# the `top_k` setting below.

import Retrieval

# Tunable settings for this cell.
top_k = 2
json_path = "./structured_papers1.json"

papers_list = Retrieval.load_papers_from_json(json_path)
bm25_index, tokenized_names = Retrieval.build_bm25_index(papers_list)

# Collect the search arguments by keyword so the call site stays short.
_search_kwargs = {
    'papers': papers_list,
    'bm25': bm25_index,
    'tokenized_paper_names': tokenized_names,
    'query': research_object,
    'top_k': top_k,
}
matched_info = Retrieval.search_papers_with_bm25(**_search_kwargs)
print(matched_info)

Paper_Name: Classification of Chinese Herbal Medicine by Laser-Induced Breakdown Spectroscopy with Principal Component Analysis and Artificial Neural Network
Relevant Scores: 6.801548501114336
Paper_Name: Application of handheld near infrared spectrometer in quality control of traditional Chinese medicine: Rapid screening and quantitative analysis of Lonicerae Japonicae Flos adulteration
Relevant Scores: 5.6501807410146805
[{'paper_name': 'Classification of Chinese Herbal Medicine by Laser-Induced Breakdown Spectroscopy with Principal Component Analysis and Artificial Neural Network', 'preprocessing_method': 'SNVFD', 'feature_extracting_method': 'principal component analysis (PCA)'}, {'paper_name': 'Application of handheld near infrared spectrometer in quality control of traditional Chinese medicine: Rapid screening and quantitative analysis of Lonicerae Japonicae Flos adulteration', 'preprocessing_method': 'SNVFD', 'feature_extracting_method': 'CARS'}]


In [None]:
# Preprocess and feature extraction
#
# Ask the agent which preprocessing / feature-extraction methods to apply for
# each matched paper, run them on `data`, and keep the outputs belonging to
# the first matched paper for the following cells.

import Agent

methods_map = Agent.decide_methods_per_paper(matched_info)
print(methods_map)
results = Agent.process_all_papers(data, methods_map)

# `results` is keyed by paper name; only the first entry of `matched_info`
# is carried forward.
_first_paper_result = results[matched_info[0]['paper_name']]

processed_data = _first_paper_result['processed_data']
print(processed_data.shape)

features = _first_paper_result['extracted_features']
for feature_name, feature_array in features.items():
    print(f"Feature '{feature_name}' shape:", feature_array.shape)

{'Classification of Chinese Herbal Medicine by Laser-Induced Breakdown Spectroscopy with Principal Component Analysis and Artificial Neural Network': {'preprocessing': ['snv_fd'], 'features': ['pca_feature_extraction']}, 'Application of handheld near infrared spectrometer in quality control of traditional Chinese medicine: Rapid screening and quantitative analysis of Lonicerae Japonicae Flos adulteration': {'preprocessing': ['snv_fd'], 'features': []}}
(3, 50, 228)
Feature 'pca_feature_extraction' shape: (3, 50, 5)


In [None]:
# Build dataset
#
# Split the extracted features into train / validation / test sets using the
# dataset class that matches `task_type`. Each branch exposes `train_data`,
# `val_data` and `test_data`, plus the ground-truth values for its task.
# Note that the anomaly-detection branch stores its dataset in `ds`, while
# the other two branches use `dataset`.

from dataset import CLS_Dataset,REG_Dataset,ANO_Dataset

if task_type == 'classification':
    dataset = CLS_Dataset(
        feature=features['pca_feature_extraction'],
        label_type='CN_medicine',
        n_classes=3,
        n_train_per_class=6,
        n_val_per_class=2,
        n_test_per_class=2,
        random_seed=0
    )
    dataset.summary()
    train_data = dataset.train_data
    val_data   = dataset.val_data
    test_data  = dataset.test_data

    true_labels_val  = dataset.true_labels_val
    true_labels_test = dataset.true_labels_test

elif task_type == 'regression':
    X = features['lambert_pearson_feature_extraction']
    # Use a forward slash here. The previous literal 'data\water_label.npy'
    # was a non-raw string containing the invalid escape sequence '\w'
    # (a warning on current Python, scheduled to become an error), and the
    # backslash separator only resolves on Windows. 'data/...' works on
    # Windows, Linux and macOS alike.
    Y = np.load('data/water_label.npy')
    dataset = REG_Dataset(
        X=X,
        Y=Y,
        n_train=12,
        n_val=10,
        n_test=10,
        random_seed=2
    )
    dataset.summary()

    train_data = dataset.train_data
    val_data   = dataset.val_data
    test_data  = dataset.test_data

    y_val_true  = dataset.y_val_true
    y_test_true = dataset.y_test_true

elif task_type == 'anomaly detection':
    ds = ANO_Dataset(
        X = features['pca_feature_extraction'],
        normal_class=2,
        n_train_norm=4,
        n_val_norm=1,
        n_test_norm=3,
        n_inter_anom=5,
        n_intra_anom=2,
        random_seed=0
    )
    ds.summary()
    train_data = ds.train_data
    val_data   = ds.val_data
    test_data  = ds.test_data
    y_val      = ds.y_val
    y_test     = ds.y_test


Training samples: 18
Validation samples: 6, labels: ['山银花金银花混合物', '金银花', '山银花', '金银花', '山银花', '山银花金银花混合物']
Test samples: 6, labels: ['山银花', '金银花', '山银花金银花混合物', '山银花', '金银花', '山银花金银花混合物']


In [None]:
# Multi-task reasoning
#
# Instantiate the agent that matches `task_type` and run it. The
# classification agent takes the individual splits and label lists; the
# regression and anomaly-detection agents take the dataset object itself.

import os

from Generate import SpectrumCLS,SpectrumReg,SpectrumAno

# Read the API key once from the environment instead of repeating a literal
# in every branch, so a real key never has to be saved in the notebook.
# The fallback keeps the original placeholder value, so behaviour is
# unchanged when SPECTRUM_API_KEY is not set.
api_key = os.environ.get("SPECTRUM_API_KEY", "...")

if task_type == "classification":
    agent_cls = SpectrumCLS(
        train_data=train_data,
        test_data=test_data,
        val_data=val_data,
        true_labels_val=true_labels_val,
        true_labels_test=true_labels_test,
        api_key=api_key
    )
    agent_cls.run()
elif task_type == "regression":
    agent = SpectrumReg(
        dataset=dataset,
        api_key=api_key
    )
    agent.run()
elif task_type == 'anomaly detection':
    agent = SpectrumAno(
        dataset=ds,
        api_key=api_key
    )
    agent.run()

=== Initial test set evaluation ===
Initial test accuracy: 100.00%
\n=== Round 1 validation ===
Validation predictions: ['山银花金银花混合物', '金银花', '山银花', '金银花', '山银花', '山银花金银花混合物']
Validation accuracy: 100.00%, wrong indices: []
Validation perfect, stopping early.
Final best test accuracy: 100.00%


In [None]:
# Comparison with other models
#
# Train and evaluate the baseline pipeline that corresponds to `task_type`
# on the same train/test split. Unrecognised task types are skipped, leaving
# `pipeline` and `results` untouched.

from other_models import ClassificationModelPipeline,RegressionModelPipeline,AnomalyModelPipeline

# Each entry is a zero-argument factory so that only the selected branch
# touches its inputs (e.g. `ds` exists only in the anomaly-detection flow).
_pipeline_factories = {
    'classification': lambda: ClassificationModelPipeline(train_data, test_data),
    'regression': lambda: RegressionModelPipeline(train_data, test_data),
    'anomaly detection': lambda: AnomalyModelPipeline(ds.train_data, ds.test_data),
}

_build_pipeline = _pipeline_factories.get(task_type)
if _build_pipeline is not None:
    pipeline = _build_pipeline()
    results = pipeline.train_and_evaluate()



SVM: Accuracy = 66.67%
KNN: Accuracy = 66.67%
RandomForest: Accuracy = 66.67%
CNN1D: Accuracy = 66.67%
Transformer: Accuracy = 33.33%
