# Diagnosis of acute inflammation using XGBoost (classification)

## Acute Inflammations in the urinary system
The data was created by a medical expert as a data set to test the expert system, which will perform the presumptive diagnosis of two diseases of the urinary system.

In [1]:
import numpy as np
import logging
from giza_actions.action import Action, action
from giza_actions.task import task
from sklearn import preprocessing
import pandas as pd
from sklearn.model_selection import train_test_split
import xgboost as xgb
from xgboost import XGBClassifier as XGBC, XGBRegressor as XGBR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE
import joblib
import onnx
from onnxmltools.convert import convert_xgboost
from skl2onnx.common.data_types import FloatTensorType, DoubleTensorType


## Collect datasets

We are looking for data related to the medical field. In the UCI dataset, we found data on two types of [acute inflammation](https://archive.ics.uci.edu/dataset/184/acute+inflammations) classifications. This dataset provides detailed information about these two types of inflammation, including patient temperature and symptoms. By analyzing this data, we hope to find patterns of inflammation and hope to construct verifiable medical care projects to help patients with symptoms diagnose in time.

Through this work, we also hope to improve the diagnostic accuracy and speed of doctors for these two types of inflammation. We believe that through scientific data analysis methods, we can better understand the patterns of inflammation and provide more accurate predictions and more effective treatment plans.

## Dataset Variable Information
 - a1	Temperature of patient  { 35C-42C }	
 - a2	Occurrence of nausea  { yes, no }	
 - a3	Lumbar pain  { yes, no }	
 - a4	Urine pushing (continuous need for urination)  { yes, no }	
 - a5	Micturition pains  { yes, no }	
 - a6	Burning of urethra, itch, swelling of urethra outlet  { yes, no }	
 - d1	decision: Inflammation of urinary bladder  { yes, no }	
 - d2	decision: Nephritis of renal pelvis origin { yes, no }	

### Prepare datasets and create loaders

In [2]:
def parser_acute_inflammations(line: str) -> str:
    line_list = line.replace('no', '0').replace('yes', '1').replace(
        '\n', '').replace(',', '.').split('\t')
    new_line_list = line_list[0:-2]
    if line_list[-2] == '0':
        if line_list[-1] == '0':
            new_line_list.append('0')
        else:
            new_line_list.append('2')
    else:
        if line_list[-1] == '0':
            new_line_list.append('1')
        else:
            new_line_list.append('3')
    return '\t'.join(new_line_list) + '\n'

def convert_list_line_to_str(line):
    res = ''
    for index, ele in enumerate(line):
        if index == len(line) - 1:
            res += str(ele) + '\n'
        else:
            res += str(ele) + '\t'
    return res
    
def generater_column_normalization_info(nparray_data):
    column_max = ["max"]
    column_min = ["min"]
    for column_index in range(len(nparray_data[0])):
        column = nparray_data[:, column_index]
        column_max.append(np.max(column))
        column_min.append(np.min(column))

    with open("column_normalization_info.tsv", 'w+', encoding="utf-8") as f:
        f.writelines([convert_list_line_to_str(column_max), convert_list_line_to_str(column_min)])
    
    
@task(name=f'Prepare Datasets')
def prepare_datasets():
    print("Prepare dataset...")
    dataset_path = "data/diagnosis.data"
    with open(dataset_path, encoding="utf-16") as f:
        lines = f.readlines();
        remove_index = []
        for index, line in enumerate(lines):
            line = parser_acute_inflammations(line)
            if len(line):
                lines[index] = [float(ele) for ele in line.replace('\n', '').split('\t')]
            else:
                remove_index.append(index)
        # remove none line
        for index in remove_index:
            lines.remove(lines[index - index_offset])
        x = [line[:-1] for line in lines]
        y = [[int(line[-1])] for line in lines]
        x = np.array(x)
        generater_column_normalization_info(x)
        min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
        x = min_max_scaler.fit_transform(x)
        combined_array = np.concatenate((x, y), axis=1)
        print("✅ Datasets prepared successfully")
    
        return x, y

## XGBoost classification model training

In [3]:
@task(name=f'Training model')
def get_model(x, y, model_type: str = "classification"):
    # Divide the training set and test set
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
    from sklearn.preprocessing import LabelEncoder
    le = LabelEncoder()
    y_train = le.fit_transform(y_train)
    XGBmodel = XGBC(n_estimators=10).fit(x_train, y_train)
   
    # Get Score
    print("Score", XGBmodel.score(x_test, y_test))
    # Get Mean Square Error
    print("Mean Square Error", MSE(y_test, XGBmodel.predict(x_test)))
    # Get feature importance
    print("Feature importance", XGBmodel.feature_importances_)

    return XGBmodel

## Export XGBoost model to ONNX
Use the `convert_xgboost` interface provided by sklearn to convert and export the model in ONNX format

In [4]:
@task(name=f'Export to ONNX')
def onnx_export(model, filename, input_size = 6, target_opset =15):
    # Export model as a ONNX
    onnx_model_converted = convert_xgboost(XGBmodel, 'tree-based classifier',
                             [('input', FloatTensorType([1, input_size]))],
                             target_opset=target_opset)
    onnx.save_model(onnx_model_converted, filename)

## Execution

In [5]:
@action(name=f'Model Development', log_prints=True )
def develop_model():
    x, y = prepare_datasets()
    XGBmodel = get_model(x, y)

    # Convert to ONNX
    onnx_export(XGBmodel, "acute_inflammation_xgboost.onnx", 6, 15)
    
develop_model()

  y = column_or_1d(y, warn=True)


[Completed(message=None, type=COMPLETED, result=UnpersistedResult(type='unpersisted', artifact_type='result', artifact_description='Unpersisted result of type `tuple`')),
 Completed(message=None, type=COMPLETED, result=UnpersistedResult(type='unpersisted', artifact_type='result', artifact_description='Unpersisted result of type `XGBClassifier`')),
 Completed(message=None, type=COMPLETED, result=UnpersistedResult(type='unpersisted', artifact_type='result', artifact_description='Unpersisted result of type `NoneType`'))]

In [6]:
# # Error execution
# if __name__ == "__main__":
#     action_deploy = Action(entrypoint=develop_model, name="acute_inflammation_xgboost_action")
#     action_deploy.serve(name="acute_inflammation_xgboost_deployment")