In [1]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, recall_score, accuracy_score

file_path = 'C:/Users/roza/Downloads/LLM_df_stringify_filtered.csv'
df = pd.read_csv(file_path)

print("Dataset shape:", df.shape)
print("Dataset columns:", df.columns)
print("First few rows of the dataset:")
print(df.head())

feature_col = 'Symptom_labels'
target_col = 'diseases'

tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df[feature_col].astype('str'))
y = df[target_col].astype('category').cat.codes

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

params = {
    'objective': 'multi:softprob',  
    'num_class': len(set(y)),     
    'eval_metric': 'mlogloss',   
}

dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

bst = xgb.train(params, dtrain, num_boost_round=100)

y_pred_prob = bst.predict(dtest)
y_pred = y_pred_prob.argmax(axis=1)

# Evaluate the model
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
accuracy = accuracy_score(y_test, y_pred)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"accuracy: {accuracy}")

Dataset shape: (26350, 3)
Dataset columns: Index(['Unnamed: 0', 'diseases', 'Symptom_labels'], dtype='object')
First few rows of the dataset:
   Unnamed: 0                   diseases  \
0           0  abdominal aortic aneurysm   
1           1  abdominal aortic aneurysm   
2           2  abdominal aortic aneurysm   
3           3  abdominal aortic aneurysm   
4           4  abdominal aortic aneurysm   

                                      Symptom_labels  
0  shortness of breath, palpitations, arm swellin...  
1  shortness of breath, palpitations, burning abd...  
2  shortness of breath, palpitations, sharp abdom...  
3  shortness of breath, palpitations, arm swellin...  
4  shortness of breath, palpitations, arm swellin...  
Precision: 0.8152703622646668
Recall: 0.7967741935483871
accuracy: 0.7967741935483871
