In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns


# Symptom disease file

## 1 clean data

### 1.0  Load and check data

In [2]:
##Google collab import
#from google.colab import drive
#drive.mount('/content/drive')
#df=pd.read_csv('/content/drive/MyDrive/Notebooks/MedAI/raw_data/Final_Augmented_dataset_Diseases_and_Symptoms.csv')
#df_symp=df

In [3]:
df_symp = pd.read_csv('../raw_data/Final_Augmented_dataset_Diseases_and_Symptoms.csv')

In [4]:
df_symp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 246945 entries, 0 to 246944
Columns: 378 entries, diseases to neck weakness
dtypes: int64(377), object(1)
memory usage: 712.2+ MB


In [5]:
df_symp.head()

Unnamed: 0,diseases,anxiety and nervousness,depression,shortness of breath,depressive or psychotic symptoms,sharp chest pain,dizziness,insomnia,abnormal involuntary movements,chest tightness,...,stuttering or stammering,problems with orgasm,nose deformity,lump over jaw,sore in nose,hip weakness,back swelling,ankle stiffness or tightness,ankle weakness,neck weakness
0,panic disorder,1,0,1,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,panic disorder,0,0,1,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,panic disorder,1,1,1,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,panic disorder,1,0,0,1,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
4,panic disorder,1,1,0,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Some symptoms have only 1 unique value, which makes no sense if the value is 0, so we will drop them.
# Count the unique values of every column to spot the columns with only 1 unique value.

df_symp.nunique()

diseases                            773
anxiety and nervousness               2
depression                            2
shortness of breath                   2
depressive or psychotic symptoms      2
                                   ... 
hip weakness                          1
back swelling                         1
ankle stiffness or tightness          1
ankle weakness                        2
neck weakness                         1
Length: 378, dtype: int64

In [7]:
# Check the shape
print(df_symp.shape)


(246945, 378)


In [8]:
# Number of observations (rows) for each disease
df_symp['diseases'].value_counts()

diseases
cystitis                          1219
vulvodynia                        1218
nose disorder                     1218
complex regional pain syndrome    1217
spondylosis                       1216
                                  ... 
typhoid fever                        1
rocky mountain spotted fever         1
open wound of the knee               1
hypergammaglobulinemia               1
open wound due to trauma             1
Name: count, Length: 773, dtype: int64

In [9]:
# Distribution of the number of symptoms per observation:
# row-wise sum over every column after the first one, then count how often each total occurs
df_symp.iloc[:,1:].sum(axis=1).value_counts()

5     57989
6     51116
4     47283
7     34037
3     25451
8     16425
2      6448
9      5980
10     1534
1       443
11      231
12        8
Name: count, dtype: int64

In [10]:
# Share of observations for each symptom count: the same row-wise symptom totals
# as the previous cell, expressed as a fraction of all rows instead of a raw count
df_symp.iloc[:,1:].sum(axis=1).value_counts(normalize=True)

5     0.234826
6     0.206993
4     0.191472
7     0.137832
3     0.103063
8     0.066513
2     0.026111
9     0.024216
10    0.006212
1     0.001794
11    0.000935
12    0.000032
Name: count, dtype: float64


### 1.1 Clean symptoms and disease

In [11]:
# Remove the diseases that appear in a single observation only
# Rows available for each disease
class_counts = df_symp['diseases'].value_counts()
# Names of the diseases observed more than once
seen_more_than_once = (class_counts > 1).to_numpy()
filtered_classes = class_counts.index[seen_more_than_once]
# Keep only the rows whose disease is in that list
keep_row = df_symp['diseases'].isin(filtered_classes)
df_symp_disease_filtered = df_symp[keep_row]
df_symp_disease_filtered.shape

(246926, 378)

In [12]:
# Columns that hold exactly one distinct value across the filtered rows
distinct_per_column = df_symp_disease_filtered.nunique()
columns_single_value = distinct_per_column.index[(distinct_per_column == 1).to_numpy()].tolist()
# Keep those columns in their own frame ...
df_symp_single_value = df_symp_disease_filtered.loc[:, columns_single_value]
# ... and drop them from the working frame
df_symp_filtered = df_symp_disease_filtered.drop(columns_single_value, axis=1)

In [13]:
df_symp_filtered.shape

(246926, 326)

In [14]:
# Count fully duplicated rows (same disease and same value in every remaining symptom column)
# NOTE(review): no drop_duplicates() appears in the cells shown, so identical rows can end up
# on both sides of the later train/test split — confirm this is intended.
df_symp_filtered.duplicated().sum()

57298

### 1.2 Generating Feature / target datasets

In [15]:
# Feature matrix: every remaining column except the target
X = df_symp_filtered.drop(columns='diseases')
X.shape

(246926, 325)

In [16]:
# Target vector: the disease name of each observation
y = df_symp_filtered.loc[:, 'diseases']
y.shape

(246926,)

## 2 Classification model

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, recall_score, precision_score, f1_score

In [18]:
# Fit the encoder on the disease names, then map every name to its integer code
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)
y_encoded

array([516, 516, 516, ..., 491, 491, 491])

In [19]:
import pickle

# Save the trained encoder to label_encoder.pkl in the current working directory
with open("label_encoder.pkl", "wb") as encoder_file:
    pickle.dump(label_encoder, encoder_file)

In [20]:
# Hold out 35% of the observations for evaluation.
# stratify keeps each disease's share (approximately) the same in train and test;
# it does not balance the classes. random_state fixes the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded,
    stratify=y_encoded,
    test_size=0.35,
    random_state=42,
)

### 2.1 Random forest

#### 2.1.1 Training for evaluation of model (Spliting)

In [38]:
# Random forest used for the evaluation on the held-out split.
rf_classifier = RandomForestClassifier(
    n_estimators=100,    # number of trees
    max_depth=30,        # upper bound on the depth of each tree
    min_samples_split=5, # a node needs at least 5 samples to be split further
    min_samples_leaf=2,  # forces leaves to have at least 2 samples
    max_features="sqrt", # features considered at each split
    n_jobs=-1,           # use all available CPU cores
    random_state=42      # same seed as the train/test split, so the reported metrics are reproducible
)


In [39]:
rf_classifier.fit(X_train, y_train)

In [40]:
y_pred = rf_classifier.predict(X_test)

y_pred

array([659, 277, 172, ..., 393, 329, 354])

In [41]:
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.7388487127567255

In [28]:
from sklearn.metrics import recall_score

# Macro-averaged recall: unweighted mean of the per-class recalls, so rare diseases weigh as much as frequent ones
macro_recall = recall_score(y_test, y_pred, average='macro')

# Weighted-averaged recall: per-class recalls weighted by class support.
# For single-label predictions this is the same quantity as the overall accuracy.
# NOTE(review): the stored output of this cell does not match the accuracy stored in the
# previous cell and the execution counts are out of order, so the stored numbers presumably
# come from different runs — re-run both cells on the same model before quoting them.
weighted_recall = recall_score(y_test, y_pred, average='weighted')

print(f"Macro-Averaged Recall: {macro_recall:.4f}")
print(f"Weighted-Averaged Recall: {weighted_recall:.4f}")

Macro-Averaged Recall: 0.7321
Weighted-Averaged Recall: 0.6148


In [None]:
# Classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=_names=label_encoder.classes_))


#### 2.1.2 Training for app (full dataset)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, recall_score, precision_score, f1_score

In [3]:
#Label_Encoding the target
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
y_encoded

NameError: name 'LabelEncoder' is not defined

In [None]:
# Random forest trained on the full dataset for the app (no held-out evaluation).
rf_classifier = RandomForestClassifier(
    n_estimators=50,    # number of trees
    max_depth=30,       # upper bound on the depth of each tree
    n_jobs=-1,          # use all available CPU cores
    random_state=42     # fixed seed so that re-training gives the same model
)

In [None]:
rf_classifier.fit(X, y_encoded)

In [None]:
df_symp_filtered[0:10]

Unnamed: 0,diseases,anxiety and nervousness,depression,shortness of breath,depressive or psychotic symptoms,sharp chest pain,dizziness,insomnia,abnormal involuntary movements,chest tightness,...,joint swelling,redness in or around nose,wrinkles on skin,foot or toe weakness,hand or finger cramps or spasms,back stiffness or tightness,wrist lump or mass,low urine output,sore in nose,ankle weakness
0,panic disorder,1,0,1,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,panic disorder,0,0,1,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,panic disorder,1,1,1,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,panic disorder,1,0,0,1,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
4,panic disorder,1,1,0,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
5,panic disorder,0,0,1,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
6,panic disorder,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,panic disorder,0,0,0,1,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
8,panic disorder,1,0,0,1,0,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
9,panic disorder,1,1,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
y[0]

'panic disorder'

In [None]:
# Predict the first observation and translate the integer code back to its disease name
first_row = X.iloc[:1]
test = label_encoder.inverse_transform(rf_classifier.predict(first_row))[0]
print(test)

panic disorder


In [None]:
rf_classifier.predict(X[0:1])[0]


516

In [None]:
#import pickle

# save
#with open('model_rf.pkl','wb') as f:
#    pickle.dump(rf_classifier,f)


In [None]:
probs=rf_classifier.predict_proba(X[0:1])

In [None]:
probs_flat=probs[0]

In [None]:
df_probs = pd.DataFrame({
    "Disease": label_encoder.classes_,  # List of disease names
    "Probability": probs_flat   # Corresponding probabilities
})

In [None]:
df_probs_sorted = df_probs.sort_values(by="Probability", ascending=False).reset_index(drop=True)

In [None]:
df_probs_sorted[0:10]

Unnamed: 0,Disease,Probability
0,panic disorder,0.641914
1,esophagitis,0.003553
2,ischemic heart disease,0.003198
3,pulmonary embolism,0.002631
4,heart block,0.002212
5,acute bronchospasm,0.00204
6,heart failure,0.001968
7,pulmonary congestion,0.001933
8,sinus bradycardia,0.001821
9,hypertensive heart disease,0.001818


In [None]:
print(X[0:1])

   anxiety and nervousness  depression  shortness of breath  \
0                        1           0                    1   

   depressive or psychotic symptoms  sharp chest pain  dizziness  insomnia  \
0                                 1                 0          0         0   

   abnormal involuntary movements  chest tightness  palpitations  ...  \
0                               0                1             1  ...   

   joint swelling  redness in or around nose  wrinkles on skin  \
0               0                          0                 0   

   foot or toe weakness  hand or finger cramps or spasms  \
0                     0                                0   

   back stiffness or tightness  wrist lump or mass  low urine output  \
0                            0                   0                 0   

   sore in nose  ankle weakness  
0             0               0  

[1 rows x 325 columns]


: 

#### 2.1.3 testing predict

In [None]:
from medai.main import pred  # Import a specific function
pred(X[0:1])

ModuleNotFoundError: No module named 'medai'

: 

### 2.2 XGBOOST

In [23]:
#!pip install xgboost


Collecting xgboost
  Downloading xgboost-2.1.3-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting nvidia-nccl-cu12 (from xgboost)
  Downloading nvidia_nccl_cu12-2.25.1-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.8 kB)
Downloading xgboost-2.1.3-py3-none-manylinux_2_28_x86_64.whl (153.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.9/153.9 MB[0m [31m125.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading nvidia_nccl_cu12-2.25.1-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (201.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m201.4/201.4 MB[0m [31m97.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: nvidia-nccl-cu12, xgboost
Successfully installed nvidia-nccl-cu12-2.25.1 xgboost-2.1.3


In [44]:
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pickle


In [35]:
#!pip install scikit-learn==1.3.2


Collecting scikit-learn==1.3.2
  Downloading scikit_learn-1.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading scikit_learn-1.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m105.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.6.0
    Uninstalling scikit-learn-1.6.0:
      Successfully uninstalled scikit-learn-1.6.0
Successfully installed scikit-learn-1.3.2


#### 2.2.1 Training model 1

In [25]:
# Baseline XGBoost configuration, gathered in one place and unpacked into the estimator
baseline_params = dict(
    objective="multi:softprob",  # for multi-class classification
    eval_metric="mlogloss",
    n_estimators=50,             # number of trees
    learning_rate=0.1,           # step size shrinkage
    max_depth=30,                # maximum tree depth
)
model = xgb.XGBClassifier(**baseline_params)

In [26]:
model.fit(X_train, y_train)

In [28]:
# Evaluate Model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")

Model Accuracy: 0.8347


In [32]:
from sklearn.metrics import recall_score

# Macro-averaged recall
macro_recall = recall_score(y_test, y_pred, average='macro')

# Weighted-averaged recall
weighted_recall = recall_score(y_test, y_pred, average='weighted')

print(f"Macro-Averaged Recall: {macro_recall:.4f}")
print(f"Weighted-Averaged Recall: {weighted_recall:.4f}")

Macro-Averaged Recall: 0.7441
Weighted-Averaged Recall: 0.8347


In [29]:
#import pickle

# save
with open('XGB_model.pkl','wb') as f:
    pickle.dump(model,f)

In [36]:
model.predict(X[0:1])[0]

In [37]:
probs=model.predict_proba(X[0:1])[0]

In [39]:
df_probs = pd.DataFrame({
    "Disease": label_encoder.classes_,  # List of disease names
    "Probability": probs   # Corresponding probabilities
})

In [42]:
df_probs.sort_values(by="Probability", ascending=False).reset_index(drop=True)[0:10]

Unnamed: 0,Disease,Probability
0,panic disorder,0.997007
1,anxiety,0.001486
2,panic attack,0.000195
3,sinus bradycardia,0.000101
4,acute respiratory distress syndrome (ards),7.9e-05
5,angina,7.2e-05
6,abscess of the lung,6.4e-05
7,premature ventricular contractions (pvcs),5.9e-05
8,heart block,5.8e-05
9,drug abuse (methamphetamine),5.5e-05


#### 2.2.2 XGB model 2

In [None]:
# Tuning ideas for the XGBoost model (kept as comments so that this code cell runs cleanly):
# - Lower max_depth to 8-15 to prevent overfitting.
# - Increase min_child_weight (e.g., 5-10) to reduce bias towards frequent diseases.
# - Use regularization (gamma=1-5, reg_alpha=0.1-1, reg_lambda=1-10) to prune unnecessary splits.
# - Increase n_estimators (200-500) while reducing learning_rate (0.05-0.01) for stability.
# - Use scale_pos_weight for rare diseases OR try SMOTE for synthetic balancing.
# - Feature selection: Drop unimportant symptoms based on feature_importances_.
# - Hyperparameter tuning: Use Bayesian Optimization or GridSearchCV to automate tuning.

In [45]:
# Second XGBoost configuration: more trees, smaller learning rate, L1/L2 regularization
# and row/column subsampling.
# NOTE(review): max_depth stays at 30 although the notes cell above suggests 8-15 — confirm.
model_2 = xgb.XGBClassifier(
    # task definition
    objective="multi:softprob",  # for multi-class classification
    eval_metric="mlogloss",
    # boosting schedule
    n_estimators=100,            # number of trees
    learning_rate=0.05,          # step size shrinkage
    max_depth=30,                # maximum tree depth
    # regularization
    reg_alpha=1.0,               # L1 regularization (Lasso)
    reg_lambda=1.0,              # L2 regularization (Ridge)
    # subsampling
    subsample=0.8,               # use 80% of the rows per boosting round
    colsample_bytree=0.8,        # use 80% of the features per tree
)

In [23]:
model_2.fit(X_train,y_train)

In [24]:
# Evaluate Model
y_pred = model_2.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")

Model Accuracy: 0.8498


In [25]:
from sklearn.metrics import recall_score

# Macro-averaged recall
macro_recall = recall_score(y_test, y_pred, average='macro')

# Weighted-averaged recall
weighted_recall = recall_score(y_test, y_pred, average='weighted')

print(f"Macro-Averaged Recall: {macro_recall:.4f}")
print(f"Weighted-Averaged Recall: {weighted_recall:.4f}")

Macro-Averaged Recall: 0.7107
Weighted-Averaged Recall: 0.8498


In [46]:
model_2.fit(X, y_encoded)
    

In [47]:
import pickle

#save
with open('xgb_model_full.pkl','wb') as f:
    pickle.dump(model_2,f)

In [47]:
from sklearn.model_selection import GridSearchCV

In [49]:
from xgboost import XGBClassifier


In [45]:
from sklearn.model_selection import GridSearchCV
# Hyperparameter grid for the XGBoost search.
# 5 parameters x 3 values each = 243 combinations; with the cv=3 used by the
# GridSearchCV cell below that is 729 model fits.
param_grid = {
    "max_depth": [6, 12, 20],
    "learning_rate": [0.01, 0.05, 0.1],
    "n_estimators": [100, 300, 500],
    "min_child_weight": [1, 5, 10],
    "gamma": [0, 1, 5]
}



In [50]:
# Exhaustive search over param_grid, ranked by macro-averaged recall with 3-fold cross-validation.
# NOTE(review): the y_train class counts shown in the next cell include classes with a single
# sample, fewer than the 3 folds, so some training folds will not contain every class —
# confirm the estimator and the recall_macro scorer cope with that before launching the search.
grid_search = GridSearchCV(XGBClassifier(objective="multi:softprob", eval_metric="mlogloss"),
                           param_grid, scoring="recall_macro", cv=3, verbose=1)


In [61]:
pd.DataFrame(y_train).value_counts()

0  
164    792
471    792
747    792
137    791
233    790
      ... 
325      1
96       1
179      1
297      1
487      1
Name: count, Length: 754, dtype: int64

In [None]:
# NOTE(review): no grid_search.fit(...) call appears in the cells shown; best_params_ only
# exists on a fitted search — confirm the search was run before this cell.
print(grid_search.best_params_)

# Global health file

In [None]:
# Load the global health statistics from the project's raw_data folder.
# Relative path (same convention as the symptom file load) instead of a user-specific home directory.
df_glob = pd.read_csv('../raw_data/Global Health Statistics.csv')

In [None]:
df_glob.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 22 columns):
 #   Column                              Non-Null Count    Dtype  
---  ------                              --------------    -----  
 0   Country                             1000000 non-null  object 
 1   Year                                1000000 non-null  int64  
 2   Disease Name                        1000000 non-null  object 
 3   Disease Category                    1000000 non-null  object 
 4   Prevalence Rate (%)                 1000000 non-null  float64
 5   Incidence Rate (%)                  1000000 non-null  float64
 6   Mortality Rate (%)                  1000000 non-null  float64
 7   Age Group                           1000000 non-null  object 
 8   Gender                              1000000 non-null  object 
 9   Population Affected                 1000000 non-null  int64  
 10  Healthcare Access (%)               1000000 non-null  float64
 11  Doctors per 

### 1.3 Trying to detect disease duplicates :)

### 1.2 number of symptoms per disease (on-going)

In [None]:
df_symp[df_symp['diseases']=='thalassemia']
# It might not make sense that a single symptom gives one disease -> let's check the number of symptoms per disease

Unnamed: 0,diseases,anxiety and nervousness,depression,shortness of breath,depressive or psychotic symptoms,sharp chest pain,dizziness,insomnia,abnormal involuntary movements,chest tightness,...,stuttering or stammering,problems with orgasm,nose deformity,lump over jaw,sore in nose,hip weakness,back swelling,ankle stiffness or tightness,ankle weakness,neck weakness
202955,thalassemia,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


#### 1.3.1 Using BERT transformer (Disease name similarities)

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd


KeyboardInterrupt: 

In [None]:
# Load SBERT model
model = SentenceTransformer('all-MiniLM-L6-v2')


In [None]:
# Compute embeddings for disease names
disease_names = df_symp['diseases'].unique()  # Ensure this is your disease column
embeddings = model.encode(disease_names)


In [None]:
# Compute cosine similarity
similarity_matrix = cosine_similarity(embeddings)


In [None]:
# Create a DataFrame for easier analysis
similarity_df = pd.DataFrame(similarity_matrix, index=disease_names, columns=disease_names)

In [None]:
similarity_df

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Scores strictly above the diagonal: every disease pair once, no self-similarity
upper_rows, upper_cols = np.triu_indices_from(similarity_matrix, k=1)
similarity_scores = similarity_matrix[upper_rows, upper_cols]

# Histogram of all pairwise name-similarity scores
fig, ax = plt.subplots()
ax.hist(similarity_scores, bins=50, alpha=0.75)
ax.set_title("Distribution of Disease Name Similarity Scores")
ax.set_xlabel("Cosine Similarity")
ax.set_ylabel("Frequency")
plt.show()

In [None]:
# Collect the disease pairs whose name similarity exceeds the threshold
threshold = 0.8

# Upper triangle only (k=1): each unordered pair once, diagonal excluded
pair_rows, pair_cols = np.triu_indices(len(disease_names), k=1)
similar_pairs = [
    (disease_names[row], disease_names[col], similarity_matrix[row, col])
    for row, col in zip(pair_rows, pair_cols)
    if similarity_matrix[row, col] > threshold
]

# Most similar pairs first
similar_pairs.sort(key=lambda pair: pair[2], reverse=True)

# Print every retained pair (not only the first 10)
for pair in similar_pairs:
    print(pair)

In [None]:
# Number of disease pairs above the similarity threshold
len(similar_pairs)

In [None]:
# Diseases sometimes seem super close, mostly because they are super specific. Not sure it makes sense to cluster them with this method, since clustering based on NLP methods will impact our recall / accuracy.
# Clustering on symptoms may be more consistent, since it should have a direct impact on the KPIs; it also makes sense to group diseases by symptoms for the users.

#### 1.3.2 Using cosine_similarities  (Disease symptom similarities)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import jaccard_score
import numpy as np

# Compare diseases rather than individual observations: a row-by-row similarity
# matrix has len(df_symp)**2 entries, and filling it with one jaccard_score call
# per entry in a Python double loop is not feasible on the full dataset.
# One binary profile per disease: a symptom is 1 if any observation of that disease reports it.
disease_profiles = df_symp.groupby('diseases').max()

# Get symptom vectors (one row per disease, ordered like disease_profiles.index)
symptom_vectors = disease_profiles.values

# Compute cosine similarity between the disease profiles
cosine_sim_matrix = cosine_similarity(symptom_vectors)

# Jaccard similarity for binary vectors, vectorised: |A ∩ B| / |A ∪ B|
shared_symptoms = symptom_vectors @ symptom_vectors.T          # |A ∩ B| for every pair
symptoms_per_disease = symptom_vectors.sum(axis=1)             # |A|
union_symptoms = (
    symptoms_per_disease[:, None] + symptoms_per_disease[None, :] - shared_symptoms
)                                                              # |A ∪ B| = |A| + |B| - |A ∩ B|
# Pairs with an empty union (two all-zero profiles) keep a similarity of 0
jaccard_sim_matrix = np.divide(
    shared_symptoms,
    union_symptoms,
    out=np.zeros(shared_symptoms.shape, dtype=float),
    where=union_symptoms > 0,
)