<a href="https://colab.research.google.com/github/InsupCode/ML_Interpretability/blob/main/Doc_class_XGboost_Covid.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

In [2]:
!python --version

Python 3.10.12


In [3]:
!nvidia-smi

Fri Dec 13 00:21:49 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   42C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [4]:
A_train = pd.read_csv("/content/Abstract_train_df.csv")
A_test = pd.read_csv("/content/Absract_test_df.csv")

In [5]:
X_train = A_train.drop(columns=['pmid', 'cleaned_abstract', 'label'])  # Replace 'target' with your actual target column name
X_train = X_train.select_dtypes(include=['number'])
y_train = A_train['label']

X_test = A_test.drop(columns=['pmid', 'cleaned_abstract', 'label'])
X_test = X_test.select_dtypes(include=['number'])
y_test = A_test['label']

In [6]:
from sklearn.preprocessing import LabelEncoder


label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

## Model and training

In [7]:
from xgboost import XGBClassifier

In [8]:
xgb_model = XGBClassifier(
    use_label_encoder=False,
    eval_metric='mlogloss',
    tree_method='gpu_hist',  # Use GPU for boosting
    gpu_id=0,
    max_depth=6,             # Hyperparameter for max depth of the trees
    n_estimators=100,
    learning_rate=0.1,
    n_jobs=-1
)

In [9]:
#training
xgb_model.fit(X_train, y_train)


    E.g. tree_method = "hist", device = "cuda"

Parameters: { "use_label_encoder" } are not used.



In [10]:
y_pred = xgb_model.predict(X_test)


    E.g. tree_method = "hist", device = "cuda"

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




In [11]:
#Metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Evaluation
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.86
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.78      0.86        36
           1       0.78      0.97      0.87        30

    accuracy                           0.86        66
   macro avg       0.87      0.87      0.86        66
weighted avg       0.88      0.86      0.86        66

Confusion Matrix:
[[28  8]
 [ 1 29]]


In [12]:
#To save model:
# save the model as a pickle file
import pickle
model_pkl_file = "XGboost_Abstract_Doc_classification.pkl"

with open(model_pkl_file, 'wb') as file:
    pickle.dump(xgb_model, file)

## Hyperparameter Tuning

In [13]:
from sklearn.model_selection import RandomizedSearchCV

param_grid = {
    'max_depth': [3, 6, 10],
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0]
}

xgb_model1 = XGBClassifier(
    use_label_encoder=False,
    eval_metric='mlogloss',
    tree_method='gpu_hist',
    gpu_id=0,
    n_jobs=-1
)

In [14]:
random_search = RandomizedSearchCV(xgb_model1, param_distributions=param_grid, n_iter=10, cv=3, scoring='accuracy', n_jobs=-1, random_state=42)


In [15]:
random_search.fit(X_train, y_train)


    E.g. tree_method = "hist", device = "cuda"

Parameters: { "use_label_encoder" } are not used.



In [16]:
print("Best Hyperparameters: ", random_search.best_params_)


Best Hyperparameters:  {'subsample': 0.8, 'n_estimators': 100, 'max_depth': 6, 'learning_rate': 0.01, 'colsample_bytree': 0.8}


In [17]:
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)


    E.g. tree_method = "hist", device = "cuda"



In [18]:
#Metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Evaluation
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.89
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.83      0.90        36
           1       0.83      0.97      0.89        30

    accuracy                           0.89        66
   macro avg       0.90      0.90      0.89        66
weighted avg       0.90      0.89      0.89        66

Confusion Matrix:
[[30  6]
 [ 1 29]]


In [19]:
#To save model:
# save the model as a pickle file
import pickle
model_pkl_file = "XGboost_hyperparam_Abstract_Doc_classification.pkl"

with open(model_pkl_file, 'wb') as file:
    pickle.dump(best_model, file)

## SVM model

In [20]:
from sklearn.svm import SVC
model_svm = SVC(kernel='linear')  # Use 'linear' kernel for text classification
model_svm.fit(X_train, y_train)

In [21]:
y_pred = model_svm.predict(X_test)

In [22]:
#Metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Evaluation
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.88
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.83      0.88        36
           1       0.82      0.93      0.88        30

    accuracy                           0.88        66
   macro avg       0.88      0.88      0.88        66
weighted avg       0.89      0.88      0.88        66

Confusion Matrix:
[[30  6]
 [ 2 28]]


In [36]:
#To save model:
# save the model as a pickle file
import pickle
model_pkl_file = "SVM_Abstract_Doc_classification.pkl"

with open(model_pkl_file, 'wb') as file:
    pickle.dump(model_svm, file)

## MLP model

In [23]:
from sklearn.neural_network import MLPClassifier
model_mlp = MLPClassifier(hidden_layer_sizes=(100, ), max_iter=1000)
model_mlp.fit(X_train, y_train)

In [24]:
y_pred = model_mlp.predict(X_test)

In [25]:
#Metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Evaluation
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.91
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.94      0.92        36
           1       0.93      0.87      0.90        30

    accuracy                           0.91        66
   macro avg       0.91      0.91      0.91        66
weighted avg       0.91      0.91      0.91        66

Confusion Matrix:
[[34  2]
 [ 4 26]]


In [37]:
#To save model:
# save the model as a pickle file
import pickle
model_pkl_file = "MLP_Abstract_Doc_classification.pkl"

with open(model_pkl_file, 'wb') as file:
    pickle.dump(model_mlp, file)

#### Hyperparameter tuning for MLP

In [28]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

In [29]:
mlp = MLPClassifier()

# Define the parameter grid
param_distributions = {
    'hidden_layer_sizes': [(50,), (100,), (100, 50), (50, 50, 50)],
    'activation': ['tanh', 'relu'],
    'solver': ['adam', 'sgd'],
    'alpha': np.logspace(-4, -1, 4),  # [0.0001, 0.001, 0.01, 0.1]
    'learning_rate': ['constant', 'adaptive'],
    'max_iter': [200, 500]
}

# Create the RandomizedSearchCV object
random_search = RandomizedSearchCV(estimator=mlp, param_distributions=param_distributions,
                                   n_iter=20, cv=3, scoring='accuracy', n_jobs=-1)

In [30]:
random_search.fit(X_train, y_train)

  _data = np.array(data, dtype=dtype, copy=copy,


In [32]:
best_params_mlp = random_search.best_params_
best_model_mlp = random_search.best_estimator_

In [33]:
print(best_params_mlp)

{'solver': 'adam', 'max_iter': 500, 'learning_rate': 'adaptive', 'hidden_layer_sizes': (100,), 'alpha': 0.001, 'activation': 'tanh'}


In [34]:
y_pred = best_model_mlp.predict(X_test)

In [35]:
#Metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Evaluation
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.86
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.83      0.87        36
           1       0.82      0.90      0.86        30

    accuracy                           0.86        66
   macro avg       0.86      0.87      0.86        66
weighted avg       0.87      0.86      0.86        66

Confusion Matrix:
[[30  6]
 [ 3 27]]
