In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import shap

# Load data
# NOTE(review): absolute, user-specific paths; consider a configurable data directory.
pos = pd.read_csv('/Users/jiaming/Desktop/f5c/pos_domain_encoding.csv')
pos = pos.iloc[:, 1:]  # drop the first column (presumably an ID/index column; confirm)
neg = pd.read_csv('/Users/jiaming/Desktop/f5c/neg_domain_encoding.csv')
neg = neg.iloc[:, 1:]

# Stack the positive rows on top of the negative rows; label 1 = pos, 0 = neg.
raw_datas = np.concatenate((pos, neg), axis=0)
raw_labels = np.concatenate(([1] * pos.shape[0], [0] * neg.shape[0]), axis=0)

# Shuffle rows and labels with the same fixed-seed permutation so they stay aligned.
np.random.seed(1)
indices = np.random.permutation(raw_labels.shape[0])

X = raw_datas[indices, :]
y = raw_labels[indices]

# Feature names are the column names left after dropping the first column.
feature_names = pos.columns.tolist()

# Train Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X, y)

# Create a SHAP TreeExplainer
explainer = shap.TreeExplainer(rf_model)

# Calculate SHAP values (on the same data the model was fitted on)
shap_values = explainer.shap_values(X)

# Normalise the result to one (n_samples, n_features) array per class.
# Older shap releases return a list with one array per class; newer releases
# return a single (n_samples, n_features, n_classes) array, for which
# `shap_values[0]` would select the first *sample* instead of the first class.
if isinstance(shap_values, list):
    shap_by_class = shap_values
else:
    shap_array = np.asarray(shap_values)
    if shap_array.ndim != 3:
        raise ValueError(
            "Expected per-class SHAP values (list or 3-D array), got shape "
            f"{shap_array.shape}"
        )
    shap_by_class = [shap_array[:, :, k] for k in range(shap_array.shape[2])]

# Plot SHAP summary plots with feature names and max_display=10
shap.summary_plot(shap_by_class, X, plot_type="bar", feature_names=feature_names, max_display=10)  # Bar plot

# Create separate violin plots for each class with feature names and max_display=10.
# Labels are 0/1, so index 0 is the negative class and index 1 the positive class.
shap.summary_plot(shap_by_class[0], X, plot_type="violin", feature_names=feature_names, color="red", class_names=["Negative"], max_display=10)
shap.summary_plot(shap_by_class[1], X, plot_type="violin", feature_names=feature_names, color="blue", class_names=["Positive"], max_display=10)

In [None]:
import numpy as np
import pandas as pd
import xgboost 
import shap
import matplotlib.pyplot as plt

# Load data
# NOTE(review): absolute, user-specific paths; consider a configurable data directory.
pos = pd.read_csv('/Users/jiaming/Desktop/f5c/pos_domain_encoding.csv')
pos = pos.iloc[:, 1:]  # drop the first column (presumably an ID/index column; confirm)
print(pos.shape)

# Number of negative rows kept (first rows of the file, in file order).
# NOTE(review): presumably chosen to balance the classes against the number of
# positive rows; confirm against the printed shapes.
N_NEG_SAMPLES = 1892
neg = pd.read_csv('/Users/jiaming/Desktop/f5c/neg_domain_encoding.csv')
neg = neg.iloc[:N_NEG_SAMPLES, 1:]
print(neg.shape)

# Stack the positive rows on top of the negative rows; label 1 = pos, 0 = neg.
raw_datas = pd.concat([pos, neg])
print(raw_datas.shape)
raw_labels = np.concatenate(([1] * pos.shape[0], [0] * neg.shape[0]), axis=0)

# Shuffle rows and labels with the same fixed-seed permutation so they stay aligned.
np.random.seed(1)
indices = np.random.permutation(raw_labels.shape[0])

X = raw_datas.iloc[indices]
y = raw_labels[indices]

# Feature names are the column names left after dropping the first column.
feature_names = pos.columns

# Train XGBoost model.
# Labels are binary (0/1), so use the binary 'logloss' metric rather than the
# multiclass 'mlogloss'.
model = xgboost.XGBClassifier(eval_metric='logloss').fit(X, y)

# Create a SHAP TreeExplainer for XGBoost model
explainer = shap.TreeExplainer(model)

shap_values2 = explainer(X)

# Plot SHAP bar plot without showing it, so the figure can still be saved.
# The return value of shap.plots.bar is not a Figure (None or an Axes depending
# on the shap version), so take the figure it drew on from pyplot instead.
shap.plots.bar(shap_values2, show=False)
fig = plt.gcf()

# Save the bar plot as a PDF file
fig.savefig('/Users/jiaming/Desktop/fig.pdf', bbox_inches='tight')

# Release the figure so it is not rendered again or kept in memory.
plt.close(fig)