In [None]:
import xgboost
import shap
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import tensorflow as tf

# Raw dataset; later cells expect it to contain an 'Outcome' column.
DATA_PATH = "diabetes_data.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Target vector: the 'Outcome' column.
y = df['Outcome']
# Feature matrix: every remaining column.
X = df.drop('Outcome', axis=1)

In [None]:
# Fix the NumPy and TensorFlow global seeds so the stochastic steps below
# start from a known state.
np.random.seed(42)
tf.random.set_seed(42)

# Hold out 20% of the rows; both models are fitted on the training part only.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Gradient Boosting
# Fitted on the training split, like the network below, so the two models are
# compared on equal footing and X_test stays unseen by both.
model1 = xgboost.XGBRegressor(random_state=42).fit(X_train, y_train)

# NN
# One hidden ReLU layer of 64 units and a single linear output, trained with MSE.
model2 = Sequential()
model2.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
model2.add(Dense(1))
model2.compile(loss='mse', optimizer='adam')
model2.fit(X_train, y_train, epochs=50)

In [None]:
# Predicting results

# Plain array view of the features, used both as DeepExplainer background data
# and as the points to explain.
X_array = X.values

# Model 1: predictions and SHAP values over the full feature frame.
preds1 = model1.predict(X)
explainer1 = shap.Explainer(model1)
shap_values1 = explainer1(X)

# Model 2: predictions and SHAP values over the same rows.
# NOTE(review): later cells index shap_values2[0][i], i.e. they assume
# shap_values() returns a list with one array per model output — confirm this
# holds for the installed shap version.
preds2 = model2.predict(X)
explainer2 = shap.DeepExplainer(model2, X_array)
shap_values2 = explainer2.shap_values(X_array)


In [None]:
# One line per sample: "<model1 prediction>, <model2 prediction>".
with open("2Models_Diabetes.txt", "w") as f:
    f.writelines(
        str(preds1[row]) + ", " + str(preds2[row]) + "\n"
        for row in range(len(y))
    )

In [None]:
# Scale both models' predictions by the largest observed target value.
# The scaled values go into new arrays (cast back to the original dtype) so
# that re-running this cell does not divide preds1/preds2 a second time.
max_y = max(y)
preds1_scaled = (preds1 / max_y).astype(preds1.dtype)
preds2_scaled = (preds2 / max_y).astype(preds2.dtype)

# One line per sample:
#   <scaled pred 1>, <scaled pred 2>, <model-1 SHAP values...>, <model-2 SHAP values...>
# NOTE(review): if preds2 is 2-D with a single column, str() of a row renders
# with brackets (e.g. "[0.5]") — confirm that is the intended file format.
with open("2Models_Diabetes_preds_shaps.txt", "w") as f:
    for i in range(len(y)):
        fields = [str(preds1_scaled[i]), str(preds2_scaled[i])]
        fields.extend(str(value) for value in shap_values1[i].values)
        fields.extend(str(value) for value in shap_values2[0][i])
        f.write(", ".join(fields))
        f.write("\n")

In [None]:
import random

def random_rows(input_file, output_file, num_rows):
    """Copy a random sample of lines from one text file into another.

    ``num_rows`` distinct lines are picked with ``random.sample`` and written
    to ``output_file`` in the same relative order as in ``input_file``.

    The input is read and validated before the output is opened for writing,
    so a failing call never creates or truncates ``output_file``.

    Args:
        input_file: Path of the text file to sample from.
        output_file: Path of the file that receives the sampled lines
            (overwritten if it exists).
        num_rows: Number of lines to keep.

    Raises:
        ValueError: If ``input_file`` has fewer than ``num_rows`` lines, or
            if ``random.sample`` rejects ``num_rows`` (e.g. a negative value).
    """
    with open(input_file, 'r') as f_in:
        lines = f_in.readlines()

    num_lines = len(lines)
    if num_lines < num_rows:
        raise ValueError(f"Not enough lines in input file. Only found {num_lines} lines.")

    # A set gives O(1) membership checks while streaming the lines in order.
    chosen_indices = set(random.sample(range(num_lines), num_rows))

    with open(output_file, 'w') as f_out:
        f_out.writelines(
            line for i, line in enumerate(lines) if i in chosen_indices
        )


In [None]:
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardise the feature matrix with a freshly fitted StandardScaler.
scaler = StandardScaler()
scaled_features = scaler.fit(X).transform(X)

# Number of samples (rows of X).
N = X.shape[0]

In [None]:
# Per-sample, per-feature gap between the two models' SHAP attributions
# (model 1 minus model 2), kept as a list of rows.
shap_diffs = [
    [
        shap_values1[row].values[col] - shap_values2[0][row][col]
        for col in range(len(shap_values1[row].values))
    ]
    for row in range(len(y))
]

In [None]:
# From here on `df` holds the SHAP differences (one column per feature of X),
# replacing the frame that was read from the CSV.
feature_names = list(X.columns)
df = pd.DataFrame(shap_diffs, columns=feature_names)

In [None]:
# Load libraries and modules
import pandas as pd
from sklearn import preprocessing
import numpy as np
from numpy import linalg as LA
import matplotlib.pyplot as plt
from scipy.spatial.distance import cdist
from sklearn.linear_model import LinearRegression
from sklearn.metrics.cluster import normalized_mutual_info_score
import os
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

In [None]:
from sklearn.metrics import silhouette_score

# Extract the features into X1.
# Columns are selected by name so that every SHAP-difference feature is used
# (the first one included) and a 'cluster' column added to df by a later cell
# is ignored when this cell is re-run.
X1 = df[feature_names].values

# Calculate the silhouette score for different numbers of clusters
candidate_n_clusters = range(2, 20)
scores = []
for n_clusters in candidate_n_clusters:
    # Fixed random_state: the scores do not depend on how often, or in which
    # order, cells were executed.
    clusterer = KMeans(n_clusters=n_clusters, random_state=42)
    cluster_labels = clusterer.fit_predict(X1)
    silhouette_avg = silhouette_score(X1, cluster_labels)
    scores.append(silhouette_avg)
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)

# Find the optimal number of clusters: the candidate with the highest score
optimal_n_clusters = candidate_n_clusters[int(np.argmax(scores))]
print("Optimal number of clusters =", optimal_n_clusters)


In [None]:
# Final clustering of the SHAP differences with the selected number of clusters.
# Fitting on df[feature_names] rather than df keeps a 'cluster' column left by
# a previous run of this cell out of the features; random_state makes the
# labels repeatable across re-runs.
kmeans = KMeans(n_clusters=optimal_n_clusters, random_state=42)
kmeans.fit(df[feature_names])
labels = kmeans.labels_
df['cluster'] = labels
print(kmeans.cluster_centers_)

In [None]:
# t-SNE projection of the original dataset, shown as a scatterplot coloured by cluster membership

from sklearn.manifold import TSNE
import seaborn as sns

# Embed the original features in two dimensions.
tsne = TSNE(n_components=2)
X_tsne = tsne.fit_transform(X)

# Embedding coordinates plus outcome and cluster id for every sample.
# NOTE(review): the two Series are attached by index, so this assumes y and df
# share df2's default 0..n-1 index — confirm.
df2 = pd.DataFrame(X_tsne, columns=['tsne1', 'tsne2']).assign(
    label=y,
    cluster=df['cluster'],
)

sns.scatterplot(x='tsne1', y='tsne2', hue='cluster', palette='bright', data=df2)


In [None]:
# How many samples fell into each cluster.
counts = df2.loc[:, 'cluster'].value_counts()
print(counts)