# Klasifikacija podataka

In [243]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

In [244]:
# Učitavanje podataka
data_diab = pd.read_csv('diabetes.csv')

# feature i target
x = data_diab.drop('Outcome', axis = 1)
y = data_diab['Outcome']

# podjela na train i test 
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)

# skaliranje podataka
X_train_sc = StandardScaler().fit_transform(X_train)
X_test_sc = StandardScaler().fit_transform(X_test)

# model RandomForestClassifier
model = RandomForestClassifier()

# treniranje modela
model.fit(X_train_sc, y_train)

y_pred = model.predict(X_test_sc)

# precision i accuracy
precision_og = precision_score(y_test, y_pred)
accuracy_og = accuracy_score(y_test, y_pred)

print(precision_og)
print(accuracy_og)

0.6938775510204082
0.8181818181818182


## PCA

In [245]:
from sklearn.decomposition import PCA

In [246]:
pca = PCA(n_components = 5)

In [247]:
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.fit_transform(X_test)

## treniranje modela u reduciranoj dimenziji
model_pca = RandomForestClassifier(random_state = 42)
model_pca.fit(X_train_pca, y_train)

# precision i accuracy u reduciranoj dimenziji
y_pred_pca = model_pca.predict(X_test_pca)
precision_pca = precision_score(y_test, y_pred_pca)
accuracy_pca = accuracy_score(y_test, y_pred_pca)

print(f"precision: {round(precision_pca*100,4)}%")
print(f"accuracy: {round(accuracy_pca*100,4)}%")

precision: 60.9756%
accuracy: 75.3247%


## SVD

In [248]:
from sklearn.decomposition import TruncatedSVD

In [249]:
svd = TruncatedSVD(n_components = 5)

In [250]:
X_train_svd = svd.fit_transform(X_train)
X_test_svd = svd.fit_transform(X_test)

# treniranje modela u reduciranoj dimenziji
model_svd = RandomForestClassifier(random_state = 42)
model_svd.fit(X_train_svd, y_train)

# precision i accuracy u reduciranoj dimenzjii
y_pred_svd = model_svd.predict(X_test_svd)
precision_svd = precision_score(y_test, y_pred_svd)
accuracy_svd = accuracy_score(y_test, y_pred)

print(f"precision: {round(precision_svd*100,4)}%")
print(f"accuracy: {round(accuracy_svd*100,4)}%")

precision: 61.7647%
accuracy: 81.8182%


## mMDS

In [251]:
import pandas as pd
import numpy as np
import tensorflow as tf

# Load CSV file



class MDSNet(tf.keras.Model):
    def __init__(self, input_dim, output_dim):
        super(MDSNet, self).__init__()
        self.fc1 = tf.keras.layers.Dense(output_dim, activation='tanh')
    
    def call(self, x):
        return self.fc1(x)

input_dim = data_diab.shape[1]
output_dim = 3
learning_rate = 0.001
batch_size = 256
epochs = 1000

model_mmds = MDSNet(input_dim, output_dim)
optimizer = tf.keras.optimizers.Adam(learning_rate)

@tf.function
def train_step(batch_data):
    with tf.GradientTape() as tape:
        output = model_mmds(batch_data)
        distance_orig = tf.norm(tf.expand_dims(batch_data, 0) - tf.expand_dims(batch_data, 1), axis=-1)
        distance_proj = tf.norm(tf.expand_dims(output, 0) - tf.expand_dims(output, 1), axis=-1)
        loss = tf.reduce_mean(tf.square(distance_proj - distance_orig))
    gradients = tape.gradient(loss, model_mmds.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model_mmds.trainable_variables))
    return loss

data_diab_tensor = tf.convert_to_tensor(data_diab, dtype=tf.float32)

for epoch in range(epochs):
    permutation = np.random.permutation(data_diab_tensor.shape[0])
    for i in range(0, data_diab_tensor.shape[0], batch_size):
        indices = permutation[i:i+batch_size]
        batch_data = tf.gather(data_diab_tensor, indices)
        loss = train_step(batch_data)
    if (epoch + 1) % 100 == 0:
        print(f'Epoch [{epoch + 1}/{epochs}], Loss: {loss:.4f}')

data_mmds = model_mmds(data_diab_tensor).numpy()


Epoch [100/1000], Loss: nan
Epoch [200/1000], Loss: nan
Epoch [300/1000], Loss: nan
Epoch [400/1000], Loss: nan
Epoch [500/1000], Loss: nan
Epoch [600/1000], Loss: nan
Epoch [700/1000], Loss: nan
Epoch [800/1000], Loss: nan
Epoch [900/1000], Loss: nan
Epoch [1000/1000], Loss: nan


## Autoencoder

In [252]:
x

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33
...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63
764,2,122,70,27,0,36.8,0.340,27
765,5,121,72,23,112,26.2,0.245,30
766,1,126,60,0,0,30.1,0.349,47


In [253]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model

# Normalize the data (optional, depending on your data characteristics)
scaler = MinMaxScaler()
data_diab = scaler.fit_transform(data_diab)

# Define the autoencoder model
input_layer = Input(shape=(data_diab.shape[1],))
encoded = Dense(5, activation='relu')(input_layer)
encoded = Dense(3, activation='relu')(encoded)

decoded = Dense(5, activation='relu')(encoded)
decoded = Dense(data_diab.shape[1], activation='sigmoid')(decoded)

autoencoder = Model(input_layer, decoded)
encoder = Model(input_layer, encoded)

# Compile the model
autoencoder.compile(optimizer='adam', loss='mse')

# Train the model
autoencoder.fit(data_diab, data_diab, epochs=1000, batch_size=256, shuffle=True, verbose=0)

# Use the encoder to transform the data
data_autoencoded = encoder.predict(data_diab)


[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 553us/step
