# Klasifikacija podataka

In [74]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

In [75]:
# Učitavanje podataka
data_diab = pd.read_csv('diabetes.csv')

data_diab

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [76]:
# feature i target
x = data_diab.drop('Outcome', axis = 1)
y = data_diab['Outcome']

In [99]:
# podjela na train i test 
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)

In [100]:
# skaliranje podataka
X_train_sc = StandardScaler().fit_transform(X_train)
X_test_sc = StandardScaler().fit_transform(X_test)

In [102]:
# model RandomForestClassifier
model = RandomForestClassifier()

# treniranje modela
model.fit(X_train_sc, y_train)

y_pred = model.predict(X_test_sc)

# precision i accuracy
precision_og = precision_score(y_test, y_pred)
accuracy_og = accuracy_score(y_test, y_pred)



print(precision_og)
print(accuracy_og)

0.6666666666666666
0.7987012987012987


## PCA

In [80]:
from sklearn.decomposition import PCA

In [81]:
pca = PCA(n_components = 5)

In [107]:
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.fit_transform(X_test)

## treniranje modela u reduciranoj dimenziji
model_pca = RandomForestClassifier(random_state = 42)
model_pca.fit(X_train_pca, y_train)

# precision i accuracy u reduciranoj dimenziji
y_pred_pca = model_pca.predict(X_test_pca)
precision_pca = precision_score(y_test, y_pred_pca)
accuracy_pca = accuracy_score(y_test, y_pred_pca)

print(f"precision: {round(precision_pca*100,4)}%")
print(f"accuracy: {round(accuracy_pca*100,4)}%")

precision: 60.9756%
accuracy: 75.3247%


## SVD

In [83]:
from sklearn.decomposition import TruncatedSVD

In [84]:
svd = TruncatedSVD(n_components = 5)

In [108]:
X_train_svd = svd.fit_transform(X_train)
X_test_svd = svd.fit_transform(X_test)

# treniranje modela u reduciranoj dimenziji
model_svd = RandomForestClassifier(random_state = 42)
model_svd.fit(X_train_svd, y_train)

# precision i accuracy u reduciranoj dimenzjii
y_pred_svd = model_svd.predict(X_test_svd)
precision_svd = precision_score(y_test, y_pred_svd)
accuracy_svd = accuracy_score(y_test, y_pred)

print(f"precision: {round(precision_svd*100,4)}%")
print(f"accuracy: {round(accuracy_svd*100,4)}%")

precision: 61.7647%
accuracy: 79.8701%


## Autoencoder

In [86]:
class Autoencoder(nn.Module):
    def __init__(self):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(X_train_sc.shape[1], 6),  # Smanjenje dimenzionalnosti
            nn.ReLU(),
            nn.Linear(6, 3)  # Latentni prostor
        )
        self.decoder = nn.Sequential(
            nn.Linear(3, 6),
            nn.ReLU(),
            nn.Linear(6, X_train_sc.shape[1]),
            nn.Sigmoid()  # Skaliranje izlaza između 0 i 1
        )

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x

# Treniranje autoencodera
autoencoder = Autoencoder()
criterion = nn.MSELoss()
optimizer = optim.Adam(autoencoder.parameters(), lr=0.01)

n_epochs = 50
X_train_tensor = torch.tensor(X_train_sc, dtype=torch.float32)

for epoch in range(n_epochs):
    autoencoder.train()
    outputs = autoencoder(X_train_tensor)
    loss = criterion(outputs, X_train_tensor)
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 10 == 0:
        print(f'Epoch {epoch+1}/{n_epochs}, Loss: {loss.item():.4f}')

# Ekstrakcija značajki iz enkodera
def extract_features(autoencoder, X_data):
    autoencoder.eval()
    with torch.no_grad():
        X_data_tensor = torch.tensor(X_data, dtype=torch.float32)
        encoded_data = autoencoder.encoder(X_data_tensor).numpy()
    return encoded_data

train_features = extract_features(autoencoder, X_train_sc)
test_features = extract_features(autoencoder, X_test_sc)

# Klasifikacija koristeći originalni skup podataka
model_original = RandomForestClassifier(random_state=42)
model_original.fit(X_train_sc, y_train)
y_pred_original = model_original.predict(X_test_sc)

precision_og = precision_score(y_test, y_pred_original, average='macro')
accuracy_og = accuracy_score(y_test, y_pred_original)

print("Original Data - Precision:", precision_og)
print("Original Data - Accuracy:", accuracy_og)

# Klasifikacija koristeći smanjeni skup podataka (latentni prostor)
model_compressed = RandomForestClassifier(random_state=42)
model_compressed.fit(train_features, y_train)
y_pred_compressed = model_compressed.predict(test_features)

precision_comp = precision_score(y_test, y_pred_compressed, average='macro')
accuracy_comp = accuracy_score(y_test, y_pred_compressed)

print("Compressed Data - Precision:", precision_comp)
print("Compressed Data - Accuracy:", accuracy_comp)

Epoch 10/50, Loss: 1.1420
Epoch 20/50, Loss: 0.9839
Epoch 30/50, Loss: 0.8746
Epoch 40/50, Loss: 0.8431
Epoch 50/50, Loss: 0.8294
Original Data - Precision: 0.7399764150943396
Original Data - Accuracy: 0.7597402597402597
Compressed Data - Precision: 0.6483494363929146
Compressed Data - Accuracy: 0.6818181818181818
