In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
import sklearn.metrics as metrics
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest, chi2, RFE
from sklearn.multiclass import OneVsRestClassifier

In [2]:
# Read in csv file
df = pd.read_csv("../ModelKeras/train_combined_Species.csv")

In [3]:
# Drop the index column
df.drop(df.columns[0], axis=1, inplace=True)
df = df.fillna(0).drop_duplicates()
inputvalues = df.drop(['label'], axis=1)
outputvalues = df['label']
X, y = inputvalues.values, outputvalues.values
X = X.astype('float32')

In [4]:
display(df)

Unnamed: 0,Bacteria;Firmicutes;Clostridia;Oscillospirales;Oscillospiraceae;UCG-005;uncultured bacterium,Bacteria;Firmicutes;Clostridia;Lachnospirales;Lachnospiraceae;Blautia;uncultured bacterium,Bacteria;Firmicutes;Clostridia;Peptococcales;Peptococcaceae;uncultured;uncultured organism,Bacteria;Firmicutes;Clostridia;Lachnospirales;Lachnospiraceae;uncultured;uncultured bacterium,Bacteria;Firmicutes;Clostridia;Oscillospirales;Oscillospiraceae;Colidextribacter;uncultured Flavonifractor sp.,Bacteria;Firmicutes;Clostridia;Oscillospirales;Oscillospiraceae;UCG-002;uncultured bacterium,Bacteria;Firmicutes;Clostridia;Lachnospirales;Lachnospiraceae;uncultured;uncultured bacterium.1,Bacteria;Firmicutes;Clostridia;Oscillospirales;Oscillospiraceae;Colidextribacter;uncultured organism,Bacteria;Firmicutes;Clostridia;Lachnospirales;Lachnospiraceae;uncultured;uncultured organism,Bacteria;Firmicutes;Clostridia;Lachnospirales;Lachnospiraceae;Frisingicoccus;uncultured bacterium,...,Bacteria;Firmicutes;Clostridia;Lachnospirales;Lachnospiraceae;[Eubacterium] ventriosum group;uncultured bacterium.45,Bacteria;Bacteroidota;Bacteroidia;Bacteroidales;Prevotellaceae;Prevotella;uncultured bacterium.372,Bacteria;Firmicutes;Clostridia;Clostridia UCG-014;uncultured bacterium.503,Bacteria;Firmicutes;Clostridia;Lachnospirales;Lachnospiraceae;Anaerosporobacter;uncultured bacterium.10,Bacteria;Actinobacteriota;Coriobacteriia;Coriobacteriales;Eggerthellaceae;Senegalimassilia;uncultured bacterium.75,Bacteria;Firmicutes;Clostridia;Oscillospirales;Oscillospiraceae;Colidextribacter;uncultured organism.14,Bacteria;Firmicutes;Bacilli;RF39;uncultured bacterium.269,Bacteria;Firmicutes;Clostridia;Lachnospirales;Lachnospiraceae;Roseburia;uncultured bacterium.166,Bacteria;Firmicutes;Clostridia;Lachnospirales;Lachnospiraceae;[Ruminococcus] gauvreauii group;uncultured bacterium.89,label
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0
1004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [5]:
df.shape

(1006, 17788)

# DATA PREPROCESSING

In [6]:
# Standardize the data to ensure each feature has similar scales
scaler = MinMaxScaler()
df_scaled = scaler.fit_transform(X)

# FEATURE SELECTION





# PCA

In [7]:
pca = PCA(n_components=19)
X_pca = pca.fit_transform(df_scaled)
X_pca

array([[-4.7238553e-03, -1.2684126e-01, -5.0361091e-01, ...,
        -1.7983320e-01,  2.0341670e-02,  1.3629045e-01],
       [-6.2149901e-02,  3.6963066e-01,  6.2288004e-01, ...,
         7.7092254e-01, -3.1486828e+00,  2.7067747e+00],
       [-4.2649172e-02, -8.5706264e-02, -2.4378175e-01, ...,
         8.1039540e-02, -1.8604836e-01, -1.2606113e-01],
       ...,
       [-3.1370665e-03,  9.0121783e-02,  1.6537242e-01, ...,
        -2.4991958e-01,  3.5753187e-02, -1.4286962e-01],
       [-4.8783056e-02, -7.6090835e-02, -1.2562120e-01, ...,
         3.7049223e-02,  7.5933576e-02,  6.8284512e-02],
       [-3.5076164e-02, -6.5379225e-02, -6.6299573e-02, ...,
        -1.2702299e-02, -1.1487282e-01, -1.2772624e-01]], dtype=float32)

# SPLIT DATASET

In [8]:
# Split the dataset into training and test set
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=1234)

# SVM with GridSearch and CrossValidation

In [9]:
# Define the hyperparameter grid for SVM
param_grid = {'C': [0.1, 1, 10, 100, 1000],  
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
              'kernel': ['linear', 'rbf', 'poly', 'sigmoid']}

# Create an instance of SVC
svm = SVC()

# Create a GridSearchCV object with 10-fold cross-validation
grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, cv=10)
grid_search.fit(X_train, y_train)

# Print the best hyperparameters
print("Best parameters found: ",grid_search.best_params_)

# Use the best hyperparameters to fit a new SVC model
svm = SVC(C=grid_search.best_params_['C'], 
          gamma=grid_search.best_params_['gamma'], 
          kernel=grid_search.best_params_['kernel'])

svm.fit(X_train, y_train)

# Make predictions using the best model obtained from Grid Search
y_pred = grid_search.best_estimator_.predict(X_test)

Best parameters found:  {'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}


# EVALUATE

In [10]:
# Calculate accuracy score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Calculate precision score
precision = precision_score(y_test, y_pred, average='weighted')
print("Precision:", precision)

# Calculate recall score
recall = recall_score(y_test, y_pred, average='weighted')
print("Recall:", recall)

# Calculate f1-score
f1 = f1_score(y_test, y_pred, average='weighted')
print("F1-Score:", f1)

Accuracy: 0.46534653465346537
Precision: 0.4234688003684089
Recall: 0.46534653465346537
F1-Score: 0.37098879290985004


  _warn_prf(average, modifier, msg_start, len(result))


In [11]:
# Print confusion matrix
print(confusion_matrix(y_test, y_pred))

[[79  6  0  0  0]
 [78 15  0  0  0]
 [ 8  4  0  0  0]
 [ 6  2  0  0  0]
 [ 1  3  0  0  0]]


In [12]:
# Print classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.46      0.93      0.61        85
           1       0.50      0.16      0.24        93
           2       0.00      0.00      0.00        12
           3       0.00      0.00      0.00         8
           4       0.00      0.00      0.00         4

    accuracy                           0.47       202
   macro avg       0.19      0.22      0.17       202
weighted avg       0.42      0.47      0.37       202



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
