In [117]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
import sklearn.metrics as metrics
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest, chi2, RFE
from sklearn.multiclass import OneVsRestClassifier

In [118]:
# Read in csv file
df = pd.read_csv("/content/drive/MyDrive/HackathonData/train_combined_Species.csv", sep = ',')

In [119]:
# Drop the index column
df.drop(df.columns[0], axis=1, inplace=True)  

In [120]:
display(df)

Unnamed: 0,Bacteria;Firmicutes;Clostridia;Oscillospirales;Oscillospiraceae;UCG-005;uncultured bacterium,Bacteria;Firmicutes;Clostridia;Lachnospirales;Lachnospiraceae;Blautia;uncultured bacterium,Bacteria;Firmicutes;Clostridia;Peptococcales;Peptococcaceae;uncultured;uncultured organism,Bacteria;Firmicutes;Clostridia;Lachnospirales;Lachnospiraceae;uncultured;uncultured bacterium,Bacteria;Firmicutes;Clostridia;Oscillospirales;Oscillospiraceae;Colidextribacter;uncultured Flavonifractor sp.,Bacteria;Firmicutes;Clostridia;Oscillospirales;Oscillospiraceae;UCG-002;uncultured bacterium,Bacteria;Firmicutes;Clostridia;Lachnospirales;Lachnospiraceae;uncultured;uncultured bacterium.1,Bacteria;Firmicutes;Clostridia;Oscillospirales;Oscillospiraceae;Colidextribacter;uncultured organism,Bacteria;Firmicutes;Clostridia;Lachnospirales;Lachnospiraceae;uncultured;uncultured organism,Bacteria;Firmicutes;Clostridia;Lachnospirales;Lachnospiraceae;Frisingicoccus;uncultured bacterium,...,Bacteria;Firmicutes;Clostridia;Lachnospirales;Lachnospiraceae;[Eubacterium] ventriosum group;uncultured bacterium.45,Bacteria;Bacteroidota;Bacteroidia;Bacteroidales;Prevotellaceae;Prevotella;uncultured bacterium.372,Bacteria;Firmicutes;Clostridia;Clostridia UCG-014;uncultured bacterium.503,Bacteria;Firmicutes;Clostridia;Lachnospirales;Lachnospiraceae;Anaerosporobacter;uncultured bacterium.10,Bacteria;Actinobacteriota;Coriobacteriia;Coriobacteriales;Eggerthellaceae;Senegalimassilia;uncultured bacterium.75,Bacteria;Firmicutes;Clostridia;Oscillospirales;Oscillospiraceae;Colidextribacter;uncultured organism.14,Bacteria;Firmicutes;Bacilli;RF39;uncultured bacterium.269,Bacteria;Firmicutes;Clostridia;Lachnospirales;Lachnospiraceae;Roseburia;uncultured bacterium.166,Bacteria;Firmicutes;Clostridia;Lachnospirales;Lachnospiraceae;[Ruminococcus] gauvreauii group;uncultured bacterium.89,label
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0
1004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [121]:
df.shape

(1006, 17788)

# DATA PREPROCESSING

In [122]:
# Standardize the data to ensure each feature has similar scales
scaler = MinMaxScaler()
df_scaled = scaler.fit_transform(df)

# Handle missing values
# df.fillna(df.mean(), inplace=True)

# Remove duplicate rows
df.drop_duplicates(inplace=True)


# FEATURE SELECTION





In [123]:
# Selecting only the columns which contain only 2 unique values
n_unique = df.loc[:,df.apply(pd.Series.nunique) == 2]

if 'label' in n_unique.columns:

    # Select only the rows with 'label' value 0
    nicht_kranke = n_unique[n_unique['label'] == 0]

    for i in nicht_kranke.columns:
        if i != 'label':
            display(nicht_kranke[i].value_counts())

    # Select only the rows with 'label' value 1
    kranke = n_unique[n_unique['label'] == 1]
    
    for i in kranke.columns:
        # If the column is not 'label', display the value counts
        if i != 'label':
            print(kranke[i].value_counts())

    # Drop the 'label' column from n_unique
    n_unique.drop('label', axis=1, inplace=True)

# Create a list of columns from df which are not present in n_unique
cols = [col for col in df.columns if col not in n_unique.columns]

# If cols is not an empty list, select only those columns from df
if cols:
    df = df[cols]
else:
    # Create a copy of df and assign it to df
    df = df.copy()


df.shape


(1006, 11715)

# PCA

In [124]:
# PCA PLOT

#xx = df_scaled

# Perform PCA
#pca = PCA()
#pca.fit(xx)

# Plot the explained variance ratio as a function of the number of components
#plt.plot(np.cumsum(pca.explained_variance_ratio_))
#plt.xlabel('Number of components')
#plt.ylabel('Explained Variance Ratio')
#plt.show()

In [125]:
# Apply PCA to the standardized data
pca = PCA(n_components=19)
X_pca = pca.fit_transform(df_scaled)

# SPLIT DATASET

In [127]:
# Split the dataset into training and test set
X_train, X_test, y_train, y_test = train_test_split(X_pca, df["label"], test_size=0.2, random_state=0)

# SVM with GridSearch and CrossValidation

In [128]:
# Define the hyperparameter grid for SVM
param_grid = {'C': [0.1, 1, 10, 100, 1000],  
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
              'kernel': ['linear', 'rbf', 'poly', 'sigmoid']}

# Create an instance of SVC
svm = SVC()

# Create a GridSearchCV object with 10-fold cross-validation
grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, cv=10)
grid_search.fit(X_train, y_train)

# Print the best hyperparameters
print("Best parameters found: ",grid_search.best_params_)

# Use the best hyperparameters to fit a new SVC model
svm = SVC(C=grid_search.best_params_['C'], 
          gamma=grid_search.best_params_['gamma'], 
          kernel=grid_search.best_params_['kernel'])

svm.fit(X_train, y_train)

# Make predictions using the best model obtained from Grid Search
y_pred = grid_search.best_estimator_.predict(X_test)

Best parameters found:  {'C': 100, 'gamma': 0.1, 'kernel': 'rbf'}


# EVALUATE

In [129]:
# Calculate accuracy score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Calculate precision score
precision = precision_score(y_test, y_pred, average='weighted')
print("Precision:", precision)

# Calculate recall score
recall = recall_score(y_test, y_pred, average='weighted')
print("Recall:", recall)

# Calculate f1-score
f1 = f1_score(y_test, y_pred, average='weighted')
print("F1-Score:", f1)

Accuracy: 0.9306930693069307
Precision: 0.9380288028802881
Recall: 0.9306930693069307
F1-Score: 0.931050514739487


In [130]:
# Print confusion matrix
print(confusion_matrix(y_test, y_pred))

[[77  1  0  0  0]
 [ 7 96  0  1  0]
 [ 0  2 11  0  0]
 [ 0  0  0  2  3]
 [ 0  0  0  0  2]]


In [131]:
# Print classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.99      0.95        78
           1       0.97      0.92      0.95       104
           2       1.00      0.85      0.92        13
           3       0.67      0.40      0.50         5
           4       0.40      1.00      0.57         2

    accuracy                           0.93       202
   macro avg       0.79      0.83      0.78       202
weighted avg       0.94      0.93      0.93       202

