In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import accuracy_score, classification_report
from sklearn.tree import DecisionTreeClassifier

In [None]:
import pandas as pd
# Reshape the SNP spreadsheet (one row per SNP, one column per sample) into a
# samples x SNPs feature matrix and save it as CSV.
df = pd.read_excel("data machine.xlsx", dtype=str)  # dtype=str: no numeric type inference
meta_cols = df.columns[:9]    # the first nine columns are treated as per-SNP metadata
sample_cols = df.columns[9:]  # every remaining column is treated as one sample
# SNP label = "<first column>_<second column>"
# (presumably chromosome and position -- confirm against the sheet).
snp_names = (df.iloc[:, 0].astype(str) + "_" + df.iloc[:, 1].astype(str)).tolist()
# Transpose so that rows are samples and columns are SNPs.
matrix = pd.DataFrame(df[sample_cols].values.T, index=sample_cols, columns=snp_names)
matrix.index.name = "Sample_ID"
# Define the path once so the confirmation message cannot drift from the file
# that is actually written (it used to report "ml_feature_matrix_python.csv").
matrix_csv_path = "ml_feature_matrix.csv"
matrix.reset_index().to_csv(matrix_csv_path, index=False)
print(f"Saved {matrix_csv_path} — shape:", matrix.shape)

Saved ml_feature_matrix_python.csv — shape: (20, 253)


In [None]:
# Load the tab-separated genotype table and make the per-sample genotype
# columns numeric, printing a preview and the dtypes before and after.
df = pd.read_csv("samples.txt", sep="\t")
print(df.head())
print(df.dtypes)
# Only the per-sample columns (index 9 onward, after #CHROM ... FORMAT) hold
# genotype calls. Coercing from index 1 would turn the text columns
# (ID, REF, ALT, QUAL, FILTER, INFO, FORMAT) into NaN via errors="coerce".
for col in df.columns[9:]:
    df[col] = pd.to_numeric(df[col], errors="coerce")
print(df.head())
print(df.dtypes)

        #CHROM   POS ID REF ALT QUAL FILTER      INFO FORMAT  CA_001  ...  \
0  NC_010079.1   874  .   C   T    .   PASS  TYPE=snp     GT       0  ...   
1  NC_010079.1   930  .   C   T    .   PASS  TYPE=snp     GT       1  ...   
2  NC_010079.1   954  .   G   A    .   PASS  TYPE=snp     GT       1  ...   
3  NC_010079.1   966  .   G   A    .   PASS  TYPE=snp     GT       1  ...   
4  NC_010079.1  2608  .   G   A    .   PASS  TYPE=snp     GT       0  ...   

   CA_162  CA_163  CA_164  CA_165  CA_167  CA_168  CA_177  CA_185  CA_188  \
0       0       0       0       0       0       0       0       0       0   
1       1       1       1       1       1       1       1       1       1   
2       1       1       0       1       1       1       1       0       0   
3       1       1       0       1       1       1       1       0       0   
4       0       0       0       1       0       1       1       0       1   

   CA_190  
0       0  
1       1  
2       1  
3       1  
4       1  

[

In [None]:
# Read the tab-separated genotype table, then show its first 20 rows followed
# by the inferred column dtypes.
genotypes = pd.read_csv("samples.txt", sep="\t")
print(genotypes.head(20), genotypes.dtypes, sep="\n")

         #CHROM   POS ID REF ALT QUAL FILTER      INFO FORMAT  CA_001  ...  \
0   NC_010079.1   874  .   C   T    .   PASS  TYPE=snp     GT       0  ...   
1   NC_010079.1   930  .   C   T    .   PASS  TYPE=snp     GT       1  ...   
2   NC_010079.1   954  .   G   A    .   PASS  TYPE=snp     GT       1  ...   
3   NC_010079.1   966  .   G   A    .   PASS  TYPE=snp     GT       1  ...   
4   NC_010079.1  2608  .   G   A    .   PASS  TYPE=snp     GT       0  ...   
5   NC_010079.1  2725  .   C   T    .   PASS  TYPE=snp     GT       0  ...   
6   NC_010079.1  2755  .   G   A    .   PASS  TYPE=snp     GT       0  ...   
7   NC_010079.1  3945  .   T   C    .   PASS  TYPE=snp     GT       0  ...   
8   NC_010079.1  4318  .   C   A    .   PASS  TYPE=snp     GT       0  ...   
9   NC_010079.1  4652  .   G   T    .   PASS  TYPE=snp     GT       0  ...   
10  NC_010079.1  4767  .   A   G    .   PASS  TYPE=snp     GT       0  ...   
11  NC_010079.1  4872  .   A   G    .   PASS  TYPE=snp     GT   

In [None]:
# Read the phenotype sheet, then show its first 25 rows followed by the
# column dtypes.
phenotypes = pd.read_excel("biofilm_phenotype.xlsx")
print(phenotypes.head(25), phenotypes.dtypes, sep="\n")

   Sample_ID  BFI
0     CA_001    1
1     CA_002    1
2     CA_005    1
3     CA_007    0
4     CA_008    1
5     CA_009    1
6     CA_010    1
7     CA_011    1
8     CA_012    1
9     CA_013    1
10    CA_014    1
11    CA_016    1
12   CA_017     1
13    CA_018    1
14    CA_019    1
15    CA_020    1
16    CA_021    1
17    CA_022    0
18    CA_023    1
19    CA_024    0
20    CA_025    0
21    CA_026    1
22    CA_027    1
23    CA_028    0
24    CA_029    1
Sample_ID    object
BFI           int64
dtype: object


In [None]:
# Give the two phenotype columns the names used by the rest of the notebook.
phenotypes.columns = ["Sample_ID", "Phenotype"]
# Strip stray whitespace from the IDs: the sheet contains at least one padded
# value ("CA_017 "), which would not match its genotype column name and would
# be dropped silently by an inner merge on Sample_ID.
phenotypes["Sample_ID"] = phenotypes["Sample_ID"].str.strip()
phenotypes.head(5)

Unnamed: 0,Sample_ID,Phenotype
0,CA_001,1
1,CA_002,1
2,CA_005,1
3,CA_007,0
4,CA_008,1


In [None]:
# One label per SNP row: "<#CHROM>_<POS>".
snp_names = genotypes["#CHROM"].astype(str) + "_" + genotypes["POS"].astype(str)

# Columns from index 9 onward are the per-sample genotype columns. Transpose
# them so each row is a sample, label the columns with the SNP names, and move
# the sample IDs out of the index into a "Sample_ID" column.
genotype_matrix = (
    genotypes.iloc[:, 9:]
    .T
    .set_axis(snp_names, axis=1)
    .reset_index()
    .rename(columns={"index": "Sample_ID"})
)

# Default (inner) merge: only samples present in both tables are kept.
df = pd.merge(genotype_matrix, phenotypes, on="Sample_ID")
df.head()

Unnamed: 0,Sample_ID,NC_010079.1_874,NC_010079.1_930,NC_010079.1_954,NC_010079.1_966,NC_010079.1_2608,NC_010079.1_2725,NC_010079.1_2755,NC_010079.1_3945,NC_010079.1_4318,...,NC_010079.1_2868295,NC_010079.1_2869057,NC_010079.1_2869342,NC_010079.1_2869519,NC_010079.1_2870368,NC_010079.1_2871139,NC_010079.1_2871315,NC_010079.1_2871562,NC_010079.1_2871937,Phenotype
0,CA_001,0,1,1,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
1,CA_010,0,1,1,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
2,CA_011,0,1,1,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
3,CA_012,0,1,1,1,1,1,0,0,0,...,0,1,0,0,1,0,0,0,0,1
4,CA_013,0,1,1,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1


In [None]:
# Impute missing genotype calls with each SNP's most frequent value.
# Restrict the fill values to the SNP columns so the sample ID and the target
# label are never imputed (fillna with a Series only touches the columns named
# in that Series' index).
feature_cols = df.columns.drop(["Sample_ID", "Phenotype"])
fill_values = df[feature_cols].mode().iloc[0]  # first mode of every SNP column
df = df.fillna(fill_values)
# Count SNP columns only; df.shape[1] also counted Sample_ID and Phenotype.
print("Remaining SNPs:", len(feature_cols))

Remaining SNPs: 4546


In [None]:
# Split the merged table into the target vector and the feature matrix:
# everything except the sample identifier and the label is a feature.
non_feature_cols = ["Sample_ID", "Phenotype"]
y = df["Phenotype"]
X = df.drop(non_feature_cols, axis=1)

In [None]:
# Filter out low-variance SNP columns using a variance threshold of 0.02,
# then report how many columns survive.
selector = VarianceThreshold(threshold=0.02).fit(X)
X_sel = selector.transform(X)
print("Remaining SNPs:", X_sel.shape[1])

Remaining SNPs: 2484


In [None]:
# Preview the first five rows of the selected matrix and its dtype, then
# persist it. The frame is built straight from X_sel, so the CSV header holds
# pandas' default integer column labels.
print(X_sel[:5], X_sel.dtype, sep="\n")
selected_frame = pd.DataFrame(X_sel)
selected_frame.to_csv("final_snps_matrix.csv", index=False)

[[1 1 0 ... 0 1 0]
 [1 1 0 ... 0 1 0]
 [1 1 0 ... 1 0 0]
 [1 1 1 ... 1 0 0]
 [1 1 0 ... 0 1 0]]
int64


In [None]:
# Hold out 20% of the samples for testing. The split is stratified on y and
# uses a fixed seed so it is repeatable.
split_parts = train_test_split(X_sel, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Baseline: a decision tree with default hyper-parameters and a fixed seed,
# fitted on the training split. fit() returns the estimator itself, so ending
# the cell with `dt` displays the same fitted model.
dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
dt

In [None]:
# Hyper-parameter values to try for the decision tree.
param_grid = dict(
    max_depth=[3, 5, 10, 20, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    criterion=["gini", "entropy"],
)

# Exhaustive search over the grid with 5-fold cross-validation on the
# training split, scored by accuracy, using all available cores.
grid = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
)
grid.fit(X_train, y_train)

print("Best parameters:", grid.best_params_)
print("Best CV accuracy:", grid.best_score_)

Best parameters: {'criterion': 'gini', 'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best CV accuracy: 0.8036764705882353


In [None]:
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)

# Evaluate the best estimator found by the grid search on the held-out test split.
best_tree = grid.best_estimator_

y_test_pred = best_tree.predict(X_test)
# Column 1 of predict_proba is the score for the second entry of
# best_tree.classes_ (presumably label 1 for this 0/1 phenotype -- confirm).
y_test_prob = best_tree.predict_proba(X_test)[:, 1]

print("Test Accuracy:", accuracy_score(y_test, y_test_pred))
print("Test Precision:", precision_score(y_test, y_test_pred))
print("Test Recall:", recall_score(y_test, y_test_pred))
print("Test F1-score:", f1_score(y_test, y_test_pred))
# The probabilities were computed but never reported; ROC-AUC is the
# threshold-independent metric that uses them.
print("Test ROC-AUC:", roc_auc_score(y_test, y_test_prob))

Test Accuracy: 0.7142857142857143
Test Precision: 0.7777777777777778
Test Recall: 0.875
Test F1-score: 0.8235294117647058
Test ROC-AUC: 0.4875
