In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder

In [12]:
# Run configuration.
# strat picks the feature family loaded by the next cell; the values it
# recognises are "kmer", "chaos", "rtd", "spaced", "mash" and "acs".
strat = "kmer"
# k is interpolated into the file names of the "kmer", "rtd" and "spaced" features.
k = 6

In [13]:
# Load the pre-computed feature table for the selected strategy.
# `k` is only interpolated into the k-dependent file names (kmer, rtd, spaced).
# Where engine='pyarrow' is given, 'fastparquet' can be used instead.
if strat == "kmer":
    data = pd.read_parquet(f'../../data/features/{k}-mer_standard.parquet')
elif strat == "chaos":
    data = pd.read_parquet('../../data/features/chaos_standard_128.parquet', engine='pyarrow')
elif strat == "rtd":
    data = pd.read_parquet(f'../../data/features/{k}-rtd.parquet', engine='pyarrow')
elif strat == "spaced":
    data = pd.read_parquet(f'../../data/features/{k}-spaced.parquet', engine='pyarrow')
elif strat == "mash":
    data = pd.read_parquet('../../data/features/mash.parquet', engine='pyarrow')
elif strat == "acs":
    data = pd.read_parquet('../../data/features/acs.parquet', engine='pyarrow')
else:
    # Fail fast: without this branch an unrecognised strategy leaves `data`
    # undefined (or stale from an earlier run) and the error only surfaces
    # in a later cell.
    raise ValueError(
        f"Unknown feature strategy {strat!r}; expected one of "
        "'kmer', 'chaos', 'rtd', 'spaced', 'mash', 'acs'"
    )

In [14]:
# Learn the string -> integer mapping from every value in the "Target" column.
# The fitted encoder is kept so predictions can be decoded back later.
label_encoder = LabelEncoder().fit(data["Target"])

# Store the integer class id of each row next to its original string label.
data["Label"] = label_encoder.transform(data["Target"])

In [15]:
# Rows are assigned to a split by the "Test" flag: 0 -> training, 1 -> test.
# Features are every column except the raw label, the flag and the encoded label.
train_rows = data[data['Test'] == 0]
test_rows = data[data['Test'] == 1]

X_train = train_rows.drop(columns=["Target", "Test", "Label"])
y_train = train_rows['Label']
X_test = test_rows.drop(columns=["Target", "Test", "Label"])
y_test = test_rows['Label']

In [16]:
# Display the loaded frame (feature columns plus Target, Test and Label) as a visual sanity check.
data

Unnamed: 0,TAAAGG,AAAGGT,AAGGTT,AGGTTT,GGTTTA,GTTTAT,TTTATA,TTATAC,TATACC,ATACCT,...,CTCGCC,CGGGGC,CGACCG,CCCCCG,CCGCCG,CCGGGG,CGGATC,Target,Test,Label
0,13.0,18,17,19,17,23,15.0,12.0,6.0,11,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,XBB.1.5,0,202
1,12.0,17,18,19,16,23,13.0,10.0,4.0,10,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,AY.116,0,3
2,13.0,18,18,19,16,22,13.0,10.0,6.0,10,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,B.1.525,1,61
3,11.0,16,17,18,15,23,13.0,10.0,5.0,11,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,B.1.1.57,0,46
4,11.0,16,15,17,16,23,14.0,11.0,5.0,10,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BQ.1.1,1,142
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22706,12.0,17,17,18,16,23,14.0,11.0,5.0,10,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,CP.5,1,165
22707,12.0,17,17,18,16,22,13.0,11.0,5.0,10,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BE.7,1,135
22708,12.0,17,17,18,16,22,13.0,11.0,5.0,10,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BE.7,1,135
22709,13.0,17,15,18,15,22,14.0,10.0,5.0,9,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BA.1,1,67


In [17]:
# Random forest settings: 100 trees, fixed seed for repeatable results,
# progress logging switched on, 15 parallel jobs.
forest_params = dict(
    n_estimators=100,
    random_state=42,
    verbose=1,
    n_jobs=15,
)

# Build the classifier and fit it on the training split (fit returns the estimator).
rf = RandomForestClassifier(**forest_params).fit(X_train, y_train)

# Predicted encoded labels for the held-out test split.
y_pred = rf.predict(X_test)

[Parallel(n_jobs=15)]: Using backend ThreadingBackend with 15 concurrent workers.
[Parallel(n_jobs=15)]: Done  20 tasks      | elapsed:    0.6s
[Parallel(n_jobs=15)]: Done 100 out of 100 | elapsed:    3.4s finished
[Parallel(n_jobs=15)]: Using backend ThreadingBackend with 15 concurrent workers.
[Parallel(n_jobs=15)]: Done  20 tasks      | elapsed:    0.3s
[Parallel(n_jobs=15)]: Done 100 out of 100 | elapsed:    0.7s finished


In [25]:
# Map the integer-encoded truth and predictions back to their original string labels.
y_test_decoded, y_pred_decoded = (
    label_encoder.inverse_transform(encoded) for encoded in (y_test, y_pred)
)

def split_labels(labels):
    """Break each dotted label into its '.'-separated components.

    Args:
        labels: iterable of strings such as "B.1.525".

    Returns:
        A list with one list of components per input label, in input order.
    """
    components_per_label = []
    for dotted in labels:
        components_per_label.append(dotted.split('.'))
    return components_per_label

# Split the true and predicted decoded labels into their '.'-separated levels
true_labels_split = split_labels(y_test_decoded)
pred_labels_split = split_labels(y_pred_decoded)

# Deepest hierarchy seen in either the ground truth or the predictions
max_depth = max(len(parts) for parts in true_labels_split + pred_labels_split)

# One F1 score per hierarchical level, followed by the global score
f1_scores_micro = []
f1_scores_macro = []

# NOTE: levels are compared position by position, independently of whether the
# parent levels matched, so a deeper level can score higher than a shallower one.
for level in range(max_depth):
    # Component at this level for every label; '' stands in for labels that are shallower
    true_level_labels = [parts[level] if level < len(parts) else '' for parts in true_labels_split]
    pred_level_labels = [parts[level] if level < len(parts) else '' for parts in pred_labels_split]

    f1_micro = f1_score(true_level_labels, pred_level_labels, average='micro', zero_division=0)
    f1_macro = f1_score(true_level_labels, pred_level_labels, average='macro', zero_division=0)

    f1_scores_micro.append(f1_micro)
    f1_scores_macro.append(f1_macro)

# Global scores on the full encoded labels. zero_division=0 matches the per-level
# calls: the value is the same as the default, only the warning is suppressed.
f1_micro_global = f1_score(y_test, y_pred, average='micro', zero_division=0)
f1_macro_global = f1_score(y_test, y_pred, average='macro', zero_division=0)

f1_scores_micro.append(f1_micro_global)
f1_scores_macro.append(f1_macro_global)

# Column order: all micro scores (levels 1..max_depth, then global), then all macro scores
column_names = [f"Level {i} F1 Score (Micro)" for i in range(1, max_depth + 1)] + ["Global F1 Score (Micro)"]
column_names += [f"Level {i} F1 Score (Macro)" for i in range(1, max_depth + 1)] + ["Global F1 Score (Macro)"]

# Single-row summary table: the two score lists are concatenated to line up with column_names
df_results = pd.DataFrame([f1_scores_micro + f1_scores_macro], columns=column_names)

# Last expression of the cell so the table is actually rendered as the cell output
df_results

Unnamed: 0,Level 1 F1 Score (Micro),Level 2 F1 Score (Micro),Level 3 F1 Score (Micro),Level 4 F1 Score (Micro),Global F1 Score (Micro),Level 1 F1 Score (Macro),Level 2 F1 Score (Macro),Level 3 F1 Score (Macro),Level 4 F1 Score (Macro),Global F1 Score (Macro)
0,0.985945,0.989169,0.962464,0.973377,0.946507,0.822749,0.938969,0.884709,0.888894,0.850385


In [8]:
# from xgboost import XGBClassifier

# xgb = XGBClassifier(n_estimators=100, random_state=42, verbosity=1, n_jobs=15, tree_method='gpu_hist')

# # Define your evaluation set(s)
# eval_set = [(X_train, y_train), (X_test, y_test)]

# # Train the model with the training set and watch the evaluation on eval_set
# xgb.fit(X_train, y_train, eval_set=eval_set, eval_metric="mlogloss", verbose=True)

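# # Evaluating the model (NOTE: recompute y_pred = xgb.predict(X_test) first; as written, y_pred still holds the random-forest predictions)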
# accuracy = accuracy_score(y_test, y_pred)
# precision_micro = precision_score(y_test, y_pred, average='micro')
# precision_macro = precision_score(y_test, y_pred, average='macro')
# recall_micro = recall_score(y_test, y_pred, average='micro')
# recall_macro = recall_score(y_test, y_pred, average='macro')
# f1_micro = f1_score(y_test, y_pred, average='micro')
# f1_macro = f1_score(y_test, y_pred, average='macro')

# print("Classification Report:")
# print(f"Accuracy: {accuracy:.2f}")
# print(f"Precision (Micro): {precision_micro:.2f}")
# print(f"Precision (Macro): {precision_macro:.2f}")
# print(f"Recall (Micro): {recall_micro:.2f}")
# print(f"Recall (Macro): {recall_macro:.2f}")
# print(f"F1 Score (Micro): {f1_micro:.2f}")
# print(f"F1 Score (Macro): {f1_macro:.2f}")