In [None]:
import os

import joblib
import numpy as np
import pandas as pd
from scipy.stats import spearmanr
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split

# Load data
file_path = "Figure_5_data.csv"  # Supplementary dataset 5
df = pd.read_csv(file_path)

# Define feature indices (adjust if needed).
# The feature blocks are selected purely by column position:
#   [start_idx, end_idx)       -> k-mer columns
#   [domain_start, domain_end) -> domain columns
start_idx = 2
kmer_count = 84
end_idx = start_idx + kmer_count  # 86
domain_start = end_idx
domain_count = 1288
domain_end = domain_start + domain_count  # 1374
length_cols = ['Plasmid_Length']  # Length columns (excluding chromosome length)
target_col = 'Log1p_PIRACopyNumber'

# `iloc` slices that run past the last column are silently truncated, which
# would shrink the feature set without any error. Fail loudly instead.
assert df.shape[1] >= domain_end, (
    f"Expected at least {domain_end} columns in {file_path}, found {df.shape[1]}; "
    "adjust start_idx / kmer_count / domain_count."
)
# The target must never fall inside the positional feature range, otherwise
# the model would be trained on its own label.
assert target_col not in df.columns[start_idx:domain_end], (
    f"Target column '{target_col}' lies inside the positional feature range "
    f"[{start_idx}, {domain_end}); adjust the feature indices."
)

# K-mers columns
print(f"K-mers count: {kmer_count}")
print(f"K-mers columns: {df.columns[start_idx:end_idx].tolist()}")

# Domains columns
print(f"Domains count: {domain_count}")
print(f"Domains columns: {df.columns[domain_start:domain_end].tolist()}")

# Length columns
print(f"Lengths count: {len(length_cols)}")
print(f"Length columns: {length_cols}")

# Prepare feature DataFrames
X_kmers = df.iloc[:, start_idx:end_idx]
X_domains = df.iloc[:, domain_start:domain_end]
X_lengths = df[length_cols]
y = df[target_col]

# Combine all features (column order: domains, k-mers, lengths)
X_all = pd.concat([X_domains, X_kmers, X_lengths], axis=1)

# Split data into train and test sets (80 % / 20 %, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X_all, y, test_size=0.2, random_state=42
)


In [None]:

# Candidate values for each hyperparameter explored by the randomized search.
param_distributions = {
    # 30 evenly spaced integers (fractional parts truncated) from 100 to 500
    'n_estimators': np.linspace(100, 500, 30).astype(int).tolist(),
    # two named strategies plus fractions 0.1 ... 1.0 of the feature count
    'max_features': ['sqrt', 'log2', *np.linspace(0.1, 1.0, 10)],
    # 30 evenly spaced integers (fractional parts truncated) from 10 to 300
    'max_depth': np.linspace(10, 300, 30).astype(int).tolist(),
    'min_samples_split': [2, 5, 10, 15, 20, 30, 40, 50],
    'min_samples_leaf': [1, 2, 4, 6, 8, 10, 20, 30],
    'bootstrap': [True, False],
}

# Step 1: RandomizedSearchCV for coarse hyperparameter tuning.
# 100 sampled configurations, each scored with 5-fold cross-validation on
# the training split, using all available cores.
rf_model = RandomForestRegressor(random_state=42)
random_search = RandomizedSearchCV(
    rf_model,
    param_distributions,
    n_iter=100,
    cv=5,
    n_jobs=-1,
    random_state=42,
    verbose=1,
)
random_search.fit(X_train, y_train)

print(
    "Best parameters from Randomized Search:",
    random_search.best_params_,
    sep="\n",
)


In [7]:
def _dedupe(candidates):
    """Return `candidates` without repeats, keeping first-seen order.

    Floats are compared after rounding to 9 decimals so that values such as
    0.30000000000000004 (as produced by ``np.linspace``) and 0.3 collapse
    into a single entry. The first occurrence is the one that is kept.
    """
    unique = {}
    for value in candidates:
        is_float = isinstance(value, (float, np.floating))
        key = round(float(value), 9) if is_float else value
        unique.setdefault(key, value)
    return list(unique.values())


best_params = random_search.best_params_

# Step 2: build a narrow grid centred on the randomized-search optimum.
# Every list is de-duplicated because the grid search fits each combination
# once per CV fold, so a repeated candidate only repeats identical fits.
# Repeats occur, for example, when the best `max_features` is already one
# of the 0.1 ... 1.0 fractions, or when the best `min_samples_split` is 2
# and the lower bound is clamped back to 2.
param_grid = {
    'n_estimators': _dedupe([
        max(1, best_params['n_estimators'] - 25),
        best_params['n_estimators'],
        best_params['n_estimators'] + 25,
    ]),
    'max_features': _dedupe([
        best_params['max_features'],
        0.1, 0.2, 0.3, 0.4, 0.5,
        0.6, 0.7, 0.8, 0.9, 1.0,
    ]),
    'max_depth': _dedupe([
        max(1, best_params['max_depth'] - 10),
        best_params['max_depth'],
        best_params['max_depth'] + 10,
    ]),
    # min_samples_split is clamped to 2 as its lower bound
    'min_samples_split': _dedupe([
        max(2, best_params['min_samples_split'] - 1),
        best_params['min_samples_split'],
        best_params['min_samples_split'] + 1,
    ]),
    'min_samples_leaf': _dedupe([
        best_params['min_samples_leaf'],
        best_params['min_samples_leaf'] + 1,
    ]),
    # bootstrap is held fixed at the randomized-search choice
    'bootstrap': [best_params['bootstrap']],
}


In [None]:
# Fresh, unfitted forest used as the base estimator for the grid search
rf_model = RandomForestRegressor(random_state=42)

# Exhaustive search over `param_grid` with 5-fold cross-validation,
# running on all available cores and logging per-fit progress.
grid_search = GridSearchCV(
    rf_model,
    param_grid,
    cv=5,
    verbose=2,
    n_jobs=-1,
)

# Fit GridSearchCV on the training data
grid_search.fit(X_train, y_train)

# Report the winning configuration and its mean cross-validation score.
# No `scoring` argument is passed, so the estimator's default scorer is used.
print("Best parameters from Grid Search:", grid_search.best_params_, sep="\n")
print(f"Best cross-validation R2 score from Grid Search: {grid_search.best_score_:.4f}")


In [None]:

# Build the final forest from the best grid-search hyperparameters (same
# seed as the searches) and train it on the whole training split.
final_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
final_model.fit(X_train, y_train)


In [None]:

# Predictions of the final model on the held-out test split
y_pred = final_model.predict(X_test)

# Error / goodness-of-fit metrics, all computed as metric(y_true, y_pred)
r2, mae, mse = (
    metric(y_test, y_pred)
    for metric in (r2_score, mean_absolute_error, mean_squared_error)
)

# Rank correlation between observed and predicted values
spearman_result = spearmanr(y_test, y_pred)
spearman_corr = spearman_result.correlation


In [None]:

# Output locations for the two CSV reports
results_csv_path = "excluding_chromosome_features_actual_vs_predicted.csv"
metrics_csv_path = "excluding_chromosome_features_evaluation_metrics.csv"

# Actual vs predicted values, one row per test sample.
# 'SampleID' is the row index of X_test (falls back to 0..n-1 if X_test has
# no index attribute).
sample_ids = X_test.index if hasattr(X_test, 'index') else range(len(X_test))
actual_values = y_test.values if hasattr(y_test, 'values') else y_test
results_df = pd.DataFrame({
    'SampleID': sample_ids,
    'Actual': actual_values,
    'Predicted': y_pred,
})
results_df.to_csv(results_csv_path, index=False)

# One-row summary of the evaluation metrics.
# NOTE(review): the label says 'All Features' while the output filenames say
# "excluding_chromosome_features" - confirm which one is intended.
metrics_row = {
    'Feature_Set': 'All Features',
    'R2': r2,
    'MAE': mae,
    'MSE': mse,
    'Spearman': spearman_corr,
}
metrics_df = pd.DataFrame({column: [value] for column, value in metrics_row.items()})
metrics_df.to_csv(metrics_csv_path, index=False)

print(f"Saved actual vs predicted values to {results_csv_path}")
print(f"Saved evaluation metrics summary to {metrics_csv_path}")


In [None]:
# Save the trained model to disk.
# `joblib` must be imported in the notebook's import cell; without that
# import this cell raises NameError on a fresh kernel.
model_path = 'rf_model_final_excluding_chromosome.pkl'
joblib.dump(final_model, model_path)