In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import time

# Start timing the script
start_time = time.time()

# Set random seed for reproducibility
np.random.seed(42)

# Define the number of samples
num_samples = 5000

# Function to generate synthetic ncRNA data
def generate_ncRNA_data(samples, features):
    data = np.random.rand(samples, features)
    noise = np.random.normal(0, 0.1, data.shape)
    return data + noise

# Function to generate binary labels
def generate_labels(data, noise_level=0.1):
    weights = np.random.rand(data.shape[1])
    linear_combination = np.dot(data, weights) + np.random.randn(data.shape[0]) * noise_level
    threshold = np.percentile(linear_combination, 50)
    return (linear_combination > threshold).astype(int)

# Generate synthetic data for ncRNAs
ncRNA_data = generate_ncRNA_data(num_samples, 100)

# Generate binary labels
labels = generate_labels(ncRNA_data)

# Create DataFrame with ncRNA features and labels
columns = [f'ncRNA_{i}' for i in range(1, 101)]
df = pd.DataFrame(ncRNA_data, columns=columns)
df['label'] = labels

# Function to standardize data
def standardize_data(df, columns):
    scaler = StandardScaler()
    df[columns] = scaler.fit_transform(df[columns])
    return df

# Standardize ncRNA data
df = standardize_data(df, columns)

# Save the dataset to a CSV file
output_file = 'synthetic_ncRNA_dataset.csv'
df.to_csv(output_file, index=False)

# End timing the script
end_time = time.time()

# Print completion message and runtime
print(f"Dataset saved to {output_file}")
print(f"Runtime: {end_time - start_time:.2f} seconds")

# Display the first few rows of the dataset
print(df.head())


Dataset saved to synthetic_ncRNA_dataset.csv
Runtime: 28.03 seconds
    ncRNA_1   ncRNA_2   ncRNA_3   ncRNA_4   ncRNA_5   ncRNA_6   ncRNA_7  \
0 -0.397403  0.852533  0.683079  0.942799 -1.631654 -1.388351 -1.623454   
1 -1.670529  0.288005 -0.201638 -0.213920  0.888307 -1.039135  0.449562   
2  0.195225 -0.872064 -0.918731  1.450675  0.529820 -1.152498 -1.284768   
3 -2.083645  0.254206 -0.212502  0.999606  0.943054  1.885593  0.418506   
4 -1.094652  1.331924  0.070002  1.231720 -0.276811  1.195791 -0.425053   

    ncRNA_8   ncRNA_9  ncRNA_10  ...  ncRNA_92  ncRNA_93  ncRNA_94  ncRNA_95  \
0  0.903286  0.267438  0.708975  ...  1.403336  0.347992  0.268291  1.077486   
1  1.367111 -1.658048 -1.520280  ...  1.479508  1.540875  0.607620 -0.141194   
2  0.643163 -1.580155 -1.420649  ... -1.051406  1.185486 -0.964028 -0.790912   
3 -0.454507  1.054717 -0.403555  ... -0.362057 -0.026868  1.102053 -0.679077   
4 -1.930361  0.629722 -1.727435  ...  0.357038 -0.690740 -1.150822  0.129779   



In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb

# Load the enhanced dataset
df = pd.read_csv('synthetic_ncRNA_dataset.csv')

# Preprocess the dataset
X = df.drop('label', axis=1).values
y = df['label'].values

# Standardize the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the neural network model function
def create_dnn_model(optimizer='adam', neurons=128, dropout_rate=0.5):
    model = Sequential()
    model.add(Dense(neurons, input_dim=X.shape[1], activation='relu'))
    model.add(Dropout(dropout_rate))
    model.add(Dense(neurons // 2, activation='relu'))
    model.add(Dropout(dropout_rate))
    model.add(Dense(neurons // 4, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

# Create the KerasClassifier
dnn_model = KerasClassifier(build_fn=create_dnn_model, verbose=0)

# Define the hyperparameter grid with fewer options
param_dist = {
    'epochs': [50, 100],
    'batch_size': [16, 32],
    'optimizer': ['adam', 'rmsprop'],
    'neurons': [64, 128],
    'dropout_rate': [0.3, 0.4]
}

# Perform RandomizedSearchCV with fewer iterations
random_search = RandomizedSearchCV(estimator=dnn_model, param_distributions=param_dist, n_iter=5, cv=3, verbose=1, random_state=42, n_jobs=-1)

# Fit the RandomizedSearchCV
random_search.fit(X_train, y_train)

# Print the best parameters
print(f'Best parameters: {random_search.best_params_}')

# Train the best model
best_model = random_search.best_estimator_
y_pred_dnn = best_model.predict(X_test)
# DRL Feature Transformation using Autoencoder
# Evaluate the DNN model
accuracy_dnn = accuracy_score(y_test, y_pred_dnn)
precision_dnn = precision_score(y_test, y_pred_dnn)
recall_dnn = recall_score(y_test, y_pred_dnn)
f1_dnn = f1_score(y_test, y_pred_dnn)
roc_auc_dnn = roc_auc_score(y_test, y_pred_dnn)

print(f' Accuracy: {accuracy_dnn * 100:.2f}%')
print(f' Precision: {precision_dnn * 100:.2f}%')
print(f' Recall: {recall_dnn * 100:.2f}%')
print(f' F1 Score: {f1_dnn * 100:.2f}%')
print(f' ROC AUC: {roc_auc_dnn * 100:.2f}%')

# Plot training history
history = best_model.model.history
plt.figure(figsize=(12, 6))
plt.plot(history.history['accuracy'], label='Train Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.title('DNN Training History')
plt.show()

# Visualization of performance metrics
metrics_dnn = {
    'Accuracy': accuracy_dnn,
    'Precision': precision_dnn,
    'Recall': recall_dnn,
    'F1 Score': f1_dnn,
    'ROC AUC': roc_auc_dnn
}

plt.figure(figsize=(10, 5))
plt.bar(metrics_dnn.keys(), metrics_dnn.values())
plt.title('DNN Performance Metrics')
plt.ylabel('Score')
plt.ylim(0, 1)
plt.show()

# Visualizing the confusion matrix
cm_dnn = confusion_matrix(y_test, y_pred_dnn)
plt.figure(figsize=(8, 6))
sns.heatmap(cm_dnn, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('DNN Confusion Matrix')
plt.show()


  dnn_model = KerasClassifier(build_fn=create_dnn_model, verbose=0)


Fitting 3 folds for each of 5 candidates, totalling 15 fits
Best parameters: {'optimizer': 'adam', 'neurons': 64, 'epochs': 50, 'dropout_rate': 0.4, 'batch_size': 16}
 Accuracy: 96.20%
 Precision: 96.48%
 Recall: 96.10%
 F1 Score: 96.29%
 ROC AUC: 96.20%


KeyError: 'accuracy'

<Figure size 864x432 with 0 Axes>