In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder

# Encode the target variable y_train
label_encoder = LabelEncoder()


# Load the dataset
data = pd.read_csv(r"E:\4th sem\IOBS-2\project\Non Coding RNA classification\rna_sequence_features.csv")

# Separate features (X) and target variable (y)
X = data.drop(columns=['RNA_Name'])
y = data['RNA_Name']

y=label_encoder.fit_transform(y)
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize or scale the features if necessary (optional)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)



# Train Random Forest classifier
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train_scaled, y_train)

# Train SVM classifier
svm_classifier = SVC(kernel='linear', random_state=42)
svm_classifier.fit(X_train_scaled, y_train)

# Train XGBoost classifier
xgb_classifier = XGBClassifier(random_state=42)
xgb_classifier.fit(X_train_scaled, y_train)

# Train LightGBM classifier
lgbm_classifier = lgb.LGBMClassifier(random_state=42)
lgbm_classifier.fit(X_train_scaled, y_train)

# Evaluate classifiers
rf_accuracy = rf_classifier.score(X_test_scaled, y_test)
svm_accuracy = svm_classifier.score(X_test_scaled, y_test)
xgb_accuracy = xgb_classifier.score(X_test_scaled, y_test)
lgbm_accuracy = lgbm_classifier.score(X_test_scaled, y_test)

# Display the accuracies
print("Random Forest Accuracy:", rf_accuracy)
print("SVM Accuracy:", svm_accuracy)
print("XGBoost Accuracy:", xgb_accuracy)
print("LightGBM Accuracy:", lgbm_accuracy)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001154 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2400
[LightGBM] [Info] Number of data points in the train set: 505, number of used features: 86
[LightGBM] [Info] Start training from score -2.613641
[LightGBM] [Info] Start training from score -2.510986
[LightGBM] [Info] Start training from score -2.510986
[LightGBM] [Info] Start training from score -2.560997
[LightGBM] [Info] Start training from score -3.005683
[LightGBM] [Info] Start training from score -2.613641
[LightGBM] [Info] Start training from score -2.669210
[LightGBM] [Info] Start training from score -2.560997
[LightGBM] [Info] Start training from score -2.417896
[LightGBM] [Info] Start training from score -2.417896
[LightGBM] [Info] Start training from score -2.417896
[LightGBM] [Info] Start training from score -2.586972
[

In [32]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from keras.models import Sequential
from keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
from hyperopt import hp, tpe, Trials, fmin
import os

# Load the dataset
data = pd.read_csv(r"E:\4th sem\IOBS-2\project\Non Coding RNA classification\rna_sequence_features.csv")

# Split features and labels
X = data.iloc[:, :-2].values  # Features
y = data.iloc[:, -1].values   # Labels (assuming they are in the last column)

# Label Encoding
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Reshape data for CNN input
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

# Convert labels to one-hot encoding
num_classes = len(np.unique(y))  # Calculate the number of classes dynamically
y_train_one_hot = to_categorical(y_train, num_classes=num_classes)
y_test_one_hot = to_categorical(y_test, num_classes=num_classes)

# Define hyperparameter search space
space = {
    'learning_rate': hp.loguniform('learning_rate', np.log(0.0001), np.log(0.01)),
    'dropout_rate': hp.uniform('dropout_rate', 0, 0.5)
}

# Define objective function
def objective(params):
    # Build the CNN model
    model = Sequential()
    model.add(Conv1D(filters=32, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Conv1D(filters=64, kernel_size=3, activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Flatten())
    model.add(Dense(256, activation='relu'))
    model.add(Dropout(params['dropout_rate']))
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(params['dropout_rate']))
    model.add(Dense(num_classes, activation='softmax'))

    # Compile the model with categorical crossentropy loss
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    # Train the model with one-hot encoded labels and early stopping
    model.fit(X_train, y_train_one_hot, epochs=25, batch_size=64, verbose=0, validation_data=(X_test, y_test_one_hot), callbacks=[early_stopping])

    # Evaluate the model
    loss, _ = model.evaluate(X_test, y_test_one_hot, verbose=0)
    
    # Return negative accuracy (minimize negative accuracy = maximize accuracy)
    return -loss

# Perform Bayesian Optimization
trials = Trials()
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=10, trials=trials)

print("Best hyperparameters:", best)

# Rebuild the best model with the best hyperparameters
best_model = Sequential()
best_model.add(Conv1D(filters=32, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)))
best_model.add(MaxPooling1D(pool_size=2))
best_model.add(Conv1D(filters=64, kernel_size=3, activation='relu'))
best_model.add(MaxPooling1D(pool_size=2))
best_model.add(Flatten())
best_model.add(Dense(256, activation='relu'))
best_model.add(Dropout(best['dropout_rate']))
best_model.add(Dense(128, activation='relu'))
best_model.add(Dropout(best['dropout_rate']))
best_model.add(Dense(num_classes, activation='softmax'))

# Compile the best model with categorical crossentropy loss
best_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the best model with one-hot encoded labels and early stopping
best_model.fit(X_train, y_train_one_hot, epochs=25, batch_size=64, validation_data=(X_test, y_test_one_hot), callbacks=[early_stopping])

# Save the weights of the best model
best_model.save_weights('best_model_weights.h5')

# Evaluate the best model with the test data
test_loss, test_accuracy = best_model.evaluate(X_test, y_test_one_hot)
print("Test Loss:", test_loss)
print("Test Accuracy:", test_accuracy)


  0%|          | 0/10 [00:00<?, ?trial/s, best loss=?]

100%|██████████| 10/10 [00:19<00:00,  1.93s/trial, best loss: -1.426284909248352]
Best hyperparameters: {'dropout_rate': 0.02976027759400396, 'learning_rate': 0.0017931008791918584}
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Test Loss: 1.3732311725616455
Test Accuracy: 0.5669291615486145
