In [1]:
# Step 1: Import Necessary Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error, r2_score
from scipy.sparse import hstack, csr_matrix
import plotly.graph_objects as go


In [2]:
# Step 2: Load and Preprocess Data
# Locations of the two training tables (sequences and per-row labels).
train_sequences_csv = r"E:\A\stanford-rna-3d-folding\train_sequences.csv"
train_labels_csv = r"E:\A\stanford-rna-3d-folding\train_labels.csv"

train_sequences = pd.read_csv(train_sequences_csv)
train_labels = pd.read_csv(train_labels_csv)


In [3]:

# Characters kept verbatim; anything else in a sequence is replaced by 'N'.
allowed_chars = {'A', 'T', 'C', 'G', 'U'}


def _mask_invalid_bases(seq):
    """Return ``seq`` with every character outside ``allowed_chars`` turned into 'N'."""
    return ''.join(base if base in allowed_chars else 'N' for base in seq)


# Discard rows whose sequence is missing, normalise the remaining sequences,
# then record each cleaned sequence's length as an extra column.
train_sequences = (
    train_sequences
    .dropna(subset=['sequence'])
    .assign(sequence=lambda frame: frame['sequence'].apply(_mask_invalid_bases))
    .assign(sequence_length=lambda frame: frame['sequence'].apply(len))
)

In [4]:
# Merge datasets on 'target_id'
# Derive each label row's target_id by removing only the LAST underscore-
# separated field of its ID. Keeping "the first two fields" is correct only
# when the target id itself contains exactly one underscore; stripping the
# trailing field works for any number of underscores in the target id.
# NOTE(review): assumes label IDs have the form "<target_id>_<suffix>" — confirm
# against the label file.
train_labels['target_id'] = train_labels['ID'].str.rsplit('_', n=1).str[0]

# The inner join keeps only targets present in both tables; rows lacking any
# of the first coordinate triple are then dropped (without in-place mutation,
# so re-running the cell always starts from the freshly merged frame).
merged_data = (
    pd.merge(train_sequences, train_labels, on='target_id', how='inner')
    .dropna(subset=['x_1', 'y_1', 'z_1'])
)

# Step 3: Feature Engineering
# Encode every sequence as counts of its character k-grams (k = 3).
KMER_SIZE = 3
vectorizer = CountVectorizer(
    analyzer='char',
    ngram_range=(KMER_SIZE, KMER_SIZE),
)
X_seq = vectorizer.fit_transform(merged_data['sequence'])


In [5]:
# Append the sequence length as one extra column next to the k-mer counts and
# store the combined matrix in CSR form, which supports the row indexing used
# during cross-validation.
length_column = merged_data['sequence_length'].to_numpy().reshape(-1, 1)
X = csr_matrix(hstack([X_seq, length_column]))

# Regression targets: the x_1 / y_1 / z_1 coordinate columns.
y = merged_data.loc[:, ['x_1', 'y_1', 'z_1']]

In [6]:
# Step 4: Build Optimized Deep Learning Model
def build_model(input_shape):
    """Build and compile the dense regression network for 3-D coordinates.

    Args:
        input_shape: Number of input features per sample (an int; it is
            wrapped into a 1-tuple for the input layer).

    Returns:
        A compiled ``Sequential`` model with four ReLU hidden layers
        (256, 128, 64, 32 units, dropout after the first two) and a linear
        3-unit output, trained with Adam on mean-squared error and reporting
        mean absolute error as a metric.
    """
    model = Sequential([
        # Declare the input explicitly instead of passing `input_shape=` to
        # the first Dense layer, which current Keras releases warn against.
        tf.keras.Input(shape=(input_shape,)),
        Dense(256, activation='relu'),
        Dropout(0.3),
        Dense(128, activation='relu'),
        Dropout(0.2),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dense(3)  # Output layer for x, y, z coordinates
    ])
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return model

In [7]:
# Step 5: Cross-Validation & Training
# Fix the TensorFlow seed so weight initialisation is repeatable between runs.
tf.random.set_seed(42)

kf = KFold(n_splits=5, shuffle=True, random_state=42)
mse_scores, r2_scores = [], []

# The features are computed from the sequence columns, which are joined onto
# the label rows by target_id, so rows sharing a target_id carry the same
# inputs. Splitting by row would place identical inputs in both the training
# and validation folds; the folds are therefore drawn over the unique
# target_ids and every row of a target stays on one side of the split.
row_target_ids = merged_data['target_id']
unique_target_ids = row_target_ids.unique()

for _, val_target_pos in kf.split(unique_target_ids):
    in_validation = row_target_ids.isin(unique_target_ids[val_target_pos]).to_numpy()
    train_idx = np.flatnonzero(~in_validation)
    val_idx = np.flatnonzero(in_validation)

    X_train, X_val = X[train_idx].toarray(), X[val_idx].toarray()
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Release Keras state left by the previous fold's model before building
    # a new one, so memory does not accumulate across folds.
    tf.keras.backend.clear_session()
    model = build_model(X_train.shape[1])
    model.fit(X_train, y_train, epochs=75, batch_size=32, validation_data=(X_val, y_val), verbose=0)

    y_pred = model.predict(X_val)
    mse_scores.append(mean_squared_error(y_val, y_pred))
    r2_scores.append(r2_score(y_val, y_pred))

print(f"Mean MSE: {np.mean(mse_scores):.4f}")
# The superscript two is written as an escape so the label survives any
# re-encoding of the notebook file.
print(f"Mean R\u00b2: {np.mean(r2_scores):.4f}")

KeyboardInterrupt: 

In [None]:
# Step 6: Prepare Predictions for Test Data
test_sequences = pd.read_csv('test_sequences.csv')


def _normalise_test_sequence(seq):
    """Swap every character of ``seq`` that is not in ``allowed_chars`` for 'N'."""
    cleaned = []
    for base in seq:
        cleaned.append(base if base in allowed_chars else 'N')
    return ''.join(cleaned)


# Apply the same cleaning rule as for the training sequences.
test_sequences['sequence'] = test_sequences['sequence'].apply(_normalise_test_sequence)

# Length of each cleaned sequence, used as an additional feature.
test_sequences['sequence_length'] = test_sequences['sequence'].apply(len)


In [None]:
# Transform test sequences using the same (already fitted) vectorizer and
# append the sequence length, mirroring the layout of the training matrix.
X_test_seq = vectorizer.transform(test_sequences['sequence'])
test_length_column = np.array(test_sequences['sequence_length']).reshape(-1, 1)
X_test = csr_matrix(hstack([X_test_seq, test_length_column])).toarray()

# Fit a dedicated model on the full training set for the final predictions.
# Reusing `model` from the cross-validation loop would silently pick up
# whichever fold ran last (trained on only part of the data, or missing/partial
# if that loop was interrupted).
final_model = build_model(X.shape[1])
final_model.fit(X.toarray(), y, epochs=75, batch_size=32, verbose=0)

# Predict 3D coordinates for the test dataset
test_predictions = final_model.predict(X_test)

# Prepare submission file
# NOTE(review): this writes one row per test sequence keyed by target_id —
# confirm that matches the expected submission schema.
submission = pd.DataFrame({
    'ID': test_sequences['target_id'],
    'x_1': test_predictions[:, 0],
    'y_1': test_predictions[:, 1],
    'z_1': test_predictions[:, 2]
})
submission.to_csv('submission.csv', index=False)
print("Submission file saved as 'submission.csv'")


In [None]:
# Step 7: Advanced Visualization (3D Scatter with Color Gradient)
# Each predicted point is coloured by its Euclidean distance from the origin.
distance_from_origin = np.linalg.norm(test_predictions, axis=1)

marker_style = {
    'size': 5,
    'color': distance_from_origin,
    'colorscale': 'Viridis',
    'opacity': 0.8,
}

prediction_trace = go.Scatter3d(
    x=test_predictions[:, 0],
    y=test_predictions[:, 1],
    z=test_predictions[:, 2],
    mode='markers',
    marker=marker_style,
)

fig = go.Figure(data=[prediction_trace])


In [None]:
# Label the three axes and title the figure, then render it.
axis_titles = {
    'xaxis_title': 'X Coordinate',
    'yaxis_title': 'Y Coordinate',
    'zaxis_title': 'Z Coordinate',
}
fig.update_layout(
    title='3D RNA Structure Visualization with Gradient Coloring',
    scene=axis_titles,
)

fig.show()