In [6]:
# Cell 1: All imports
# Shared imports for the notebook, kept together in the first cell so a fresh
# kernel only has to run this cell to bring the dependencies into scope.
import pandas as pd
import numpy as np
from collections import Counter
from itertools import product
import os

# Visible confirmation that every import above succeeded.
print("✓ All libraries imported!")



✓ All libraries imported!


In [7]:
def generate_kmers(k):
    """Return every length-``k`` string over the RNA alphabet A, U, G, C.

    The strings come back in the order ``itertools.product`` yields them:
    the last position varies fastest and the bases cycle A, U, G, C, so
    ``generate_kmers(1) == ['A', 'U', 'G', 'C']``. The list has ``4 ** k``
    entries.
    """
    alphabet = ('A', 'U', 'G', 'C')
    return list(map(''.join, product(alphabet, repeat=k)))

# Quick sanity check: build the 3-mer vocabulary once and inspect it
example_kmers = generate_kmers(3)
print(f"Example: For k=3, we have {len(example_kmers)} possible k-mers")
print("First 10 k-mers:", example_kmers[:10])


Example: For k=3, we have 64 possible k-mers
First 10 k-mers: ['AAA', 'AAU', 'AAG', 'AAC', 'AUA', 'AUU', 'AUG', 'AUC', 'AGA', 'AGU']


In [8]:
def extract_kmer_features(sequence, k=3):
    """Count overlapping k-mers in a nucleotide sequence.

    The sequence is cleaned before counting: it is upper-cased, DNA-style
    ``T`` is rewritten as RNA ``U`` so that cDNA-style input lines up with an
    A/U/G/C k-mer vocabulary, and ``N`` placeholders are removed.

    Args:
        sequence: Nucleotide string in either the RNA (U) or DNA (T)
            alphabet, in any case.
        k: Window length.

    Returns:
        A ``collections.Counter`` mapping each k-mer found in the cleaned
        sequence to its number of occurrences. It is empty when the cleaned
        sequence is shorter than ``k``.

    Notes:
        Removing ``N`` joins the bases on either side of it, so a window may
        span a position where an ``N`` used to be. Other ambiguity codes
        (for example ``Y``) are left in place and counted under their
        literal text.
    """
    # Clean sequence: unify case, map thymine to uracil, drop unknown bases
    sequence = sequence.upper().replace('T', 'U').replace('N', '')

    # Slide a window of width k over the cleaned sequence and tally each one
    return Counter(sequence[i:i + k] for i in range(len(sequence) - k + 1))

# Demonstrate the counter on a short, hand-checkable sequence
test_seq = "AUGCAUGCAUG"
test_counts = extract_kmer_features(test_seq, k=3)
print(f"Test sequence: {test_seq}")
print(f"3-mer counts: {test_counts}")


Test sequence: AUGCAUGCAUG
3-mer counts: Counter({'AUG': 3, 'UGC': 2, 'GCA': 2, 'CAU': 2})


In [9]:
def read_fasta(filename):
    """Parse a FASTA file into a dict of ``{header: sequence}``.

    Each line starting with ``>`` opens a new record whose key is the rest of
    that line (everything after the ``>``). The following lines, stripped of
    surrounding whitespace, are concatenated to form the record's sequence.
    Lines that appear before the first header are discarded, and a record
    whose header text is empty is not stored.
    """
    records = {}
    header = None
    chunks = []

    with open(filename, 'r') as handle:
        for raw_line in handle:
            text = raw_line.strip()

            # Sequence data: accumulate and move on
            if not text.startswith('>'):
                chunks.append(text)
                continue

            # New header: flush the record collected so far, then start fresh
            if header:
                records[header] = ''.join(chunks)
            header = text[1:]
            chunks = []

    # Flush the final record, which has no following header to trigger it
    if header:
        records[header] = ''.join(chunks)

    return records

# Test loading
print("Testing FASTA file loading...")
# Only confirm the file is present here; parsing it is left to a later cell.
# `os` is already imported in the imports cell at the top of the notebook,
# so it is not re-imported here.
print("training.fa exists:", os.path.exists('training.fa'))


Testing FASTA file loading...
training.fa exists: True


In [10]:
def create_kmer_features(sequences, k=3, normalize=True):
    """Build a k-mer feature matrix with one row per sequence.

    Args:
        sequences: Mapping of sequence ID to nucleotide string (for example
            the dict returned by ``read_fasta``).
        k: k-mer length. The matrix has one column per entry of
            ``generate_kmers(k)``, in that order.
        normalize: When True, each count is divided by the number of k-mer
            windows counted for that sequence, giving frequencies. A
            sequence with no windows gets a row of zeros. When False, raw
            counts are returned.

    Returns:
        A ``pandas.DataFrame`` indexed by sequence ID, with one column per
        k-mer.
    """
    all_kmers = generate_kmers(k)
    rows = []

    for sequence in sequences.values():
        kmer_counts = extract_kmer_features(sequence, k)

        if not normalize:
            rows.append([kmer_counts.get(kmer, 0) for kmer in all_kmers])
            continue

        # Normalise by the number of windows that were actually counted.
        # The counts come from the cleaned sequence, so dividing by the raw
        # sequence length would understate frequencies whenever cleaning
        # removed characters. Computed once per sequence, not per k-mer.
        total_windows = sum(kmer_counts.values())
        if total_windows:
            rows.append([kmer_counts.get(kmer, 0) / total_windows
                         for kmer in all_kmers])
        else:
            # Sequence too short to contain a single k-mer
            rows.append([0] * len(all_kmers))

    return pd.DataFrame(rows,
                        columns=all_kmers,
                        index=list(sequences.keys()))

# Status message printed once the definition in this cell has run without error.
print("Feature extraction function ready!")


Feature extraction function ready!


In [12]:
# Cell 6: Load Training Data
# Sequences are parsed from the FASTA file; class labels come from the CSV.
print("Loading training sequences...")
train_sequences = read_fasta('training.fa')
print(f"✓ Loaded {len(train_sequences)} training sequences")

print("Loading training labels...")
train_labels = pd.read_csv('training_class.csv')
print(f"✓ Loaded {len(train_labels)} training labels")

# Peek at the first record to confirm the parse looks sensible
first_id = list(train_sequences)[0]
first_seq = train_sequences[first_id]
print(f"\nExample sequence ID: {first_id}")
print(f"Sequence length: {len(first_seq)} nucleotides")
print(f"First 50 nucleotides: {first_seq[:50]}...")



Loading training sequences...
✓ Loaded 22867 training sequences
Loading training labels...
✓ Loaded 22867 training labels

Example sequence ID: ENSDART00000138379
Sequence length: 2567 nucleotides
First 50 nucleotides: TCAAANGGAAAATAATATGTCAGYTGTGATTTTTACTCGANTTAATACGA...


In [13]:
# Cell 7: Extract k-mer features
print("Extracting k-mer features from training data...")
print("This may take a few minutes...")

# k-mer length for the feature matrix; starting with 3-mers
k = 3
X_train = create_kmer_features(train_sequences, k, normalize=True)

print("✓ Feature extraction complete!")
print(f"Feature matrix shape: {X_train.shape}")


Extracting k-mer features from training data...
This may take a few minutes...
✓ Feature extraction complete!
Feature matrix shape: (22867, 64)


In [14]:
# Cell 8: Align Labels with Features
print("Aligning labels with features...")

# Look up each feature row's label by sequence ID so that y_train follows
# the row order of X_train rather than the row order of the label file.
y_train = train_labels.set_index('name').loc[X_train.index, 'class']

# Fail loudly if the lookup is not one-to-one. Duplicate names in the label
# file make .loc return extra rows, which would silently shift labels
# relative to features.
assert y_train.index.equals(X_train.index), (
    "Labels are not aligned one-to-one with features; "
    "check the label file for duplicate or mismatched names"
)

print(f"✓ Features shape: {X_train.shape}")
print(f"✓ Labels shape: {y_train.shape}")
print("\nClass distribution:")
print(y_train.value_counts())
print(f"\n  Class 0 (non-mRNA): {(y_train == 0).sum()} sequences")
print(f"  Class 1 (mRNA):     {(y_train == 1).sum()} sequences")


Aligning labels with features...
✓ Features shape: (22867, 64)
✓ Labels shape: (22867,)

Class distribution:
class
0    14035
1     8832
Name: count, dtype: int64

  Class 0 (non-mRNA): 14035 sequences
  Class 1 (mRNA):     8832 sequences


In [15]:
# Cell 9: Save Features (DO THIS AFTER CELL 8 WORKS)
print("Saving extracted features...")

# Name the feature file after the k used for extraction, so that a run with
# a different k does not overwrite another k's features under a misleading
# name. With k = 3 this is the same file name as before.
features_path = f'train_kmer_features_k{k}.csv'
labels_path = 'train_labels.csv'

# Save features and labels; the sequence IDs are written as the first column
X_train.to_csv(features_path)
y_train.to_csv(labels_path)

print(f"✓ Features saved to: {features_path}")
print(f"✓ Labels saved to: {labels_path}")
print("\nYou can reload them later with:")
print(f"  X_train = pd.read_csv('{features_path}', index_col=0)")
print(f"  y_train = pd.read_csv('{labels_path}', index_col=0)['class']")


Saving extracted features...
✓ Features saved to: train_kmer_features_k3.csv
✓ Labels saved to: train_labels.csv

You can reload them later with:
  X_train = pd.read_csv('train_kmer_features_k3.csv', index_col=0)
  y_train = pd.read_csv('train_labels.csv', index_col=0)['class']
