In [1]:
from collections import Counter

def kmer_count(sequence, k):
    kmers = [sequence[i:i+k] for i in range(len(sequence) - k + 1)]
    return Counter(kmers)

# Example usage:
sequence = "ACGTAACGTTAC"
k = 3
features = kmer_count(sequence, k)
print(features)


Counter({'ACG': 2, 'CGT': 2, 'GTA': 1, 'TAA': 1, 'AAC': 1, 'GTT': 1, 'TTA': 1, 'TAC': 1})


In [3]:
import numpy as np

def positional_encoding(sequence_length, d_model):
    position = np.arange(0, sequence_length, 1)
    angle_rates = 1 / np.power(10000, (2 * (np.arange(0, d_model, 2)) / d_model))
    positional_encoding = np.zeros((sequence_length, d_model))

    for pos in range(sequence_length):
        positional_encoding[pos, 0::2] = np.sin(pos * angle_rates)
        positional_encoding[pos, 1::2] = np.cos(pos * angle_rates)

    return positional_encoding

# Example usage:
sequence_length = 10
d_model = 16
pos_encoding = positional_encoding(sequence_length, d_model)
print("Positional Encoding:", pos_encoding)


Positional Encoding: [[ 0.00000000e+00  1.00000000e+00  0.00000000e+00  1.00000000e+00
   0.00000000e+00  1.00000000e+00  0.00000000e+00  1.00000000e+00
   0.00000000e+00  1.00000000e+00  0.00000000e+00  1.00000000e+00
   0.00000000e+00  1.00000000e+00  0.00000000e+00  1.00000000e+00]
 [ 8.41470985e-01  5.40302306e-01  9.98334166e-02  9.95004165e-01
   9.99983333e-03  9.99950000e-01  9.99999833e-04  9.99999500e-01
   9.99999998e-05  9.99999995e-01  1.00000000e-05  1.00000000e+00
   1.00000000e-06  1.00000000e+00  1.00000000e-07  1.00000000e+00]
 [ 9.09297427e-01 -4.16146837e-01  1.98669331e-01  9.80066578e-01
   1.99986667e-02  9.99800007e-01  1.99999867e-03  9.99998000e-01
   1.99999999e-04  9.99999980e-01  2.00000000e-05  1.00000000e+00
   2.00000000e-06  1.00000000e+00  2.00000000e-07  1.00000000e+00]
 [ 1.41120008e-01 -9.89992497e-01  2.95520207e-01  9.55336489e-01
   2.99955002e-02  9.99550034e-01  2.99999550e-03  9.99995500e-01
   2.99999996e-04  9.99999955e-01  3.00000000e-05  1

In [4]:
import pandas as pd
from collections import Counter

# Function to calculate k-mers and their frequencies
def calculate_kmers(sequences, k):
    kmers_dict = {}
    for seq in sequences:
        kmers = [seq[i:i+k] for i in range(len(seq) - k + 1)]
        kmers_dict[seq] = Counter(kmers)
    return kmers_dict

# Sample RNA sequences
rna_sequences = ["AUUGCAUAGCUAGCGUACGUAGC", "ACGUAGCUGCUAGCUAGCUAGC", "UGCUAGCGUAGCUAGCUAGCUAGC"]

# Calculate k-mers and their frequencies (let's take k=3 for demonstration)
k = 3
kmers_dict = calculate_kmers(rna_sequences, k)

# Convert the dictionary to a DataFrame for better visualization
kmers_df = pd.DataFrame.from_dict(kmers_dict, orient='index').fillna(0).astype(int)
print(f"{k}-mers and their frequencies:")
print(kmers_df)

# Now, let's calculate 2-mers (dinucleotides) and their frequencies
# We'll reuse the calculate_kmers function with k=2
k = 2
di_kmers_dict = calculate_kmers(rna_sequences, k)

# Convert the dictionary to a DataFrame for better visualization
di_kmers_df = pd.DataFrame.from_dict(di_kmers_dict, orient='index').fillna(0).astype(int)
print(f"\n{k}-mers and their frequencies:")
print(di_kmers_df)


3-mers and their frequencies:
                          AUU  UUG  UGC  GCA  CAU  AUA  UAG  AGC  GCU  CUA  \
AUUGCAUAGCUAGCGUACGUAGC     1    1    1    1    1    1    3    3    1    1   
ACGUAGCUGCUAGCUAGCUAGC      0    0    1    0    0    0    4    4    4    3   
UGCUAGCGUAGCUAGCUAGCUAGC    0    0    1    0    0    0    5    5    4    4   

                          GCG  CGU  GUA  UAC  ACG  CUG  
AUUGCAUAGCUAGCGUACGUAGC     1    2    2    1    1    0  
ACGUAGCUGCUAGCUAGCUAGC      0    1    1    0    1    1  
UGCUAGCGUAGCUAGCUAGCUAGC    1    1    1    0    0    0  

2-mers and their frequencies:
                          AU  UU  UG  GC  CA  UA  AG  CU  CG  GU  AC
AUUGCAUAGCUAGCGUACGUAGC    2   1   1   4   1   4   3   1   2   2   1
ACGUAGCUGCUAGCUAGCUAGC     0   0   1   5   0   4   4   4   1   1   1
UGCUAGCGUAGCUAGCUAGCUAGC   0   0   1   6   0   5   5   4   1   1   0


In [19]:
import numpy as np
from collections import Counter
from Bio.SeqUtils import molecular_weight
from itertools import product  # Add this import

# Function to calculate k-mers and their frequencies
def calculate_kmers(sequence, k):
    kmers = [sequence[i:i+k] for i in range(len(sequence) - k + 1)]
    return Counter(kmers)

# Function to calculate dinucleotide composition
def calculate_dinucleotide_composition(sequence):
    dinucleotides = [sequence[i:i+2] for i in range(len(sequence) - 1)]
    return Counter(dinucleotides)

# Function to calculate probability of individual nucleotides
def calculate_nucleotide_probabilities(sequence):
    nucleotide_counts = Counter(sequence)
    total_nucleotides = len(sequence)
    probabilities = {nucleotide: count / total_nucleotides for nucleotide, count in nucleotide_counts.items()}
    return probabilities

# Function to extract features for one sequence
def extract_features(sequence):
    features = {}
    
    # Define all possible 3-mers
    all_3_mers = [''.join(nt) for nt in product('ACGU', repeat=3)]
    
    # Initialize feature vector with zeros for all possible 3-mers
    features.update({mer: 0 for mer in all_3_mers})
    
    # Update feature vector with actual 3-mer counts
    three_mer_counts = calculate_kmers(sequence, 3)
    features.update(three_mer_counts)
    
    # Calculate dinucleotide composition
    di_counts = calculate_dinucleotide_composition(sequence)
    features.update(di_counts)
    
    # Calculate molecular weight
    features['Molecular_Weight'] = molecular_weight(sequence, "RNA")
    
    # Calculate sequence length
    features['Sequence_Length'] = len(sequence)
    
    # Calculate probability of individual nucleotides
    nucleotide_probs = calculate_nucleotide_probabilities(sequence)
    features.update(nucleotide_probs)
    
    return features

# Sample RNA sequence
rna_sequence = "AACTTTCAGCAATGGATTTTTAGGTACCCGGTTCGATGAAGATCGGAGCAAAATCCGAAAAGTAGTGTGAATTGCAGATTTCGCGAATCATCGAATTATCGAACGCATATTGCGCCCCACGGTATTCCGTGGAGCATGCCTGTTTGAGCGTCA"
rna_sequence = rna_sequence.replace('T','U')
# Extract features for the sequence
sequence_features = extract_features(rna_sequence)
print("Features for the RNA sequence:")
print(sequence_features)


Features for the RNA sequence:
{'AAA': 4, 'AAC': 2, 'AAG': 2, 'AAU': 5, 'ACA': 0, 'ACC': 1, 'ACG': 2, 'ACU': 1, 'AGA': 2, 'AGC': 4, 'AGG': 1, 'AGU': 2, 'AUA': 1, 'AUC': 5, 'AUG': 3, 'AUU': 6, 'CAA': 2, 'CAC': 1, 'CAG': 2, 'CAU': 3, 'CCA': 1, 'CCC': 3, 'CCG': 3, 'CCU': 1, 'CGA': 5, 'CGC': 3, 'CGG': 3, 'CGU': 2, 'CUA': 0, 'CUC': 0, 'CUG': 1, 'CUU': 1, 'GAA': 6, 'GAC': 0, 'GAG': 3, 'GAU': 4, 'GCA': 5, 'GCC': 2, 'GCG': 3, 'GCU': 0, 'GGA': 3, 'GGC': 0, 'GGG': 0, 'GGU': 3, 'GUA': 3, 'GUC': 1, 'GUG': 3, 'GUU': 2, 'UAA': 0, 'UAC': 1, 'UAG': 2, 'UAU': 3, 'UCA': 3, 'UCC': 2, 'UCG': 5, 'UCU': 0, 'UGA': 3, 'UGC': 3, 'UGG': 2, 'UGU': 2, 'UUA': 2, 'UUC': 4, 'UUG': 3, 'UUU': 6, 'AA': 13, 'AC': 4, 'CU': 2, 'UU': 15, 'UC': 10, 'CA': 9, 'AG': 9, 'GC': 10, 'AU': 15, 'UG': 10, 'GG': 6, 'GA': 13, 'UA': 6, 'GU': 9, 'CC': 8, 'CG': 13, 'Molecular_Weight': 49281.06889999994, 'Sequence_Length': 153, 'A': 0.27450980392156865, 'C': 0.20915032679738563, 'U': 0.2679738562091503, 'G': 0.24836601307189543}


In [28]:
import numpy as np
from collections import Counter
from Bio.SeqUtils import molecular_weight
from itertools import product  # Add this import
import pandas as pd

# Function to calculate k-mers and their frequencies
def calculate_kmers(sequence, k):
    kmers = [sequence[i:i+k] for i in range(len(sequence) - k + 1)]
    return Counter(kmers)

# Function to calculate dinucleotide composition
def calculate_dinucleotide_composition(sequence):
    dinucleotides = [sequence[i:i+2] for i in range(len(sequence) - 1)]
    return Counter(dinucleotides)

# Function to calculate probability of individual nucleotides
def calculate_nucleotide_probabilities(sequence):
    nucleotide_counts = Counter(sequence)
    total_nucleotides = len(sequence)
    probabilities = {nucleotide: count / total_nucleotides for nucleotide, count in nucleotide_counts.items()}
    return probabilities

# Function to extract features for one sequence
def extract_features(sequence):
    features = {}
    
    # Define all possible 3-mers
    all_3_mers = [''.join(nt) for nt in product('ACGU', repeat=3)]
    
    # Initialize feature vector with zeros for all possible 3-mers
    features.update({mer: 0 for mer in all_3_mers})
    
    # Update feature vector with actual 3-mer counts
    three_mer_counts = calculate_kmers(sequence, 3)
    features.update(three_mer_counts)
    
    all_di_mers = [''.join(nt) for nt in product('ACGU', repeat=2)]
    features.update({mer: 0 for mer in all_di_mers})
    # Calculate dinucleotide composition
    di_counts = calculate_dinucleotide_composition(sequence)
    features.update(di_counts)
    
    # Calculate molecular weight
    features['Molecular_Weight'] = molecular_weight(sequence, "RNA")
    
    # Calculate sequence length
    features['Sequence_Length'] = len(sequence)
    
    # Calculate probability of individual nucleotides
    nucleotide_probs = calculate_nucleotide_probabilities(sequence)
    features.update(nucleotide_probs)
    
    return features

# Read the file and extract features for each sequence
features_list = []
rna_name = None
with open(r"E:\4th sem\IOBS-2\project\Non Coding RNA classification\data\Test_0", "r") as file:
    for line in file:
        if line.startswith('>'):
            # Extract RNA name
            rna_name = line.strip().replace('>', '')
        else:
            # Extract RNA sequence and replace T with U
            rna_sequence = line.strip().replace('T', 'U')
            rna_sequence = rna_sequence.replace('N','A')
            rna_sequence = rna_sequence.replace('W','A')
            rna_sequence = rna_sequence.replace('R','A')
            rna_sequence = rna_sequence.replace('K','A')
            rna_sequence = rna_sequence.replace('M','A')
            rna_sequence = rna_sequence.replace('S','A')
            rna_sequence = rna_sequence.replace('Y','A')
            # Extract features for the sequence
            sequence_features = extract_features(rna_sequence)
            # Add RNA name to features
            sequence_features['RNA_Name'] = rna_name
            # Append features to the list
            features_list.append(sequence_features)

# Convert the list of features into a DataFrame
features_df = pd.DataFrame(features_list)

# Save features as a CSV file
features_df.to_csv("rna_sequence_features.csv", index=False)


In [29]:
[''.join(nt) for nt in product('ACGU', repeat=2)]

['AA',
 'AC',
 'AG',
 'AU',
 'CA',
 'CC',
 'CG',
 'CU',
 'GA',
 'GC',
 'GG',
 'GU',
 'UA',
 'UC',
 'UG',
 'UU']