In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
!pip install biopython

Collecting biopython
  Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m32.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.85


In [6]:
import pandas as pd
from collections import Counter
from Bio import SeqIO
import os


## Step 1: Load and process HMDD data

In [7]:
fasta_path = "/content/drive/MyDrive/Colab Notebooks/final_year_project/mature.fa"

seq_dict = {}
for record in SeqIO.parse(fasta_path, "fasta"):
    seq_dict[record.id.split()[0]] = str(record.seq)

seq_df = pd.DataFrame(seq_dict.items(), columns=["miRNA", "sequence"])
seq_df.head()


Unnamed: 0,miRNA,sequence
0,cel-let-7-5p,UGAGGUAGUAGGUUGUAUAGUU
1,cel-let-7-3p,CUAUGCAAUUUUCUACCUUACC
2,cel-lin-4-5p,UCCCUGAGACCUCAAGUGUGA
3,cel-lin-4-3p,ACACCUGGGCUCUCCGGGUACC
4,cel-miR-1-5p,CAUACUUCCUUACAUGCCCAUA


##Step 2: Read miRNA sequences from miRBase (FASTA)

In [9]:
hmdd_path = "/content/drive/MyDrive/Colab Notebooks/final_year_project/alldata.txt"
hmdd = pd.read_csv(hmdd_path, sep="\t", names=["miRNA", "Disease", "Evidence", "Year"], encoding='latin-1')
hmdd['label'] = hmdd['Disease'].str.contains('cancer|Carcinoma|Tumor|Leukemia', case=False, na=False)
hmdd_labels = hmdd[['miRNA', 'label']].drop_duplicates()
hmdd_labels.head()


Unnamed: 0,miRNA,label
category,mir,False
circulation_biomarker_diagnosis_down,hsa-mir-15a,True
circulation_biomarker_diagnosis_down,hsa-mir-16,True
circulation_biomarker_diagnosis_down,hsa-mir-143,False
circulation_biomarker_diagnosis_down,hsa-mir-145,False


##Step 3: Merge labels with sequences

In [10]:
print("From HMDD:", hmdd_labels['miRNA'].head(10).tolist())
print("From miRBase:", seq_df['miRNA'].head(10).tolist())


From HMDD: ['mir', 'hsa-mir-15a', 'hsa-mir-16', 'hsa-mir-143', 'hsa-mir-145', 'hsa-mir-223', 'hsa-mir-29a', 'hsa-mir-29c', 'hsa-mir-10b', 'hsa-mir-125b-1']
From miRBase: ['cel-let-7-5p', 'cel-let-7-3p', 'cel-lin-4-5p', 'cel-lin-4-3p', 'cel-miR-1-5p', 'cel-miR-1-3p', 'cel-miR-2-5p', 'cel-miR-2-3p', 'cel-miR-34-5p', 'cel-miR-34-3p']


In [11]:
# Step 1: Lowercase all miRNA names to match HMDD
seq_df['miRNA'] = seq_df['miRNA'].str.lower()

# Step 2: Remove -5p/-3p suffixes and other trailing parts
seq_df['miRNA_clean'] = seq_df['miRNA'].str.extract(r'(^hsa-mir-[a-z0-9]+)')

# Step 3: Same for HMDD — convert to lowercase too
hmdd_labels['miRNA'] = hmdd_labels['miRNA'].str.lower()

# Step 4: Merge again
merged_df = pd.merge(hmdd_labels, seq_df, left_on='miRNA', right_on='miRNA_clean')
merged_df = merged_df.dropna()
merged_df.head()





Unnamed: 0,miRNA_x,label,miRNA_y,sequence,miRNA_clean
0,hsa-mir-15a,True,hsa-mir-15a-5p,UAGCAGCACAUAAUGGUUUGUG,hsa-mir-15a
1,hsa-mir-15a,True,hsa-mir-15a-3p,CAGGCCAUAUUGUGCUGCCUCA,hsa-mir-15a
2,hsa-mir-16,True,hsa-mir-16-5p,UAGCAGCACGUAAAUAUUGGCG,hsa-mir-16
3,hsa-mir-16,True,hsa-mir-16-1-3p,CCAGUAUUAACUGUGCUGCUGA,hsa-mir-16
4,hsa-mir-16,True,hsa-mir-16-2-3p,CCAAUAUUACUGUGCUGCUUUA,hsa-mir-16


In [12]:
# Merge miRNA labels and sequences
merged_df = pd.merge(hmdd_labels, seq_df, on="miRNA")
merged_df = merged_df.dropna()
merged_df.head()


Unnamed: 0,miRNA,label,sequence,miRNA_clean
0,hsa-mir-346,False,UGUCUGCCCGCAUGCCUGCCUCU,hsa-mir-346
1,hsa-mir-484,False,UCAGGCUCAGUCCCCUCCCGAU,hsa-mir-484
2,hsa-mir-518b,False,CAAAGCGCUCCCCUUUAGAGGU,hsa-mir-518b
3,hsa-mir-557,False,GUUUGCACGGGUGGGCCUUGUCU,hsa-mir-557
4,hsa-mir-596,False,AAGCCUGCCCGGCUCCUCGGG,hsa-mir-596


##Step 4: Feature engineering (GC content + sequence length)

In [13]:
# Define GC content function
def gc_content(seq):
    gc = seq.count('G') + seq.count('C')
    return gc / len(seq)

# Apply features
merged_df['gc_content'] = merged_df['sequence'].apply(gc_content)
merged_df['length'] = merged_df['sequence'].apply(len)

merged_df[['miRNA', 'label', 'gc_content', 'length']].head()


Unnamed: 0,miRNA,label,gc_content,length
0,hsa-mir-346,False,0.652174,23
1,hsa-mir-484,False,0.636364,22
2,hsa-mir-518b,False,0.545455,22
3,hsa-mir-557,False,0.608696,23
4,hsa-mir-596,False,0.761905,21


In [14]:
# Split the merged DataFrame into positive and negative samples
merged_df_pos = merged_df[merged_df['label'] == True]
merged_df_neg = merged_df[merged_df['label'] == False]

# Combine both positive and negative samples into one dataset
final_df = pd.concat([merged_df_pos, merged_df_neg], ignore_index=True)

# Convert label from True/False to 1/0 (optional but recommended)
final_df['label'] = final_df['label'].astype(int)

# Preview the class distribution
print(final_df['label'].value_counts())
final_df.head()


label
0    291
1    220
Name: count, dtype: int64


Unnamed: 0,miRNA,label,sequence,miRNA_clean,gc_content,length
0,hsa-mir-575,1,GAGCCAGUUGGACAGGAGC,hsa-mir-575,0.631579,19
1,hsa-mir-451a,1,AAACCGUUACCAUUACUGAGUU,hsa-mir-451a,0.363636,22
2,hsa-mir-484,1,UCAGGCUCAGUCCCCUCCCGAU,hsa-mir-484,0.636364,22
3,hsa-mir-1297,1,UUCAAGUAAUUCAGGUG,hsa-mir-1297,0.352941,17
4,hsa-mir-5100,1,UUCAGAUCCCAGCGGUGCCUCU,hsa-mir-5100,0.590909,22


##Step 5: Save cleaned dataset for modeling


In [19]:
# Define GC content function
def gc_content(seq):
    gc = seq.count('G') + seq.count('C')
    return gc / len(seq)

# Apply features
merged_df['gc_content'] = merged_df['sequence'].apply(gc_content)
merged_df['length'] = merged_df['sequence'].apply(len)

# Use the correct column name ('miRNA' or 'miRNA_clean') instead of 'miRNA_x'
merged_df[['miRNA', 'label', 'gc_content', 'length']].head()  # Use 'miRNA'
# or
# merged_df[['miRNA_clean', 'label', 'gc_content', 'length']].head() # Use 'miRNA_clean' if that's the correct column name from the merge
# Split the merged DataFrame into positive and negative samples
merged_df_pos = merged_df[merged_df['label'] == True]
merged_df_neg = merged_df[merged_df['label'] == False]

# Combine both positive and negative samples into one dataset
final_df = pd.concat([merged_df_pos, merged_df_neg], ignore_index=True)

# Convert label from True/False to 1/0 (optional but recommended)
final_df['label'] = final_df['label'].astype(int)

# Preview the class distribution
print(final_df['label'].value_counts())
final_df.head()

# Save processed dataset as CSV
# Again, use the appropriate column name here, either 'miRNA' or 'miRNA_clean'
merged_df[['miRNA', 'label', 'gc_content', 'length']].to_csv("mirna_ml_dataset.csv", index=False)  # Use 'miRNA'
# or
# merged_df[['miRNA_clean', 'label', 'gc_content', 'length']].to_csv("mirna_ml_dataset.csv", index=False)  # Use 'miRNA_clean'

label
0    291
1    220
Name: count, dtype: int64


In [21]:
# Define AU content function # This line was added
def au_content(seq):
    au = seq.count('A') + seq.count('U')  # or seq.count('T') depending on your data
    return au / len(seq)

# Feature extraction
final_df['length'] = final_df['sequence'].apply(len)
final_df['gc_content'] = final_df['sequence'].apply(gc_content)
final_df['au_content'] = final_df['sequence'].apply(au_content)

# Export for modeling
# Changed 'miRNA_x' to 'miRNA' as 'miRNA_x' might not exist. If you intended to use another column, update accordingly.
final_df[['miRNA', 'label', 'length', 'gc_content', 'au_content']].to_csv("mirna_features.csv", index=False)


In [22]:
from google.colab import files
files.download("mirna_features.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

This feature dataset will be used as input for the model training notebook (02_model_training.ipynb).
