In [None]:
!pip install Datasets MinHash sentence_transformers sourmash

In [None]:
import pandas as pd
import seaborn as sns
from datasets import load_dataset, Dataset
from sentence_transformers import SentenceTransformer, util
import matplotlib.pyplot as plt
from sourmash import MinHash
import random

In [None]:
# Load the dataset and work on its 'train' split as a pandas DataFrame.
dataset = load_dataset("Hack90/virus_dna_dataset")
data = dataset['train'].to_pandas()

# Keep one row per distinct sequence, restricted to seq_length strictly
# between 5,000 and 50,000. The copy is taken once, after all filtering, so
# `data` is an independent frame and later column assignments on it do not
# raise SettingWithCopyWarning.
data = data.drop_duplicates(subset=['sequence'])
length_in_range = (data['seq_length'] > 5_000) & (data['seq_length'] < 50_000)
data = data[length_in_range].copy()

Downloading readme:   0%|          | 0.00/3.44k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/175M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/117M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/102M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/101M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/369M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/106M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/69.8M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/102M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/109M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/95.6M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/104M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/111M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/642M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/116M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/2602437 [00:00<?, ? examples/s]

In [None]:
# Histogram of the seq_length column with a KDE overlay.
plt.figure(figsize=(10, 6))
sns.set_style("whitegrid")
ax = sns.histplot(data=data, x='seq_length', bins=30, kde=True, color="skyblue")

# Title and axis labels, set on the Axes that histplot drew into
ax.set_title('Distribution of Sequence Lengths', fontsize=16)
ax.set_xlabel('Sequence Length', fontsize=14)
ax.set_ylabel('Frequency', fontsize=14)

# Drop the top and right spines for a cleaner look
sns.despine()

# Tighten the layout and render the figure
plt.tight_layout()
plt.show()

In [None]:
# Count 'n'/'N' characters per sequence. The pattern is a regex character
# class so the count is case-insensitive: sequences are only upper-cased in a
# later cell, and a lowercase-only count would miss any uppercase 'N'.
missing_seq_count = data['sequence'].str.count('[nN]')

# .assign builds a new frame instead of writing columns into what may be a
# filtered slice, which avoids SettingWithCopyWarning.
data = data.assign(
    missing_seq_count=missing_seq_count,
    missingness=missing_seq_count / data['seq_length'],
)

# Keep only sequences where fewer than 1% of positions are 'n'/'N'.
data = data[data['missingness'] < 0.01].copy()

In [None]:
def replace_non_nucleotide_with_random(seq):
    """Return ``seq`` with every character other than A/T/C/G replaced by a random base.

    The membership test is case-sensitive, so lowercase letters are treated
    as non-nucleotides. Each replacement is an independent ``random.choice``
    draw from the module-level ``random`` generator; characters that are
    already 'A', 'T', 'C' or 'G' are passed through unchanged.
    """
    bases = ('A', 'T', 'C', 'G')
    filled = []
    for symbol in seq:
        if symbol in bases:
            filled.append(symbol)
        else:
            filled.append(random.choice(bases))
    return ''.join(filled)
# Upper-case every sequence: replace_non_nucleotide_with_random compares
# against 'A', 'T', 'C', 'G' case-sensitively, so lowercase bases would
# otherwise be treated as non-nucleotides and randomised.
data['sequence'] = data['sequence'].str.upper()

In [None]:
# Seed the global RNG so the random fill, and everything derived from it
# downstream, is reproducible across kernel restarts.
random.seed(42)

# Replace every non-A/T/C/G character with a randomly chosen base.
data['seq_filled'] = data['sequence'].apply(replace_non_nucleotide_with_random)

In [None]:
sequences = data['seq_filled'].to_list()

# MinHash sketch parameters and the Jaccard cut-off above which two
# sequences are treated as near-duplicates.
MINHASH_NUM_HASHES = 1000
MINHASH_KSIZE = 7
SIMILARITY_THRESHOLD = 0.9

# Create a list of MinHash signatures, one per sequence (same order as `sequences`)
signatures = []
for sequence in sequences:
    signature = MinHash(n=MINHASH_NUM_HASHES, ksize=MINHASH_KSIZE)
    signature.add_sequence(sequence)
    signatures.append(signature)

# Greedy de-duplication: walk the sequences in order and keep one only if its
# signature is not too similar to any signature that has already been kept.
unique_signatures = []
unique_sequences = []

for sequence, signature in zip(sequences, signatures):
    # A generator (not a list) lets any() stop at the first similar match
    # instead of comparing against every kept signature.
    is_similar = any(
        signature.jaccard(kept_signature) > SIMILARITY_THRESHOLD
        for kept_signature in unique_signatures
    )

    if not is_similar:
        unique_signatures.append(signature)
        unique_sequences.append(sequence)

In [None]:
# Number of sequences kept after the MinHash similarity filter
len(unique_sequences)

10884

In [None]:
# Re-attach the original metadata to each kept sequence by joining the kept
# (filled) sequences against data['seq_filled']. Both frames carry a
# 'sequence' column, so pandas' default merge suffixes apply to them
# ('sequence_x' from the left frame, 'sequence_y' from `data`).
unique_dataset = (
    pd.DataFrame({'sequence': unique_sequences})
    .assign(similarity_filter=0.9)
    .merge(data, how='left', left_on='sequence', right_on='seq_filled')
)

# Convert the merged frame into a Hugging Face Dataset.
unique_dataset = Dataset.from_pandas(unique_dataset)

In [None]:
# Log in to the Hugging Face Hub through its CLI, then upload the
# de-duplicated dataset to the named Hub repository.
!huggingface-cli login
unique_dataset.push_to_hub('Hack90/virus_dna_dedup_minihash_0.9_kmer_7')