In [106]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import (
    roc_auc_score,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix
)
from sklearn.model_selection import StratifiedKFold, cross_val_predict
import random

In [107]:
seq = "agattgcaggcataagaaccaagcacagaccttccggcgacaataagcgataggattttgactggtagacagccgtactgtctttgtcaacacgtcaccttatcccaagtacttcagtagtttcaacgggactgcgtaaccgaccgtacactgactggtgaagacactattaggtcattgcgaacaacggaagatgcccacacatcaaaaggtttttttggactttagaatatccaaggtttattccccgaagatcgttctctgatgtgcctttaagagctgtactgctcggtatggaggagaaggaagtgtctattaactccctttgcccacgcatgataatgtgagtacagtacacccgaacggcctaagtcctcacattctctatcccgttactaaggcaaacgaagtctattcgggagcacaacaaccagctaacgctgaagcgaacccctacaaacgcaccgggctcttgatacggcggatggttcgccatggttgcggggattcaacagggacactatatggt".upper()
k = 4 # in (3,10)
test_count = 400

In [108]:
def count_kmeters(seq, k):
    kmeters = {}
    def construct(seq, n):
        if n == 0:
            kmeters[seq] = 0
            return
        for c in ['A', 'C', 'G', 'T']:
            construct(seq + c, n - 1)
    construct("", k)
        
    for i in range(len(seq) - k):
        kmeters[seq[i:i+k]] += 1

    for k, v in kmeters.items():
        kmeters[k] = v / (len(k))
        
    def get_complement(c):
        if c == 'A':
            return 'T'
        if c == 'T':
            return 'A'
        if c == 'C':
            return 'G'
        if c == 'G':
            return 'C'
        return 'N'

    def get_complement_str(s):
        return "".join([get_complement(c) for c in s][::-1])

    to_delete = set()
    for k, v in kmeters.items():
        if k in to_delete:
            continue
        c = get_complement_str(k)
        if c in kmeters and c != k:
            kmeters[k] += kmeters[c]
            to_delete.add(c)
            
    for k in to_delete:
        del kmeters[k]

    return kmeters

In [109]:
kmeters = count_kmeters(seq, k)
len(kmeters)

136

In [110]:
df = pd.read_csv("data/experiments.tsv", sep="\t")
data = df[(df["curation_status"] == "positive") | (df["curation_status"] == "negative")]
data = data.sample(frac=1).reset_index(drop=True)
data = data.loc[data['seq'].apply(lambda x: isinstance(x, str))]
data["seq"] = data["seq"].apply(lambda x: x.upper())
data = data.loc[data['seq'].apply(lambda x: set(x).issubset(set("ATCG")))]
pos_data = data[data["curation_status"] == "positive"]
neg_data = data[data["curation_status"] == "negative"]
print(len(df), len(data), len(pos_data), len(neg_data))

4646 4302 2300 2002


Negative data V2

In [111]:
length = int(np.mean([len(x) for x in data["seq"]]))
num_samples = len(neg_data)
length

2245

In [112]:
def sample_from_fasta(file_path, num_samples, length):
    sampled_sequences = []
    current_sequence = ""
    
    with open(file_path, 'r') as file:
        for line in file:
            if line.startswith('>'):
                current_sequence = ""
                continue
            
            line = line.strip()
            if set(line) - {'A', 'C', 'T', 'G'}:
                current_sequence = ""
                continue
            
            current_sequence += line
            while len(current_sequence) >= length and len(sampled_sequences) < num_samples:
                start = random.randint(0, len(current_sequence) - length)
                sampled_sequences.append(current_sequence[start:start + length])
                current_sequence = current_sequence[start + length:]
            
            if len(sampled_sequences) >= num_samples:
                break

    return sampled_sequences

In [113]:
sampled_sequences = sample_from_fasta("data/GRCh38.p14.genome.fa", num_samples, length)

In [133]:
with open("data/negative.txt", "w") as file:
    file.write("\n".join(sampled_sequences))

In [114]:
print(len(sampled_sequences), len(neg_data))

2002 2002


In [115]:
X_test = np.concatenate([
    pos_data["seq"][:test_count].values,
    neg_data["seq"][:test_count].values
])
X_test = [count_kmeters(x, k) for x in X_test]
X_test = [list(x.values()) for x in X_test]
y_test = [1] * test_count + [0] * test_count

X_train = np.concatenate([
    pos_data["seq"][test_count:].values,
    neg_data["seq"][test_count:].values
])
X_train = [count_kmeters(x, k) for x in X_train]
X_train = [list(x.values()) for x in X_train]
y_train = [1] * (len(pos_data) - test_count) + [0] * (len(neg_data) - test_count)

In [116]:
X_test2 = np.concatenate([
    pos_data["seq"][:test_count].values,
    sampled_sequences[:test_count]
])
X_test2 = [count_kmeters(x, k) for x in X_test2]
X_test2 = [list(x.values()) for x in X_test2]
y_test2 = [1] * test_count + [0] * test_count

X_train2 = np.concatenate([
    pos_data["seq"][test_count:].values,
    sampled_sequences[test_count:]
])
X_train2 = [count_kmeters(x, k) for x in X_train2]
X_train2 = [list(x.values()) for x in X_train2]
y_train2 = [1] * (len(pos_data) - test_count) + [0] * (len(neg_data) - test_count)

# Model

In [117]:
def score(X_train, X_test, y_train, y_test):
    svc = SVC(probability=True)
    cv = StratifiedKFold(n_splits=10)
    y_pred_prob = cross_val_predict(svc, X_train, y_train, cv=cv, method='predict_proba')[:, 1]
    y_pred = cross_val_predict(svc, X_train, y_train, cv=cv)

    auc = roc_auc_score(y_train, y_pred_prob)
    accuracy = accuracy_score(y_train, y_pred)
    precision = precision_score(y_train, y_pred)
    recall = recall_score(y_train, y_pred)
    f1 = f1_score(y_train, y_pred)
    conf_matrix = confusion_matrix(y_train, y_pred)

    print(f"AUC-ROC: {auc:.4f}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")
    print("Confusion Matrix:")
    print(conf_matrix)

In [118]:
score(X_train, X_test, y_train, y_test)

AUC-ROC: 0.6361
Accuracy: 0.6057
Precision: 0.6028
Recall: 0.8011
F1-Score: 0.6879
Confusion Matrix:
[[ 599 1003]
 [ 378 1522]]


In [120]:
score(X_train2, X_test2, y_train2, y_test2)

AUC-ROC: 0.9916
Accuracy: 0.9612
Precision: 0.9696
Recall: 0.9584
F1-Score: 0.9640
Confusion Matrix:
[[1545   57]
 [  79 1821]]


In [122]:
print(len(X_train), len(X_test), len(y_train), len(y_test))
print(len(X_train2), len(X_test2), len(y_train2), len(y_test2))

3502 800 3502 800
3502 800 3502 800
