In [29]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import LabelEncoder
import numpy as np

# --- Configuration ---------------------------------------------------------
DATA_FILE = "/content/drive/MyDrive/Hackathon/fine mast data.csv"
KMER_SIZE = 2         # k-mer length handed to kmer_split by the vectorizer
TOP_N = 3             # number of neighbours requested from NearestNeighbors
SIM_THRESHOLD = 0.3   # a best similarity below this is reported as "Unknown"

# --- Load and reshape ------------------------------------------------------
df = pd.read_csv(DATA_FILE)

# Column 0 is kept as the identifier; columns 1-5 are unpivoted so that every
# non-empty cell becomes its own (id, source_col, sequence) row.
seq_cols = df.columns[1:6]
long_df = pd.melt(
    df,
    id_vars=[df.columns[0]],
    value_vars=seq_cols,
    var_name="source_col",
    value_name="sequence",
)

# Discard missing cells first, then normalise to stripped strings and remove
# anything that is blank after stripping.
long_df.dropna(subset=["sequence"], inplace=True)
cleaned = long_df["sequence"].astype(str).str.strip()
long_df["sequence"] = cleaned
long_df = long_df[long_df["sequence"].ne("")]

def kmer_split(seq, k=2):
    """Return the overlapping length-``k`` substrings (k-mers) of ``seq``.

    Windows are taken at every start position, left to right. When ``seq`` is
    shorter than ``k`` no full window fits, so the whole sequence is returned
    as the single token.

    Args:
        seq: Sequence to tokenise.
        k: Window length.

    Returns:
        List of k-mers in order of appearance, or ``[seq]`` if
        ``len(seq) < k``.
    """
    length = len(seq)
    if length < k:
        return [seq]

    last_start = length - k
    return [seq[start:start + k] for start in range(last_start + 1)]

# Bag-of-k-mers features: the callable analyzer makes CountVectorizer count
# the tokens produced by kmer_split instead of using its built-in tokenizer.
vectorizer = CountVectorizer(analyzer=lambda text: kmer_split(text, k=KMER_SIZE))
X = vectorizer.fit_transform(long_df["sequence"])

# Neighbour index over the count vectors, using cosine distance; rows of X are
# in the same positional order as the rows of long_df.
nn = NearestNeighbors(metric="cosine", n_neighbors=TOP_N)
nn.fit(X)
def predict_sequence(seq):
    """Look up the stored sequences most similar to ``seq`` and pick a best ID.

    The query is vectorized with the module-level ``vectorizer`` and searched
    in the fitted ``nn`` index. The index uses the cosine metric, so each
    similarity is computed as ``1 - distance``.

    Args:
        seq: Query sequence string.

    Returns:
        A ``(best_id, results)`` tuple. ``results`` is a list of dicts, in the
        order returned by ``nn.kneighbors``, each with keys ``"ID"`` (the
        first-column value of the matching ``long_df`` row) and
        ``"Similarity"`` (rounded to 3 decimals for display). ``best_id`` is
        the ID of the first result, or the string ``"Unknown"`` when that
        result's similarity is below ``SIM_THRESHOLD``.
    """
    vec = vectorizer.transform([seq])
    distances, indices = nn.kneighbors(vec)
    similarities = 1 - distances[0]

    # Neighbour indices are positional row numbers of the fitted matrix, which
    # was built from long_df in order, hence the positional .iloc lookup.
    results = [
        {"ID": long_df.iloc[idx, 0], "Similarity": round(float(sim), 3)}
        for idx, sim in zip(indices[0], similarities)
    ]

    # Apply the threshold to the raw similarity, not the rounded display value:
    # rounding can lift e.g. 0.2996 to 0.3 and let a below-threshold hit pass.
    if float(similarities[0]) < SIM_THRESHOLD:
        return "Unknown", results

    return results[0]["ID"], results
# Interactive driver: read one query sequence, then report the chosen ID and
# every neighbour that was returned with its similarity score.
user_seq = input("Enter sequence: ").strip()
best_id, matches = predict_sequence(user_seq)

print("\nBest Match:", best_id)
print("\nTop Matches:")
for hit in matches:
    hit_id, hit_similarity = hit["ID"], hit["Similarity"]
    print(f"{hit_id}  | Similarity: {hit_similarity}")

Enter sequence: dccsyedrreirhiwddvwsssftdrrvaivravfddlfkhyptskalfervkidepesgefkshlvrvanglkllinllddtlvlqshlghladqhiqrkgvtkeyfrgigeafarvlpqvlscfnvdawnrcfhrlvariakdl

Best Match: d1x9fa_ a.1.1.2 (A:) Extracellular dodecameric hemoglobin (erythrocruorin)

Top Matches:
d1x9fa_ a.1.1.2 (A:) Extracellular dodecameric hemoglobin (erythrocruorin)  | Similarity: 0.998
d1blxa_ d.144.1.7 (A:) Cyclin-dependent PK  | Similarity: 0.548
d2btva_ e.28.1.1 (A:) BTV vp3 {Bluetongue virus  | Similarity: 0.547
