In [15]:
pip install pysam
pip install biopython

Collecting biopython
  Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.3/3.3 MB 22.5 MB/s eta 0:00:00
Installing collected packages: biopython
Successfully installed biopython-1.85


In [None]:
import pandas as pd
import numpy as np
import gzip
import pysam
from Bio import SeqIO
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')


def _first_alt_count(value):
    """Return the allele count belonging to the first ALT allele as an int.

    INFO fields declared with one value per ALT allele (such as AC) may be
    returned as a tuple. Only the first ALT allele is kept per record below,
    so the matching count is the first element. Missing or non-numeric values
    are mapped to 0.
    """
    if isinstance(value, (tuple, list)):
        value = value[0] if value else None
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        return 0


# Load mutation data from VCF file.
# The context manager closes the file handle once every record has been read.
mutation_data = []
with pysam.VariantFile('/content/drive/My Drive/Colab Notebooks/raw_variants.vcf.gz', 'r') as data1:
    for record in data1.fetch():
        chrom = record.chrom
        pos = record.pos
        ref = record.ref
        # Keep only the first ALT allele; 'N' marks records without one.
        alt = record.alts[0] if record.alts else 'N'
        dp = record.info.get('DP', 0)
        # Parse AC numerically instead of concatenating the digits of its
        # string form (which turned (1, 2) into 12 and 2.0 into 20).
        ac = _first_alt_count(record.info.get('AC', 0))
        fs = record.info.get('FS', 0)
        mutation_data.append([chrom, pos, ref, alt, dp, ac, fs])

if not mutation_data:
    raise ValueError("No variants loaded. Check the VCF file.")

mutation_df = pd.DataFrame(mutation_data, columns=["Chromosome", "Position", "Ref", "Alt", "DP", "AC", "FS"])
mutation_df['AC'] = mutation_df['AC'].astype(int)


# Build the binary target column: 1 when the allele count exceeds 1, else 0.
# NOTE(review): the notebook reads 1 as "Alzheimer's" and 0 as "healthy", but
# the value is computed from AC alone, not from any phenotype data — confirm
# that this proxy is really what is intended.
mutation_df['label'] = mutation_df['AC'].gt(1).astype(int)
y = mutation_df['label']

# Read every record of the gzipped FASTA file and keep its sequence as a
# plain Python string.
sequences = []
with gzip.open("/content/drive/My Drive/Colab Notebooks/Homo_sapiens.GRCh38.dna.chromosome.1.fa.gz", "rt") as handle:
    sequences.extend(str(entry.seq) for entry in SeqIO.parse(handle, "fasta"))

# Stop early when the file produced no records at all.
if len(sequences) == 0:
    raise ValueError("No sequences loaded. Check the FASTA file.")

def one_hot_encode(sequence, length=1000):
    """One-hot encode a nucleotide string into a fixed-size flat vector.

    Args:
        sequence: Nucleotide string. Upper- and lower-case letters are
            treated alike; anything other than A/T/C/G encodes as all zeros.
        length: Number of positions to encode. Longer input is truncated and
            shorter input is zero-padded on the right, so the result always
            has the same size.

    Returns:
        1-D integer array of ``length * 4`` values, four per position in the
        column order A, T, C, G.
    """
    encoding = {'A': [1, 0, 0, 0], 'T': [0, 1, 0, 0], 'C': [0, 0, 1, 0], 'G': [0, 0, 0, 1]}
    unknown_encoding = [0, 0, 0, 0]
    seq = sequence[:length].upper()
    # Start from all zeros so positions past the end of a short sequence are
    # padded and every call yields a vector of identical length.
    encoded = np.zeros((length, 4), dtype=int)
    if seq:
        encoded[:len(seq)] = [encoding.get(nuc, unknown_encoding) for nuc in seq]
    return encoded.flatten()

# Extract and encode sequence features.
# Each variant gets the reference window around its own position rather than
# a window addressed by its row number in the DataFrame. VCF positions are
# 1-based, so `Position - 1` is the 0-based index into the reference string.
# NOTE(review): only the first FASTA record is used as the reference, so a
# variant on any other contig is still looked up in it — confirm the VCF is
# restricted to that contig, or filter it before this step.
window = 1000  # same size as the default `length` of one_hot_encode
reference = sequences[0]
# Largest start that still leaves a full window; 0 if the reference is short.
last_start = max(len(reference) - window, 0)

sequence_features = []
for pos in mutation_df['Position']:
    # Centre the window on the variant, clamped to the ends of the reference.
    start = min(max(int(pos) - 1 - window // 2, 0), last_start)
    sequence_features.append(one_hot_encode(reference[start:start + window]))

sequence_features = np.array(sequence_features)

# Ensure data alignment.
# AC is deliberately kept out of the feature matrix: the label is computed
# directly from it (AC > 1), so using it as a feature would let the model
# read the answer and make the reported accuracy meaningless.
X_mutation = mutation_df[['DP', 'FS']]
if X_mutation.shape[0] != sequence_features.shape[0]:
    raise ValueError("Mismatch between mutation data and sequence data size!")

# Per-variant statistics first, followed by the flattened one-hot window.
X = np.concatenate([X_mutation.values, sequence_features], axis=1)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a 100-tree random forest on the training portion (fit returns the
# estimator itself, so construction and fitting are chained).
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# Predict the held-out rows, then report overall accuracy and the per-class
# precision/recall summary.
y_pred = rf_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print(classification_report(y_test, y_pred))


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
