# **PRIMER MACHINE LEARNING MODEL**

## **PREPARATION**

In [171]:
# Library
from collections import deque
import math
import numpy as np
import pandas as pd
import os

# random -> for testing
import random


In [172]:
# DEFINING VALUES (for future development)
# Lower/upper limits for the three size classes. The process cell derives each
# search span from an *_UPPER_BOUND minus two PRIMER_LENGTH_LOWER_BOUND.
# NOTE(review): units are presumably base pairs -- confirm.
# NOTE(review): nothing covers 150-200 (MEDIUM ends at 150, LONG starts at 200)
# -- confirm the gap is intended.
SHORT_LOWER_BOUND = 80
SHORT_UPPER_BOUND = 100
MEDIUM_LOWER_BOUND = 100
MEDIUM_UPPER_BOUND = 150
LONG_LOWER_BOUND = 200
LONG_UPPER_BOUND = 250
# Primer length limits; the lower bound is also the default `start_point` of
# find_cpg_intervals.
PRIMER_LENGTH_LOWER_BOUND = 18
PRIMER_LENGTH_UPPER_BOUND = 22
# Target melting-temperature range. NOTE(review): presumably degrees Celsius.
OPTIMAL_TM_LOWER_BOUND = 58
OPTIMAL_TM_UPPER_BOUND = 60
# Default `min_threshold` of find_cpg_intervals (minimum CpG positions per
# reported interval); the process cell scales it by 1x/2x/3x.
CPG_THRESHOLD = 5

# DEFINING VALUES (for current development)
# Fixed lengths; not referenced by any code visible in this notebook section.
EXACTA_SHORT_LENGTH = 95
EXACTA_MEDIUM_LENGTH = 150
EXACTA_LONG_LENGTH = 250
EXACTA_PRIMER_LENGTH = 22

# DEFINING VALUES (for easier development)
# The four base letters, in the index order A=0, C=1, G=2, T=3.
DICT_GENOME_KEY = ["A","C","G","T"]
# Zeroed per-base tally template; copy it before counting so it stays at zero.
DICT_GENOME = {"A":0,"C":0,"G":0,"T":0}
# Two-letter pattern that marks a CpG site.
KEY_CPG = "CG"

### **Class** (only if needed; probably not)

In [173]:
class Primer:
    """One primer candidate: a category label, two coordinates and a sequence string."""

    # Two-letter pattern counted by get_cpg_count().
    KEY_CPG = "CG"

    def __init__(self, category: str, start: int, end: int, genome: str):
        """Store the four values unchanged on the instance."""
        self.category, self.start, self.end = category, start, end
        self.genome = genome

    def get_cpg_count(self) -> int:
        """Return how many two-character slices of `genome` equal KEY_CPG."""
        sequence = self.genome
        dinucleotides = (sequence[idx:idx + 2] for idx in range(len(sequence) - 1))
        # Each match contributes True (== 1) to the total.
        return sum(pair == self.KEY_CPG for pair in dinucleotides)

    def __repr__(self):
        """Return a one-line summary: category, start, end and CpG count."""
        cpg_count = self.get_cpg_count()
        return f"Primer(category={self.category}, start={self.start}, end={self.end}, cpg_count={cpg_count})"

    def toDictionary(self):
        """Return the primer as a dict with keys Category, Start, End, CpG."""
        summary = dict(Category=self.category, Start=self.start, End=self.end)
        summary["CpG"] = self.get_cpg_count()
        return summary


In [174]:
class PrimerSet:
    """Bundle of four primers (f1, r1, r2, r3) with per-primer and total CpG counts."""

    # Annotations are quoted so the class can be defined even when Primer has
    # not been defined yet (e.g. notebook cells run out of order).
    def __init__(self, f1: "Primer", r1: "Primer", r2: "Primer", r3: "Primer"):
        """Store the four primers unchanged on the instance."""
        self.f1 = f1
        self.r1 = r1
        self.r2 = r2
        self.r3 = r3

    def _named_primers(self):
        """Return (label, primer) pairs in the fixed order f1, r1, r2, r3."""
        return (("f1", self.f1), ("r1", self.r1), ("r2", self.r2), ("r3", self.r3))

    def get_all_cpg_counts(self):
        """Return CpG counts for each primer, keyed by f1/r1/r2/r3."""
        return {label: primer.get_cpg_count() for label, primer in self._named_primers()}

    def total_cpg(self):
        """Total CpG across all primers."""
        return sum(self.get_all_cpg_counts().values())

    def toDictionary(self):
        """Return each primer's dictionary under its label, plus "TotalCpG"."""
        combined = {label: primer.toDictionary() for label, primer in self._named_primers()}
        combined["TotalCpG"] = self.total_cpg()
        return combined

    def __repr__(self):
        """Return a one-line summary that starts with the actual class name."""
        # The original printed a stale "WholeBase(...)" label for this class.
        return (
            f"{type(self).__name__}(f1={self.f1}, r1={self.r1}, r2={self.r2}, "
            f"r3={self.r3}, totalCpG={self.total_cpg()})"
        )


## **TOOLS**

### **TESTCASE**

In [175]:
def create_big_test_case():
  """Return 610 sorted pseudo-random positions: four clusters plus background noise.

  Positions come from random.randint, so duplicates are possible and the
  result changes from call to call unless the random module is seeded.
  """
  # (lowest, highest, how_many) for each region, in the order the draws happen
  regions = (
    (100, 300, 50),      # cluster 1: dense, 100-300
    (1000, 1500, 120),   # cluster 2: dense, 1000-1500
    (5000, 6000, 40),    # cluster 3: scattered, 5000-6000
    (8000, 8050, 200),   # cluster 4: very dense, 8000-8050
    (0, 10000, 200),     # random noise across the whole range
  )
  draws = [
    random.randint(lowest, highest)
    for lowest, highest, how_many in regions
    for _ in range(how_many)
  ]
  draws.sort()
  return draws

### **TM CALCULATOR**

In [176]:
def calculateTM(primer):
  """Estimate a primer's melting temperature as 4*(G+C) + 2*(A+T).

  This is the simple "2-4" (Wallace) rule of thumb.

  Args:
    primer: sequence of single-letter bases (normally a str). Upper- and
      lower-case letters are counted alike.

  Returns:
    int: four per G or C plus two per A or T; 0 for an empty primer.

  Raises:
    KeyError: if a base is not one of A, C, G, T (in either case).
  """
  counter = {"A": 0, "C": 0, "G": 0, "T": 0}
  for base in primer:
    # Normalising the case lets lowercase sequences be counted instead of
    # failing with KeyError.
    counter[base.upper()] += 1
  # Kept from the original: the per-base tally is echoed for inspection.
  print(counter)
  strong_pairs = counter["G"] + counter["C"]
  weak_pairs = counter["A"] + counter["T"]
  return 4 * strong_pairs + 2 * weak_pairs

In [177]:
# TEST Calculate_Tm
def test_calculateTM():
  """Print the calculateTM result for one fixed 27-base sample primer."""
  sample_primer = "ACGTACGTACGTACGTAAAACCCGGTT"
  print(calculateTM(sample_primer))


test_calculateTM()

{'A': 8, 'C': 7, 'G': 6, 'T': 6}
80


### **PRUNING**

In [178]:
def find_Cpg(genome: str):
  """Return, in ascending order, every index i where genome[i:i+2] equals KEY_CPG.

  A sequence shorter than two characters yields an empty list.
  """
  return [idx for idx in range(len(genome) - 1) if genome[idx:idx + 2] == KEY_CPG]

In [179]:
def test_findCpg():
  """Print the find_Cpg result for five sample sequences, one per line."""
  samples = (
    "ATCGATCGCG",  # CG at 2, 6, 8
    "AAAAA",       # no CG
    "CGCGCG",      # back-to-back CGs at 0, 2, 4
    "C",           # too short, should give []
    "ATCCGGATC",   # CG at 3
  )
  for sequence in samples:
    print(find_Cpg(sequence))


test_findCpg()

[2, 6, 8]
[]
[0, 2, 4]
[]
[3]


In [180]:
def find_cpg_intervals(cpg_positions, max_span, min_threshold=CPG_THRESHOLD, start_point=PRIMER_LENGTH_LOWER_BOUND):
  """Slide a window over CpG positions and report every window that is dense enough.

  Args:
    cpg_positions: positions to scan. Assumed to be in ascending order, since
      old entries are only ever dropped from the front of the window.
    max_span: largest allowed difference between the current position and
      the oldest position still kept in the window.
    min_threshold: minimum number of positions the window must hold for an
      interval to be reported.
    start_point: positions less than or equal to this are never added to
      the window.

  Returns:
    list of dicts with keys "start", "end", "length" (end - start + 1) and
    "cpg_count". One dict is appended for each input position at which the
    window qualifies, so reported intervals overlap heavily.
  """
  found = []
  window = deque()

  for current in cpg_positions:
    if current > start_point:
      window.append(current)

    # Drop positions that lie more than max_span behind the current one.
    oldest_allowed = current - max_span
    while window and window[0] < oldest_allowed:
      window.popleft()

    hits = len(window)
    if hits < min_threshold:
      continue

    first = window[0]
    found.append({"start": first, "end": current, "length": (current - first + 1), "cpg_count": hits})

  return found


In [181]:
def test_find_cpg_intervals():
  """Print the intervals found in a small hand-written position list.

  The span limit is SHORT_UPPER_BOUND minus two PRIMER_LENGTH_LOWER_BOUND;
  the other find_cpg_intervals arguments keep their defaults.
  """
  sample_positions = [5, 20, 35, 50, 60, 62, 65, 120, 130, 135]
  span_limit = SHORT_UPPER_BOUND - (2 * PRIMER_LENGTH_LOWER_BOUND)
  print(find_cpg_intervals(sample_positions, span_limit))


test_find_cpg_intervals()

[{'start': 20, 'end': 62, 'length': 43, 'cpg_count': 5}, {'start': 20, 'end': 65, 'length': 46, 'cpg_count': 6}]


In [192]:
def find_nested_intervals(inner, outer):
  """Placeholder, not implemented yet: ignores both arguments and returns None."""
  # TODO (note addressed to "Vi"): this one still needs to be written as well.
  # Original note: once the body is filled in, "comment" -- presumably meaning
  # the placeholder statement below should be commented out or removed.
  return None

## Logic

### Input (leave as is for now)

In [183]:
# Input (work in progress): the FASTA-discovery loop below is disabled by
# wrapping it in a bare string literal, so none of it runs.
# NOTE(review): as written it would only build `file_path` for each ".fasta"
# file and never read it, and `directory_address` is still empty.
"""
directory_address = ""
for file in os.listdir(directory_address):
    if file.endswith(".fasta"):
        file_path = os.path.join(directory_address, file)
"""

'\ndirectory_address = ""\nfor file in os.listdir(directory_address):\n    if file.endswith(".fasta"):\n        file_path = os.path.join(directory_address, file)\n'

In [184]:
# Sequence to analyse; left empty for now.
genome = ""

### Process

In [185]:
# Synthetic positions stand in for real input for now; the real call would be:
# cpg_positions = find_Cpg(genome)
cpg_positions = create_big_test_case()

# One search per size class. Each span is the class's upper bound minus two
# minimum primer lengths (presumably to leave room for a primer at each end),
# and the CpG requirement scales with the class: 1x, 2x, 3x CPG_THRESHOLD.
short, medium, long = (
    find_cpg_intervals(
        cpg_positions,
        max_span=size_limit - (2 * PRIMER_LENGTH_LOWER_BOUND),
        min_threshold=multiplier * CPG_THRESHOLD,
    )
    for size_limit, multiplier in (
        (SHORT_UPPER_BOUND, 1),
        (MEDIUM_UPPER_BOUND, 2),
        (LONG_UPPER_BOUND, 3),
    )
)

In [186]:
# One DataFrame per interval list; each dict in a list becomes one row.
df_short, df_medium, df_long = map(pd.DataFrame, (short, medium, long))

#### Display Intervals

In [187]:
# Show the intervals found with the short-class settings.
df_short

Unnamed: 0,start,end,length,cpg_count
0,119,129,11,5
1,119,137,19,6
2,119,137,19,7
3,119,139,21,8
4,119,143,25,9
...,...,...,...,...
422,8000,8050,51,200
423,8000,8050,51,201
424,8030,8094,65,85
425,8444,8501,58,5


In [188]:
# Show the intervals found with the medium-class settings.
df_medium

Unnamed: 0,start,end,length,cpg_count
0,119,144,26,10
1,119,145,27,11
2,119,149,31,12
3,119,150,32,13
4,119,156,38,14
...,...,...,...,...
379,8000,8050,51,200
380,8000,8050,51,201
381,8000,8094,95,202
382,8015,8129,115,145


In [189]:
# Show the intervals found with the long-class settings.
df_long

Unnamed: 0,start,end,length,cpg_count
0,119,160,42,15
1,119,162,44,16
2,119,165,47,17
3,119,179,61,18
4,119,185,67,19
...,...,...,...,...
378,8000,8129,130,203
379,8000,8151,152,204
380,8000,8176,177,205
381,8020,8234,215,127
