# **Serial Performance Baseline and Profiling**

### **Section 1: Section A — Vector Operations & Dot Product (CPU-bound, loop overhead)**

In [1]:
import time,random,cProfile,pstats,io,tracemalloc

def gen_data(n, seed=42):
    """Build two reproducible lists of ``n`` random floats each.

    The module-level ``random`` generator is reseeded with ``seed`` before
    drawing, so equal arguments always produce equal data. Every element of
    the first list is drawn before any element of the second.

    Returns:
        A tuple ``(x, y)`` of two lists of length ``n``.
    """
    random.seed(seed)
    vectors = []
    for _ in range(2):
        # One full vector per pass keeps the draw order: all of x, then y.
        vectors.append([random.random() for _ in range(n)])
    first, second = vectors
    return first, second

def vector_add(x, y):
    """Return the element-wise sum of ``x`` and ``y`` as a new list.

    Elements are paired with ``zip``, so the result is as long as the
    shorter input. Written as a plain append loop, in line with this
    section's focus on loop overhead.
    """
    sums = []
    for left, right in zip(x, y):
        sums.append(left + right)
    return sums

def dot_product(x, y):
    """Return the dot product of ``x`` and ``y`` as a float.

    Products are accumulated left to right starting from ``0.0``, so empty
    input gives ``0.0``. Elements are paired with ``zip``; anything beyond
    the length of the shorter input is ignored.
    """
    total = 0.0
    for left, right in zip(x, y):
        total += left * right
    return total

def main():
    """Time and profile the dot-product and vector-add kernels.

    Generates two vectors of length ``N``, runs ``dot_product`` and
    ``vector_add`` while cProfile and tracemalloc are active, then prints
    each kernel's wall-clock time, the traced current/peak memory and the
    top 15 cProfile rows sorted by cumulative time.

    Note: the timers run inside the profiled region, so the printed times
    include the instrumentation overhead of the profiler and tracemalloc.
    """
    N = 200_000
    x, y = gen_data(N)  # input generation stays outside the measured region

    tracemalloc.start()
    profiler = cProfile.Profile()
    profiler.enable()
    try:
        dot_start = time.perf_counter()
        s = dot_product(x, y)
        dot_elapsed = time.perf_counter() - dot_start

        add_start = time.perf_counter()
        z = vector_add(x, y)
        add_elapsed = time.perf_counter() - add_start
    finally:
        # Always switch the instrumentation off, even if a kernel raises or
        # the run is interrupted, so it does not stay active for later code.
        profiler.disable()
        current, peak = tracemalloc.get_traced_memory()
        tracemalloc.stop()

    print(f"Vector length N={N}")
    print(f"Dot product:{s:.6f}|Time:{dot_elapsed:.3f} s")
    print(f"Vector add:len(z)={len(z)}|Time:{add_elapsed:.3f} s")
    # Both values are printed, so the label names both (as sections 4/5 do).
    print(f"Current/Peak memory:{current/1e6:.2f} MB/{peak/1e6:.2f} MB")

    report = io.StringIO()
    stats = pstats.Stats(profiler, stream=report).sort_stats("cumtime")
    stats.print_stats(15)
    print("\n---cProfile (Top 15 by cumulative time)---")
    print(report.getvalue())


if __name__ == "__main__":
    main()


Vector length N=200000
Dot product:50114.042270|Time:0.118 s
Vector add:len(z)=200000|Time:1.066 s
Peak memory:6.43 MB/6.43 MB

---cProfile (Top 15 by cumulative time)---
         200009 function calls (200008 primitive calls) in 1.184 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
      2/1    0.712    0.356    0.958    0.958 /tmp/ipython-input-3727322166.py:9(vector_add)
   200000    0.353    0.000    0.353    0.000 {method 'append' of 'list' objects}
        1    0.118    0.118    0.118    0.118 /tmp/ipython-input-3727322166.py:15(dot_product)
        1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}
        4    0.000    0.000    0.000    0.000 {built-in method time.perf_counter}
        1    0.000    0.000    0.000    0.000 {built-in method posix.getppid}





### **Section 2: Naïve Matrix Multiplication (O(n³) compute)**

In [None]:
import time,random,cProfile,pstats,io,tracemalloc

def gen_matrix(n, seed=123):
    """Build a reproducible ``n`` x ``n`` matrix of random floats.

    The module-level ``random`` generator is reseeded with ``seed`` first.
    Values are drawn row by row, left to right.

    Returns:
        A list of ``n`` independent row lists, each of length ``n``.
    """
    random.seed(seed)
    matrix = []
    for _ in range(n):
        row = [random.random() for _ in range(n)]
        matrix.append(row)
    return matrix

def matmul_naive(A, B):
    """Multiply two dense list-of-lists matrices with a plain triple loop.

    ``A`` is ``n x m`` and ``B`` is ``m x p``; the result is a new
    ``n x p`` matrix whose entries start at ``0.0``. Square inputs of equal
    size give exactly the same result as before; rectangular inputs, which
    were previously computed with the wrong loop bounds, are now handled.

    The loop order is i-k-j with ``A[i][k]`` read once per (i, k) pair; the
    innermost statement is intentionally left as plain indexed arithmetic.

    Args:
        A: left operand, a list of rows.
        B: right operand, a list of rows.

    Returns:
        The product as a list of ``n`` rows of ``p`` floats.

    Raises:
        ValueError: if a row of ``A`` does not have ``len(B)`` entries, or
            the rows of ``B`` differ in length.
    """
    rows = len(A)
    inner = len(B)
    cols = len(B[0]) if B else 0

    # Reject mismatched shapes instead of silently using wrong loop bounds.
    if any(len(row) != inner for row in A):
        raise ValueError(
            f"every row of A must have {inner} entries to match len(B)"
        )
    if any(len(row) != cols for row in B):
        raise ValueError("all rows of B must have the same length")

    C = [[0.0] * cols for _ in range(rows)]
    for i in range(rows):
        for k in range(inner):
            aik = A[i][k]  # invariant across the innermost loop
            for j in range(cols):
                C[i][j] += aik * B[k][j]
    return C

def main():
    """Time and profile one naive matrix multiplication.

    Builds two ``n`` x ``n`` matrices, runs ``matmul_naive`` while cProfile
    and tracemalloc are active, then prints the wall-clock time, one sample
    entry of the product, the traced current/peak memory and the top 10
    cProfile rows sorted by cumulative time.

    Note: the timer runs inside the profiled region, so the printed time
    includes the instrumentation overhead of the profiler and tracemalloc.
    """
    n = 150
    A = gen_matrix(n, seed=1)
    B = gen_matrix(n, seed=2)

    tracemalloc.start()
    profiler = cProfile.Profile()
    profiler.enable()
    try:
        started = time.perf_counter()
        C = matmul_naive(A, B)
        elapsed = time.perf_counter() - started
    finally:
        # Always switch the instrumentation off, even if the kernel raises
        # or the run is interrupted, so it does not stay active afterwards.
        profiler.disable()
        current, peak = tracemalloc.get_traced_memory()
        tracemalloc.stop()

    print(f"Matrix size n={n} -> {n} x {n}")
    print(f"Time:{elapsed:.3f} s | C[0][0]={C[0][0]:.6f}")
    # Both values are printed, so the label names both (as sections 4/5 do).
    print(f"Current/Peak memory:{current/1e6:.2f} MB / {peak/1e6:.2f} MB")

    report = io.StringIO()
    stats = pstats.Stats(profiler, stream=report).sort_stats("cumtime")
    stats.print_stats(10)
    print("\n--- cProfile (Top 10 by cumulative time) ---")
    print(report.getvalue())


if __name__ == "__main__":
    main()


Matrix size n=150 -> 150 x 150
Time:1.993 s | C[0][0]=37.256203
Peak memory:0.74 MB / 0.74 MB

--- cProfile (Top 10 by cumulative time) ---
         96 function calls (91 primitive calls) in 1.984 seconds

   Ordered by: cumulative time
   List reduced from 42 to 10 due to restriction <10>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    1.006    1.006    1.006    1.006 {built-in method time.sleep}
        1    0.221    0.221    0.221    0.221 /tmp/ipython-input-2250354953.py:7(matmul_naive)
        2    0.005    0.003    0.005    0.003 /usr/local/lib/python3.12/dist-packages/zmq/sugar/socket.py:632(send)
        1    0.000    0.000    0.000    0.000 /usr/local/lib/python3.12/dist-packages/zmq/eventloop/zmqstream.py:654(_rebuild_io_state)
        1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}
    24/22    0.000    0.000    0.000    0.000 {built-in method builtins.isinstance}
        2    0.000    0.000    0.00

### **Section 3: 2D Convolution (Stencil/Blur) on a Grid**

In [None]:
# section3_convolution.py
import time, cProfile, pstats, io, tracemalloc

def blur_5x5(img, h, w):
    """Apply a 5x5 box blur to the interior of an ``h`` x ``w`` image.

    Every pixel that lies at least two cells away from each edge becomes
    the mean of the 25 input values in the 5x5 window centred on it. The
    two-pixel border of the result is left at ``0.0``.

    Args:
        img: input image as a list of rows, indexed ``img[row][col]``.
        h: number of rows to process.
        w: number of columns to process.

    Returns:
        A new ``h`` x ``w`` list of lists; ``img`` is not modified.
    """
    radius = 2  # half-width of the 5x5 window
    blurred = [[0.0] * w for _ in range(h)]
    row_stop = h - radius
    col_stop = w - radius

    for row in range(radius, row_stop):
        for col in range(radius, col_stop):
            window_sum = 0.0
            for d_row in range(-radius, radius + 1):
                for d_col in range(-radius, radius + 1):
                    window_sum += img[row + d_row][col + d_col]
            blurred[row][col] = window_sum / 25.0
    return blurred

def main():
    """Time and profile one 5x5 blur over a synthetic 512x512 image.

    Builds the test image, runs ``blur_5x5`` while cProfile and tracemalloc
    are active, then prints the elapsed wall-clock time, the traced peak
    memory and the top 10 cProfile rows sorted by cumulative time.

    Note: the timer runs inside the profiled region, so the printed time
    includes the instrumentation overhead of the profiler and tracemalloc.
    """
    h, w = 512, 512
    img = [[float((i * j) % 255) for j in range(w)] for i in range(h)]

    tracemalloc.start()
    profiler = cProfile.Profile()
    profiler.enable()
    try:
        started = time.perf_counter()
        out = blur_5x5(img, h, w)  # result is kept alive but not inspected
        elapsed = time.perf_counter() - started
    finally:
        # Always switch the instrumentation off, even if the kernel raises
        # or the run is interrupted, so it does not stay active afterwards.
        profiler.disable()
        _, peak = tracemalloc.get_traced_memory()  # only the peak is reported
        tracemalloc.stop()

    print(f"Elapsed time: {elapsed:.4f} seconds")
    print(f"Memory peak: {peak/1e6:.2f}MB")

    report = io.StringIO()
    pstats.Stats(profiler, stream=report).sort_stats("cumulative").print_stats(10)
    print(report.getvalue())


if __name__ == "__main__":
    main()

### **Section 4: Monte Carlo π Estimation (Random + Reduction)**

In [None]:
import time,random,cProfile,pstats,io,tracemalloc

def monte_carlo_pi(n, seed=42):
    """Estimate pi by sampling ``n`` random points in the unit square.

    The module-level ``random`` generator is reseeded with ``seed``, then
    ``n`` points ``(x, y)`` are drawn (x first, then y). The estimate is
    four times the fraction of points with ``x*x + y*y <= 1.0``.

    Args:
        n: number of sample points; must be positive.
        seed: seed for the module-level random generator.

    Returns:
        The estimate as a float.

    Raises:
        ValueError: if ``n`` is not positive. Previously ``n == 0`` failed
            with a bare ZeroDivisionError and a negative ``n`` returned a
            meaningless ``-0.0``.
    """
    if n <= 0:
        raise ValueError(f"n must be a positive integer, got {n!r}")

    random.seed(seed)
    inside = 0
    for _ in range(n):
        x = random.random()
        y = random.random()
        if x * x + y * y <= 1.0:
            inside += 1
    return 4.0 * inside / n

def main():
    """Time and profile one Monte Carlo estimate of pi.

    Runs ``monte_carlo_pi`` with ``N`` samples while cProfile and
    tracemalloc are active, then prints the estimate, the wall-clock time,
    the traced current/peak memory and the top 10 cProfile rows sorted by
    cumulative time.

    Note: the timer runs inside the profiled region, so the printed time
    includes the instrumentation overhead of the profiler and tracemalloc.
    """
    N = 5_000_000

    tracemalloc.start()
    profiler = cProfile.Profile()
    profiler.enable()
    try:
        started = time.perf_counter()
        pi_est = monte_carlo_pi(N)
        elapsed = time.perf_counter() - started
    finally:
        # Always switch the instrumentation off, even if the kernel raises
        # or the long run is interrupted, so it does not stay active
        # afterwards.
        profiler.disable()
        current, peak = tracemalloc.get_traced_memory()
        tracemalloc.stop()

    print(f"Iterations N={N}")
    print(f"Estimated Pi={pi_est:.6f}")
    print(f"Time:{elapsed:.3f} s")
    print(f"Current/Peak memory:{current/1e6:.2f} MB/{peak/1e6:.2f} MB")

    report = io.StringIO()
    stats = pstats.Stats(profiler, stream=report).sort_stats("cumtime")
    stats.print_stats(10)
    print("\n---cProfile (Top 10 by cumulative time)---")
    print(report.getvalue())


if __name__ == "__main__":
    main()


Iterations N=5000000
Estimated Pi=3.142475
Time:15.601 s
Current/Peak memory:0.01 MB/0.01 MB

---cProfile (Top 10 by cumulative time)---
         10000040 function calls (10000039 primitive calls) in 15.601 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
       15   12.767    0.851   15.088    1.006 {built-in method time.sleep}
 10000000    2.401    0.000    2.401    0.000 {method 'random' of '_random.Random' objects}
      2/1    0.432    0.216    0.102    0.102 /tmp/ipython-input-3211036687.py:3(monte_carlo_pi)
        1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}
        1    0.000    0.000    0.000    0.000 /usr/lib/python3.12/random.py:135(seed)
       16    0.000    0.000    0.000    0.000 {built-in method posix.getppid}
        1    0.000    0.000    0.000    0.000 {function Random.seed at 0x79cc2e062a20}
        2    0.000    0.000    0.000    0.000 {built-in method builtin

### **Section 5: Pairwise Interactions (N² Kernel, “Mini” N-Body)**

In [None]:
import time,random,cProfile,pstats,io,tracemalloc

def gen_particles(n, seed=42):
    """Build a reproducible list of ``n`` random 2-D positions.

    The module-level ``random`` generator is reseeded with ``seed`` first.
    For each particle the first coordinate is drawn before the second.

    Returns:
        A list of ``n`` ``(x, y)`` tuples of floats.
    """
    random.seed(seed)
    particles = []
    for _ in range(n):
        px = random.random()
        py = random.random()
        particles.append((px, py))
    return particles

def pairwise_kernel(pos):
    """Sum an inverse-square term over all other particles, per particle.

    For each position ``i`` the result holds the sum over every ``j != i``
    of ``1.0 / (dx*dx + dy*dy + 1e-12)``, where ``dx`` and ``dy`` are the
    coordinate differences. The ``1e-12`` term keeps the denominator
    non-zero when two positions coincide.

    Args:
        pos: sequence of ``(x, y)`` pairs.

    Returns:
        A list of floats, one per entry of ``pos``.
    """
    count = len(pos)
    totals = [0.0] * count
    for i, (xi, yi) in enumerate(pos):
        running = 0.0
        for j in range(count):
            if j == i:
                continue  # a particle does not interact with itself
            xj, yj = pos[j]
            dx = xi - xj
            dy = yi - yj
            dist_sq = dx * dx + dy * dy + 1e-12
            running += 1.0 / dist_sq
        totals[i] = running
    return totals

def main():
    """Time and profile one pass of the pairwise interaction kernel.

    Generates ``N`` particle positions, runs ``pairwise_kernel`` while
    cProfile and tracemalloc are active, then prints a sample result, the
    wall-clock time, the traced current/peak memory and the top 10 cProfile
    rows sorted by cumulative time.

    Note: the timer runs inside the profiled region, so the printed time
    includes the instrumentation overhead of the profiler and tracemalloc.
    """
    N = 1200
    pos = gen_particles(N)  # input generation stays outside the measurement

    tracemalloc.start()
    profiler = cProfile.Profile()
    profiler.enable()
    try:
        started = time.perf_counter()
        acc = pairwise_kernel(pos)
        elapsed = time.perf_counter() - started
    finally:
        # Always switch the instrumentation off, even if the kernel raises
        # or the run is interrupted, so it does not stay active afterwards.
        profiler.disable()
        current, peak = tracemalloc.get_traced_memory()
        tracemalloc.stop()

    print(f"Particles N={N}")
    print(f"Sample acc[0]={acc[0]:.6f}")
    print(f"Time:{elapsed:.3f} s")
    print(f"Current/Peak memory:{current/1e6:.2f} MB/{peak/1e6:.2f} MB")

    report = io.StringIO()
    stats = pstats.Stats(profiler, stream=report).sort_stats("cumtime")
    stats.print_stats(10)
    print("\n---cProfile (Top 10 by cumulative time)---")
    print(report.getvalue())


if __name__ == "__main__":
    main()


Particles N=1200
Sample acc[0]=118330.697148
Time:4.962 s
Current/Peak memory:0.06 MB/0.06 MB

---cProfile (Top 10 by cumulative time)---
         285 function calls (281 primitive calls) in 4.955 seconds

   Ordered by: cumulative time
   List reduced from 68 to 10 due to restriction <10>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        2    0.000    0.000    4.949    2.475 /usr/lib/python3.12/asyncio/base_events.py:1922(_run_once)
        2    0.000    0.000    4.949    2.474 /usr/lib/python3.12/selectors.py:451(select)
        2    0.408    0.204    4.949    2.474 {method 'poll' of 'select.epoll' objects}
        4    4.021    1.005    4.021    1.005 {built-in method time.sleep}
        1    0.520    0.520    0.520    0.520 /tmp/ipython-input-1081642626.py:8(pairwise_kernel)
        1    0.000    0.000    0.006    0.006 /usr/local/lib/python3.12/dist-packages/ipykernel/iostream.py:219(<lambda>)
        1    0.000    0.000    0.006    0.006 /usr/local/