Increased memory usage #894

Closed
sk1p opened this issue Nov 5, 2020 · 2 comments · Fixed by #1119

Comments


sk1p commented Nov 5, 2020

Compared to 0.5, we are using a bit more memory (see the discussion in #814). We need to track memory usage and see if we can fix this easily. I suspect it may be a bit harder to fix, as we can't easily keep buffers alive between processing partitions, for example.
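
As a starting point for tracking memory, a minimal sketch that reports the RSS of every worker process (assuming psutil is installed and a DaskJobExecutor that exposes the dask.distributed client as ctx.executor.client, as in the scripts below):

import os

import psutil  # assumption: available in the environment, not pulled in by LiberTEM itself


def get_rss():
    # runs inside a worker process; report its resident set size in MiB
    return os.getpid(), psutil.Process().memory_info().rss / 2**20


# with a DaskJobExecutor, run it on every worker via the distributed client:
# print(ctx.executor.client.run(get_rss))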


sk1p commented Sep 28, 2021

First step for reproducing the problem:

"""
This script has very predictable performance and memory usage, around 1.4GiB RSS on my system,
and does not reproduce the problem.
"""

import time
import os

from libertem.api import Context


def main():
    print(f"running as pid {os.getpid()}")
    ctx = Context()
    ds = ctx.load("k2is", path="/home/alex/Data/K2IS/Capture52/Capture52_.gtg")
    while True:
        t0 = time.time()
        pick_analysis = ctx.create_pick_analysis(dataset=ds, x=7, y=16)
        ctx.run(pick_analysis)
        t1 = time.time()
        print(f"pick took {t1-t0:.2f}s")


if __name__ == "__main__":
    main()

Performing the same picking action in the GUI does show the problem, however, which points at a problem somewhere in the web API. With the GUI, pick-scrubbing increases the memory usage of the worker processes.

The main process also grows slightly, which may be related to keeping results around for longer than needed (I'm aware of that issue and I think I know how to fix it).
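
As an aside, a self-contained illustration of how a result can stay alive longer than needed once it is reachable from a cache (functools.lru_cache is just a stand-in here for whatever actually holds the reference):

import functools

import numpy as np


@functools.lru_cache(maxsize=32)
def cached_result(idx):
    # each entry stays reachable through the cache until it is evicted
    return np.zeros((1024, 1024), dtype=np.float32)  # ~4 MiB per entry


for i in range(32):
    cached_result(i)

print(cached_result.cache_info())  # 32 cached entries, roughly 128 MiB kept alive
cached_result.cache_clear()        # drop the references so the memory can be freed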

sk1p added a commit to sk1p/LiberTEM that referenced this issue Sep 30, 2021

sk1p commented Sep 30, 2021

For reference, here is a script that I used for debugging this:

import time
import os
import gc
import mmap

from pympler import muppy, summary
import numpy as np

from libertem.api import Context
from libertem.executor.inline import InlineJobExecutor
from libertem.executor.dask import DaskJobExecutor, cluster_spec
from libertem.udf.base import UDF, UDFTask
from libertem.udf.raw import PickUDF


def dump_udf_objgraph():
    # runs on a worker: collect all UDFTask instances that are still alive there
    objs = [
        o
        for o in muppy.get_objects()
        if issubclass(type(o), UDFTask)
    ]

    print(f"objs: {len(objs)}")
    import objgraph

    chain = objgraph.find_backref_chain(objs[-1], objgraph.is_proper_module)

    print([
        type(o)
        for o in chain
    ])
    # chain[2] is apparently an lru_cache-wrapped function here,
    # hence cache_info() and __wrapped__:
    print(chain[2])
    print(chain[2].cache_info())
    print(chain[2].__wrapped__)
    objgraph.show_chain(
        chain,
        filename='/tmp/refs.png'
    )


def run_cpu_worker(ctx, fn):
    # submit fn so that it executes on one of the CPU workers of the local cluster
    fut = ctx.executor.client.submit(fn, resources={
        "CPU": 1, 'compute': 1, 'ndarray': 1
    })
    return fut.result()


def main():
    print(f"running as pid {os.getpid()}")
    if True:
        ctx = Context(
            executor=DaskJobExecutor.make_local(
                spec=cluster_spec(**{
                    "cpus": [0],
                    "has_cupy": False,
                    "cudas": []
                })
            )
        )
    else:
        ctx = Context(executor=InlineJobExecutor())
    ds = ctx.load("k2is", path="/home/alex/Data/K2IS/Capture52/Capture52_.gtg")

    for i in range(102):
        pick_udf = PickUDF()
        roi = np.zeros(ds.shape.nav, dtype=bool)
        roi.reshape((-1,))[i] = True
        ctx.run_udf(dataset=ds, udf=pick_udf, roi=roi)
    run_cpu_worker(ctx, dump_udf_objgraph)


if __name__ == "__main__":
    main()
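
The key part is objgraph.find_backref_chain, which walks from the leaked object back to a module-level referrer. A minimal, self-contained sketch of the same pattern (show_chain additionally needs graphviz to render the image):

import objgraph


class Task:
    pass


leaked = [Task()]  # simulate an object pinned by a module-level container

# walk from the object back to the nearest module that (indirectly) refers to it
chain = objgraph.find_backref_chain(leaked[-1], objgraph.is_proper_module)
print([type(o) for o in chain])  # e.g. module -> dict -> list -> Task
objgraph.show_chain(chain, filename='/tmp/chain.png')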

And here is another one that loops indefinitely:

import time
import os
import gc
import mmap

import numpy as np
from pympler import muppy, summary

from libertem.api import Context
from libertem.executor.inline import InlineJobExecutor
from libertem.executor.dask import DaskJobExecutor, cluster_spec
from libertem.udf.raw import PickUDF


def profile_memory():
    # runs on a worker: summarize all mmap objects that are still alive in that process
    objects = muppy.get_objects()
    objects = muppy.filter(objects, Type=mmap.mmap)
    summ = summary.summarize(objects)
    return os.getpid(), summ, len(objects)


def get_profile(ctx, old_profile=None):
    fut = ctx.executor.client.submit(profile_memory, resources={
        "CPU": 1, 'compute': 1, 'ndarray': 1
    })
    pid, summ, ll = fut.result()
    summary.print_(summ, sort='#')
    print(ll)
    return summ


def main():
    print(f"running as pid {os.getpid()}")
    if True:
        ctx = Context(
            executor=DaskJobExecutor.make_local(
                spec=cluster_spec(**{
                    "cpus": [0],
                    "has_cupy": False,
                    "cudas": []
                })
            )
        )
    else:
        ctx = Context(executor=InlineJobExecutor())
    ds = ctx.load("k2is", path="/home/alex/Data/K2IS/Capture52/Capture52_.gtg")
    counter = 0

    old_summ = get_profile(ctx)

    while True:
        counter += 1
        for y in range(ds.shape.nav[0]):
            for x in range(ds.shape.nav[1]):
                t0 = time.time()
                pick_udf = PickUDF()
                roi = np.zeros(ds.shape.nav, dtype=bool)
                roi[y, x] = True
                ctx.run_udf(dataset=ds, udf=pick_udf, roi=roi)
                t1 = time.time()
                # ctx.executor.client.run(gc.collect)
                print(f"round {counter} pick took {t1-t0:.2f}s")
                # summ = get_profile(ctx, old_summ)
                # old_summ = summ


if __name__ == "__main__":
    main()
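
The old_profile argument of get_profile is currently unused; the idea would be to diff successive summaries to see which object types grow. A minimal sketch of that with pympler:

from pympler import muppy, summary

before = summary.summarize(muppy.get_objects())
# ... run the workload that is suspected to leak ...
after = summary.summarize(muppy.get_objects())

# print only what changed between the two snapshots
summary.print_(summary.get_diff(before, after), sort='#')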

Both need pympler and objgraph installed.

sk1p closed this as completed Sep 30, 2021
sk1p reopened this Sep 30, 2021
uellue pushed a commit that referenced this issue Sep 30, 2021