Skip to content

Commit

Permalink
Pytorch 2.0 support compile mode (#1539)
Browse files Browse the repository at this point in the history
* adding patch to run pytorch with compile mode

* adding patch to run pytorch with compile mode

* small fix

* small fix

* small fix

* small fix

* fix comment
  • Loading branch information
Tulsishah authored and gargnitingoogle committed Jan 4, 2024
1 parent b70dbb4 commit 8a3b764
Show file tree
Hide file tree
Showing 2 changed files with 109 additions and 11 deletions.
100 changes: 100 additions & 0 deletions perfmetrics/scripts/ml_tests/pytorch/run_model.sh
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,106 @@ sed -i "$x"'r bypassed_code.py' $folder_file
# nproc_per_node - by downloading the model in single thread environment.
python -c 'import torch;torch.hub.list("facebookresearch/xcit:main")'

# (TulsiShah) TODO: Pytorch 2.0 compile mode has issues (https://github.com/pytorch/pytorch/issues/94599),
# which is fixed in pytorch version 2.1.0 (https://github.com/pytorch/pytorch/pull/100071).
# We'll remove this workaround once we update our Docker image to use Pytorch 2.1.0 or greater version.
if [ ${PYTORCH_VESRION} == "v2" ];
then
allowed_functions_file="/opt/conda/lib/python3.10/site-packages/torch/_dynamo/allowed_functions.py"
# Update the pytorch library code to bypass the kernel-cache
echo "Updating the pytorch library code to Disallow_in_graph distributed API.."
echo "
def _disallowed_function_ids():
remove = [
True,
False,
None,
collections.OrderedDict,
copy.copy,
copy.deepcopy,
inspect.signature,
math.__package__,
torch.__builtins__,
torch.autocast_decrement_nesting,
torch.autocast_increment_nesting,
torch.autograd.grad,
torch.clear_autocast_cache,
torch.cuda.current_device,
torch.cuda.amp.autocast_mode.autocast,
torch.cpu.amp.autocast_mode.autocast,
torch.distributions.constraints.is_dependent,
torch.distributions.normal.Normal,
torch.inference_mode,
torch.set_anomaly_enabled,
torch.set_autocast_cache_enabled,
torch.set_autocast_cpu_dtype,
torch.set_autocast_cpu_enabled,
torch.set_autocast_enabled,
torch.set_autocast_gpu_dtype,
torch.autograd.profiler.profile,
warnings.warn,
torch._C._dynamo.eval_frame.unsupported,
]
# extract all dtypes from torch
dtypes = [
obj for obj in torch.__dict__.values() if isinstance(obj, type(torch.float32))
]
remove += dtypes
storage = [
obj
for obj in torch.__dict__.values()
if isinstance(obj, type(torch.FloatStorage))
]
remove += storage
# Distributed APIs don't work well with torch.compile.
if torch.distributed.is_available():
remove.extend(
torch.distributed.distributed_c10d.dynamo_unsupported_distributed_c10d_ops
)
return {id(x) for x in remove}
" > disallowed_function.py

x=$(grep -n "def _disallowed_function_ids():" $allowed_functions_file | cut -f1 -d ':')
y=$(grep -n "def _allowed_function_ids():" $allowed_functions_file | cut -f1 -d ':')
y=$((y - 3))
lines="$x,$y"
sed -i "$lines"'d' $allowed_functions_file
sed -i "$x"'r disallowed_function.py' $allowed_functions_file

distributed_c10d_file="/opt/conda/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py"
echo "# This ops are not friendly to TorchDynamo. So, we decide to disallow these ops
# in FX graph, allowing them to run them on eager, with torch.compile.
dynamo_unsupported_distributed_c10d_ops = [
all_reduce_multigpu,
recv,
all_gather_object,
all_gather_coalesced,
all_to_all_single,
all_reduce,
gather_object,
all_to_all,
all_reduce_coalesced,
gather,
broadcast_object_list,
barrier,
reduce_multigpu,
scatter,
scatter_object_list,
reduce,
reduce_scatter_multigpu,
all_gather,
broadcast_multigpu,
all_gather_multigpu,
reduce_scatter,
all_gather_into_tensor,
broadcast,
reduce_scatter_tensor,
send,
]" >> $distributed_c10d_file
fi

ARTIFACTS_BUCKET_PATH="gs://gcsfuse-ml-tests-logs/ci_artifacts/pytorch/${PYTORCH_VESRION}/dino"
echo "Update status file"
echo "RUNNING" > status.txt
Expand Down
20 changes: 9 additions & 11 deletions perfmetrics/scripts/ml_tests/pytorch/v2/dino/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -23,18 +23,16 @@ RUN pip3 install timm
WORKDIR "/pytorch_dino/"

RUN git clone "https://github.com/facebookresearch/dino"
# (TulsiShah) TODO: The current docker image does not support the dino model with compile mode.
# We can unblock the below code whenever the docker image supports the same to run.

# WORKDIR "/pytorch_dino/dino"
# RUN echo '[remote "origin"]' >> .git/config
# RUN echo ' fetch = +refs/pull/262/head:refs/remotes/origin/pr/262' >> .git/config
#
# RUN git fetch origin
# RUN git diff origin/main origin/pr/262 > diff.patch
# RUN git apply diff.patch
#
# WORKDIR "/pytorch_dino/"
WORKDIR "/pytorch_dino/dino"
RUN echo '[remote "origin"]' >> .git/config
RUN echo ' fetch = +refs/pull/262/head:refs/remotes/origin/pr/262' >> .git/config

RUN git fetch origin
RUN git diff origin/main origin/pr/262 > diff.patch
RUN git apply diff.patch

WORKDIR "/pytorch_dino/"

COPY perfmetrics/scripts/ml_tests/pytorch/run_model.sh ./

Expand Down

0 comments on commit 8a3b764

Please sign in to comment.