Skip to content

Commit

Permalink
Upgrade tensorflow model dependencies (#1510)
Browse files Browse the repository at this point in the history
* update docker image and tf-models-official version

* change path for train and controller files

* downgrade tf to version 2.12

* try with tf version 2.13
  • Loading branch information
ashmeenkaur authored and gargnitingoogle committed Nov 29, 2023
1 parent a60dc34 commit 047620a
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ cd "$HOME/github/gcsfuse/"
mkdir container_artifacts && mkdir container_artifacts/logs && mkdir container_artifacts/output

echo "Building tf DLC docker image containing all tensorflow libraries..."
sudo docker build . -f perfmetrics/scripts/ml_tests/tf/resnet/Dockerfile -t tf-dlc-gcsfuse --build-arg DLC_IMAGE_NAME=tf-gpu.2-10
sudo docker build . -f perfmetrics/scripts/ml_tests/tf/resnet/Dockerfile -t tf-dlc-gcsfuse --build-arg DLC_IMAGE_NAME=tf-gpu.2-13

echo "Running the docker image build in the previous step..."
sudo docker run --gpus all --name tf_model_container --privileged -d \
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/bin/bash

# Installs go1.19 on the container, builds gcsfuse using log_rotation file
# and installs tf-models-official v2.10.0, makes update to include clear_kernel_cache
# Installs go1.21 on the container, builds gcsfuse using log_rotation file,
# installs tf-models-official v2.13.2, updates it to include clear_kernel_cache
# and epochs functionality, and runs the model

# Install go lang
Expand All @@ -20,12 +20,12 @@ echo "Mounting the bucket"
nohup gcsfuse/gcsfuse --foreground --implicit-dirs --debug_fuse --debug_gcs --max-conns-per-host 100 --log-format "text" --log-file /home/logs/gcsfuse.log --stackdriver-export-interval 60s ml-models-data-gcsfuse myBucket > /home/output/gcsfuse.out 2> /home/output/gcsfuse.err &

# Install tensorflow model garden library
pip3 install --user tf-models-official==2.10.0
pip3 install --user tf-models-official==2.13.2

echo "Updating the tensorflow library code to bypass the kernel-cache..."
# Fail building the container image if train_lib.py and controller.py are not at their expected locations.
if [ -f "/root/.local/lib/python3.7/site-packages/official/core/train_lib.py" ]; then echo "file exists"; else echo "train_lib.py file not present in expected location. Please correct the location. Exiting"; exit 1; fi
if [ -f "/root/.local/lib/python3.7/site-packages/orbit/controller.py" ]; then echo "file exists"; else echo "controller.py file not present in expected location. Please correct the location. Exiting"; exit 1; fi
if [ -f "/root/.local/lib/python3.10/site-packages/official/core/train_lib.py" ]; then echo "file exists"; else echo "train_lib.py file not present in expected location. Please correct the location. Exiting"; exit 1; fi
if [ -f "/root/.local/lib/python3.10/site-packages/orbit/controller.py" ]; then echo "file exists"; else echo "controller.py file not present in expected location. Please correct the location. Exiting"; exit 1; fi

# Adding cache clearing functionality and epochs in controller.py
echo "
Expand Down Expand Up @@ -63,7 +63,7 @@ echo "
self._maybe_save_checkpoint(check_interval=False)
" > bypassed_code.py

controller_file="/root/.local/lib/python3.7/site-packages/orbit/controller.py"
controller_file="/root/.local/lib/python3.10/site-packages/orbit/controller.py"
x=$(grep -n "def train(self, steps: int, checkpoint_at_completion: bool = True):" $controller_file | cut -f1 -d ':')
y=$(grep -n "def evaluate(self, steps: int = -1)" $controller_file | cut -f1 -d ':')
y=$((y - 2))
Expand Down Expand Up @@ -139,7 +139,7 @@ def run_experiment(
return runner.run(epochs=epochs, clear_kernel_cache=clear_kernel_cache)
" > bypassed_code.py

train_lib_file="/root/.local/lib/python3.7/site-packages/official/core/train_lib.py"
train_lib_file="/root/.local/lib/python3.10/site-packages/official/core/train_lib.py"
x=$(grep -n "def run_experiment(" $train_lib_file | cut -f1 -d ':')
y=$(grep -n "return runner.run()" $train_lib_file | cut -f1 -d ':')
lines="$x,$y"
Expand Down

0 comments on commit 047620a

Please sign in to comment.