-
Notifications
You must be signed in to change notification settings - Fork 3.3k
210 lines (181 loc) · 7.02 KB
/
ci_test-mnodes.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
name: Multi Nodes GPU Tests

# Workflow Steps:
# 1. Checkout Pytorch Lightning
# 2. Set up Python
# 3. Configure AWS Credentials
# 4. Install AWS Client
# 5. Get Current Sha Commit
# 6. Create Job Name
# 7. Update Test Configuration File
# 8. Install EKSClient
# 9. Create Gpu Node Pool
# 10. Check Current Node Pool | Current Elastic Pods
# 11. Apply Elastic
# 12. Wait 5 sec
# 13. Find ETCD TCP Address
# 14. Update Test Configuration File
# 15. Apply Multi Node Testing
# 16. Wait 400 secs
# 17. Listen to Jobs Logging
# 18. Statistics
# 19. Upload coverage results
# 20. Upload coverage to Codecov
# 21. Delete Group Node

on:
  push:
    branches:
      # Placeholder branch name so the workflow never triggers automatically;
      # re-enable the pull_request trigger below to run it on merged PRs.
      - never-ever-run-
  # pull_request:
  #   types: [closed]

env:
  AWS_CLUSTER: pl-lightning-torchelastic
  NODE_TYPE: g4dn.xlarge
  NODES: 2
  NUM_GPUS: 1
  REGION: us-east-2
  MAX_CHECKS: 300
  # Seconds to sleep between log-polling checks (fixed typo: was CHECK_SPEEP).
  CHECK_SLEEP: 2

jobs:
  multi-nodes-gpu-testing:
    runs-on: ubuntu-20.04
    strategy:
      fail-fast: false
      matrix:
        # Versions are quoted so YAML does not parse them as floats
        # (e.g. an unquoted 3.10 would become 3.1).
        python-version: ["3.7"]
        pytorch-version: ["1.6"]
    # Timeout: https://stackoverflow.com/a/59076067/4521646
    timeout-minutes: 50
    # runs only when merged happened.
    # if: github.event.pull_request.merged == true
    steps:
      - name: Checkout Pytorch Lightning
        uses: actions/checkout@v2
        with:
          repository: PyTorchLightning/pytorch-lightning
          ref: ${{ github.event.base_ref }}

      - name: Set up Python
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}

      # Produce a week-number token so the pip cache key rolls over weekly.
      # NOTE(review): `::set-output` is deprecated on current runners; migrate
      # to $GITHUB_OUTPUT when these action versions are bumped.
      - name: Weekly reset caching
        run: echo "::set-output name=period::$(python -c 'import time ; days = time.time() / 60 / 60 / 24 ; print(int(days / 7))' 2>&1)"
        id: times

      # Note: This uses an internal pip API and may not always work
      # https://github.com/actions/cache/blob/master/examples.md#multiple-oss-in-a-workflow
      - name: Cache pip
        uses: actions/cache@v2
        with:
          path: ~/.cache/pip
          key: ${{ runner.os }}-pip-td${{ steps.times.outputs.period }}-multi-node
          restore-keys: |
            ${{ runner.os }}-pip-td${{ steps.times.outputs.period }}-

      - name: Install dependencies
        run: |
          pip install awscli coverage

      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v1
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_KEY_ID }}
          # Keep in sync with the REGION env var above.
          aws-region: us-east-2

      - name: Get Current Sha Commit
        id: vars
        shell: bash
        run: |
          echo "::set-output name=SHA::$(git rev-parse --short HEAD)"
          echo $PWD

      # Build kubernetes-safe identifiers (dots replaced with dashes) from
      # the commit sha and the matrix versions.
      - name: Create Job Name
        id: job
        shell: bash
        run: |
          echo "::set-output name=ID::$(echo '${{ steps.vars.outputs.SHA }}-${{ matrix.python-version }}-${{ matrix.pytorch-version }}' | tr . - )"
          echo "::set-output name=ID_NAME::$(echo 's-${{ steps.vars.outputs.SHA }}-${{ matrix.python-version }}-${{ matrix.pytorch-version }}-e' | tr . - )"

      - name: Install EKSClient
        run: |
          curl --silent --location "https://github.com/weaveworks/eksctl/releases/latest/download/eksctl_$(uname -s)_amd64.tar.gz" | tar xz -C /tmp
          sudo mv /tmp/eksctl /usr/local/bin
        shell: bash

      - name: Create Gpu Node Pool
        run: |
          aws eks --region $REGION update-kubeconfig --name $AWS_CLUSTER
          eksctl create nodegroup --name=${{ steps.job.outputs.ID }} --cluster=$AWS_CLUSTER --node-type=$NODE_TYPE --nodes=$NODES
          # eksctl create nodegroup --name=${{ steps.job.outputs.ID }} --cluster=$AWS_CLUSTER --managed --spot --node-type=$NODE_TYPE --nodes=$NODES
        shell: bash

      - name: Check Current Node Pool | Current Elastic Pods
        run: |
          eksctl get nodegroups --cluster $AWS_CLUSTER
          kubectl get pods -n elastic-job

      - name: Apply Elastic
        run: |
          git clone https://github.com/pytorch/elastic.git
          cd elastic/kubernetes
          kubectl apply -k config/default
          kubectl apply -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/master/nvidia-device-plugin.yml
          kubectl apply -f https://raw.githubusercontent.com/pytorch/elastic/master/kubernetes/config/samples/etcd.yaml

      - name: Wait
        # todo: this shall be dynamic
        if: always()
        shell: bash
        run: |
          sleep 5

      - name: Find ETCD TCP Address
        id: tcp
        shell: bash
        run: |
          echo "::set-output name=TCP_ADDRESS::$(kubectl logs etcd -n elastic-job | grep -Eo '[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}:[0-9]{1,4}' | head -1)"

      # Runs with `shell: python` — the body below is a Python script, not bash.
      - name: Update Test Config. File
        run: |
          import os
          from dtrun.configs import prepare_multi_nodes_gpu_config
          assert os.path.isfile('./tests/mnode_tests.txt')
          prepare_multi_nodes_gpu_config(
              './.github/multi-nodes-gpu.yaml',
              './tests/mnode_tests.txt',
              sha="${{ steps.vars.outputs.SHA }}",
              tcp_address="${{ steps.tcp.outputs.TCP_ADDRESS }}",
              python_version="${{ matrix.python-version }}",
              torch_version="${{ matrix.pytorch-version }}",
              num_gpus=1,
          )
        shell: python

      - name: Apply Multi Node Testing
        run: |
          # cat ./.github/multi-nodes-gpu.yaml
          kubectl apply -f ./.github/multi-nodes-gpu.yaml
        shell: bash

      - name: Wait
        # todo: this shall be dynamic
        if: always()
        shell: bash
        run: |
          sleep 400

      - name: Listen to Jobs Logging
        shell: bash
        run: |
          # todo: Enable automatic checking.
          # while [ $i -lt $MAX_CHECKS ]; do ((i++)); if kubectl logs ${{ steps.job.outputs.ID_NAME }}-worker-0 -n elastic-job | grep -i "error\|failed"; then status_code=1 && break; elif kubectl logs ${{ steps.job.outputs.ID }}-worker-0 -n elastic-job | grep "TEST END"; then status_code=0 && break; else printf "." ; fi; sleep $CHECK_SLEEP; done && \
          # echo "Done waiting. Job status code: $status_code" && \
          kubectl logs ${{ steps.job.outputs.ID_NAME }}-worker-0 -n elastic-job > /tmp/full_output.txt
          if grep -q 'END_TOKEN' /tmp/full_output.txt ; then csplit /tmp/full_output.txt '/END_TOKEN/'; else mv /tmp/full_output.txt xx00; fi && \
          cat xx00

      # xx01 (post-END_TOKEN split) holds the base64-encoded coverage data
      # emitted by the job; decode it into the repo checkout for reporting.
      - name: Statistics
        if: success()
        run: |
          cat ./xx01 | tail -n +2 | base64 --decode > /home/runner/work/pytorch-lightning/pytorch-lightning/.coverage
          cd /home/runner/work/pytorch-lightning/pytorch-lightning && coverage report && coverage xml

      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v1
        if: always()
        # see: https://github.com/actions/toolkit/issues/399
        continue-on-error: true
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          file: coverage.xml
          flags: multi-nodes,pytest
          name: multi-nodes-coverage
          fail_ci_if_error: false

      # Always tear down the cluster resources, even when earlier steps failed,
      # so GPU nodes are not left running (and billing).
      - name: Delete Group Node
        if: always()
        run: |
          kubectl delete ElasticJob ${{ steps.job.outputs.ID_NAME }} -n elastic-job
          eksctl delete nodegroup ${{ steps.job.outputs.ID }} --cluster=$AWS_CLUSTER