# -*- coding: utf-8 -*-
# Copyright 2014 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Implementation of Unix-like rsync command."""
from __future__ import absolute_import
import collections
import errno
import heapq
import io
from itertools import islice
import logging
import os
import re
import tempfile
import textwrap
import time
import traceback
import urllib
from boto import config
import crcmod
from gslib.bucket_listing_ref import BucketListingObject
from gslib.cloud_api import NotFoundException
from gslib.command import Command
from gslib.command import DummyArgChecker
from gslib.command_argument import CommandArgument
from gslib.cs_api_map import ApiSelector
from gslib.exception import CommandException
from gslib.metrics import LogPerformanceSummaryParams
from gslib.plurality_checkable_iterator import PluralityCheckableIterator
from gslib.seek_ahead_thread import SeekAheadResult
from gslib.sig_handling import GetCaughtSignals
from gslib.sig_handling import RegisterSignalHandler
from gslib.storage_url import GenerationFromUrlAndString
from gslib.storage_url import IsCloudSubdirPlaceholder
from gslib.storage_url import StorageUrlFromString
from gslib.third_party.storage_apitools import storage_v1_messages as apitools_messages
from gslib.utils import constants
from gslib.utils import copy_helper
from gslib.utils import parallelism_framework_util
from gslib.utils.boto_util import UsingCrcmodExtension
from gslib.utils.cloud_api_helper import GetCloudApiInstance
from gslib.utils.copy_helper import CreateCopyHelperOpts
from gslib.utils.copy_helper import GetSourceFieldsNeededForCopy
from gslib.utils.copy_helper import GZIP_ALL_FILES
from gslib.utils.copy_helper import SkipUnsupportedObjectError
from gslib.utils.hashing_helper import CalculateB64EncodedCrc32cFromContents
from gslib.utils.hashing_helper import CalculateB64EncodedMd5FromContents
from gslib.utils.hashing_helper import SLOW_CRCMOD_RSYNC_WARNING
from gslib.utils.hashing_helper import SLOW_CRCMOD_WARNING
from gslib.utils.metadata_util import CreateCustomMetadata
from gslib.utils.metadata_util import GetValueFromObjectCustomMetadata
from gslib.utils.metadata_util import ObjectIsGzipEncoded
from gslib.utils.posix_util import ATIME_ATTR
from gslib.utils.posix_util import ConvertDatetimeToPOSIX
from gslib.utils.posix_util import ConvertModeToBase8
from gslib.utils.posix_util import DeserializeFileAttributesFromObjectMetadata
from gslib.utils.posix_util import GID_ATTR
from gslib.utils.posix_util import InitializeUserGroups
from gslib.utils.posix_util import MODE_ATTR
from gslib.utils.posix_util import MTIME_ATTR
from gslib.utils.posix_util import NA_ID
from gslib.utils.posix_util import NA_MODE
from gslib.utils.posix_util import NA_TIME
from gslib.utils.posix_util import NeedsPOSIXAttributeUpdate
from gslib.utils.posix_util import ParseAndSetPOSIXAttributes
from gslib.utils.posix_util import POSIXAttributes
from gslib.utils.posix_util import SerializeFileAttributesToObjectMetadata
from gslib.utils.posix_util import UID_ATTR
from gslib.utils.posix_util import ValidateFilePermissionAccess
from gslib.utils.posix_util import WarnFutureTimestamp
from gslib.utils.posix_util import WarnInvalidValue
from gslib.utils.posix_util import WarnNegativeAttribute
from gslib.utils.rsync_util import DiffAction
from gslib.utils.rsync_util import RsyncDiffToApply
from gslib.utils.system_util import IS_WINDOWS
from gslib.utils.translation_helper import CopyCustomMetadata
from gslib.utils.unit_util import CalculateThroughput
from gslib.utils.unit_util import SECONDS_PER_DAY
from gslib.utils.unit_util import TEN_MIB
from gslib.wildcard_iterator import CreateWildcardIterator
_SYNOPSIS = """
gsutil rsync [OPTION]... src_url dst_url
"""
# pylint: disable=anomalous-backslash-in-string
_DETAILED_HELP_TEXT = ("""
<B>SYNOPSIS</B>
""" + _SYNOPSIS + """
<B>DESCRIPTION</B>
The gsutil rsync command makes the contents under dst_url the same as the
contents under src_url, by copying any missing files/objects (or those whose
data has changed), and (if the -d option is specified) deleting any extra
files/objects. src_url must specify a directory, bucket, or bucket
subdirectory. For example, to make gs://mybucket/data match the contents of
the local directory "data" you could do:
gsutil rsync -d data gs://mybucket/data
To recurse into directories use the -r option:
gsutil rsync -d -r data gs://mybucket/data
To copy only new/changed files without deleting extra files from
gs://mybucket/data leave off the -d option:
gsutil rsync -r data gs://mybucket/data
If you have a large number of objects to synchronize you might want to use the
gsutil -m option, to perform parallel (multi-threaded/multi-processing)
synchronization:
gsutil -m rsync -d -r data gs://mybucket/data
The -m option typically will provide a large performance boost if either the
source or destination (or both) is a cloud URL. If both source and
destination are file URLs the -m option will typically thrash the disk and
slow synchronization down.
To make the local directory "data" the same as the contents of
gs://mybucket/data:
gsutil rsync -d -r gs://mybucket/data data
To make the contents of gs://mybucket2 the same as gs://mybucket1:
gsutil rsync -d -r gs://mybucket1 gs://mybucket2
You can also mirror data across local directories:
gsutil rsync -d -r dir1 dir2
To mirror your content across clouds:
gsutil rsync -d -r gs://my-gs-bucket s3://my-s3-bucket
Note 1: Shells (like bash, zsh) sometimes attempt to expand wildcards in ways
that can be surprising. Also, attempting to copy files whose names contain
wildcard characters can result in problems. For more details about these
issues see the section "POTENTIALLY SURPRISING BEHAVIOR WHEN USING WILDCARDS"
under "gsutil help wildcards".
Note 2: If you are synchronizing a large amount of data between clouds you
might consider setting up a
`Google Compute Engine <https://cloud.google.com/products/compute-engine>`_
account and running gsutil there. Since cross-provider gsutil data transfers
flow through the machine where gsutil is running, doing this can make your
transfer run significantly faster than running gsutil on your local
workstation.
<B>BE CAREFUL WHEN USING -d OPTION!</B>
The rsync -d option is very useful and commonly used, because it provides a
means of making the contents of a destination bucket or directory match those
of a source bucket or directory. However, please exercise caution when you
use this option: It's possible to delete large amounts of data accidentally
if, for example, you erroneously reverse source and destination. For example,
if you meant to synchronize a local directory from a bucket in the cloud but
instead run the command:
gsutil -m rsync -r -d ./your-dir gs://your-bucket
and your-dir is currently empty, you will quickly delete all of the objects in
gs://your-bucket.
You can also cause large amounts of data to be lost quickly by specifying a
subdirectory of the destination as the source of an rsync. For example, the
command:
gsutil -m rsync -r -d gs://your-bucket/data gs://your-bucket
would cause most or all of the objects in gs://your-bucket to be deleted
(some objects may survive if there are any with names that sort lower than
"data" under gs://your-bucket/data).
In addition to paying careful attention to the source and destination you
specify with the rsync command, there are two more safety measures you can
take when using gsutil rsync -d:
1. Try running the command with the rsync -n option first, to see what it
would do without actually performing the operations. For example, if
you run the command:
gsutil -m rsync -r -d -n gs://your-bucket/data gs://your-bucket
it will be immediately evident that running that command without the -n
option would cause many objects to be deleted.
2. Enable object versioning in your bucket, which will allow you to restore
objects if you accidentally delete them. For more details see
"gsutil help versions".
<B>BE CAREFUL WHEN SYNCHRONIZING OVER OS-SPECIFIC FILE TYPES (SYMLINKS, DEVICES, ETC.)</B>
Running gsutil rsync over a directory containing operating system-specific
file types (symbolic links, device files, sockets, named pipes, etc.) can
cause various problems. For example, running a command like:
gsutil rsync -r ./dir gs://my-bucket
will cause gsutil to follow any symbolic links in ./dir, creating objects in
my-bucket containing the data from the files to which the symlinks point. This
can cause various problems:
* If you use gsutil rsync as a simple way to back up a directory to a bucket,
restoring from that bucket will result in files where the symlinks used
to be. At best this is wasteful of space, and at worst it can result in
outdated data or broken applications -- depending on what is consuming
the symlinks.
* If you use gsutil rsync over directories containing broken symlinks,
gsutil rsync will abort (unless you pass the -e option).
* gsutil rsync skips symlinks that point to directories.
Since gsutil rsync is intended to support data operations (like moving a data
set to the cloud for computational processing) and it needs to be compatible
both in the cloud and across common operating systems, there are no plans for
gsutil rsync to support operating system-specific file types like symlinks.
We recommend that users do one of the following:
* Don't use gsutil rsync over directories containing symlinks or other OS-
specific file types.
* Use the -e option to exclude symlinks or the -x option to exclude
OS-specific file types by name.
* Use a tool (such as tar) that preserves symlinks and other OS-specific file
types, packaging up directories containing such files before uploading to
the cloud.
<B>EVENTUAL CONSISTENCY WITH NON-GOOGLE CLOUD PROVIDERS</B>
While Google Cloud Storage is strongly consistent, some cloud providers
only support eventual consistency. You may encounter scenarios where rsync
synchronizes using stale listing data when working with these other cloud
providers. For example, if you run rsync immediately after uploading an
object to an eventually consistent cloud provider, the added object may not
yet appear in the provider's listing. Consequently, rsync will miss adding
the object to the destination. If this happens you can rerun the rsync
operation again later (after the object listing has "caught up").
<B>CHECKSUM VALIDATION AND FAILURE HANDLING</B>
At the end of every upload or download, the gsutil rsync command validates
that the checksum of the source file/object matches the checksum of the
destination file/object. If the checksums do not match, gsutil will delete
the invalid copy and print a warning message. This very rarely happens, but
if it does, please contact gs-team@google.com.
The rsync command will retry when failures occur, but if enough failures
happen during a particular copy or delete operation the command will fail.
If the -C option is provided, the command will instead skip the failing
object and move on. At the end of the synchronization run if any failures
were not successfully retried, the rsync command will report the count of
failures, and exit with non-zero status. At this point you can run the rsync
command again, and it will attempt any remaining needed copy and/or delete
operations.
Note that there are cases where retrying will never succeed, such as if you
don't have write permission to the destination bucket or if the destination
path for some objects is longer than the maximum allowed length.
For more details about gsutil's retry handling, please see
"gsutil help retries".
<B>CHANGE DETECTION ALGORITHM</B>
To determine if a file or object has changed, gsutil rsync first checks
whether the file modification time (mtime) of both the source and destination
is available. If mtime is available at both source and destination, and the
destination mtime is different than the source, or if the source and
destination file size differ, gsutil rsync will update the destination. If the
source is a cloud bucket and the destination is a local file system, and if
mtime is not available for the source, gsutil rsync will use the time created
for the cloud object as a substitute for mtime. Otherwise, if mtime is not
available for either the source or the destination, gsutil rsync will fall
back to using checksums. If the source and destination are both cloud buckets
with checksums available, gsutil rsync will use these hashes instead of mtime.
However, gsutil rsync will still update mtime at the destination if it is not
present. If the source and destination have matching checksums and only the
source has an mtime, gsutil rsync will copy the mtime to the destination. If
neither mtime nor checksums are available, gsutil rsync will resort to
comparing file sizes.
Checksums will not be available when comparing composite Google Cloud Storage
objects with objects at a cloud provider that does not support CRC32C (which
is the only checksum available for composite objects). See 'gsutil help
compose' for details about composite objects.
<B>COPYING IN THE CLOUD AND METADATA PRESERVATION</B>
If both the source and destination URL are cloud URLs from the same provider,
gsutil copies data "in the cloud" (i.e., without downloading to and uploading
from the machine where you run gsutil). In addition to the performance and
cost advantages of doing this, copying in the cloud preserves metadata (like
Content-Type and Cache-Control). In contrast, when you download data from the
cloud it ends up in a file, which has no associated metadata, other than file
modification time (mtime). Thus, unless you have some way to hold on to or
re-create that metadata, synchronizing a bucket to a directory in the local
file system will not retain the metadata other than mtime.
Note that by default, the gsutil rsync command does not copy the ACLs of
objects being synchronized and instead will use the default bucket ACL (see
"gsutil help defacl"). You can override this behavior with the -p option (see
OPTIONS below).
<B>SLOW CHECKSUMS</B>
If you find that CRC32C checksum computation runs slowly, this is likely
because you don't have a compiled CRC32C implementation on your system. Try running:
gsutil ver -l
If the output contains:
compiled crcmod: False
you are running a Python library for computing CRC32C, which is much slower
than using the compiled code. For information on getting a compiled CRC32C
implementation, see 'gsutil help crc32c'.
<B>LIMITATIONS</B>
1. The gsutil rsync command will only allow non-negative file modification
times to be used in its comparisons. This means gsutil rsync will resort to
using checksums for any file with a timestamp before 1970-01-01 UTC.
2. The gsutil rsync command considers only the current object generations in
the source and destination buckets when deciding what to copy / delete. If
versioning is enabled in the destination bucket then gsutil rsync's
overwriting or deleting objects will end up creating versions, but the
command doesn't try to make the archived generations match in the source
and destination buckets.
3. The gsutil rsync command does not support copying special file types
such as sockets, device files, named pipes, or any other non-standard
files intended to represent an operating system resource. If you run
gsutil rsync on a source directory that includes such files (for example,
copying the root directory on Linux that includes /dev), you should use
the -x flag to exclude these files. Otherwise, gsutil rsync may fail or
hang.
4. The gsutil rsync command copies changed files in their entirety and does
not employ the
`rsync delta-transfer algorithm <https://rsync.samba.org/tech_report/>`_
to transfer portions of a changed file. This is because cloud objects are
immutable and no facility exists to read partial cloud object checksums or
perform partial overwrites.
<B>OPTIONS</B>
-a canned_acl Sets named canned_acl when uploaded objects are created. See
"gsutil help acls" for further details. Note that rsync will
decide whether or not to perform a copy based only on object
size and modification time, not current ACL state. Also see the
-p option below.
-c Causes the rsync command to compute and compare checksums
(instead of comparing mtime) for files if the size of source
and destination match. This option increases local disk I/O and
run time if either src_url or dst_url are on the local file
system.
-C If an error occurs, continue to attempt to copy the remaining
files. If errors occurred, gsutil's exit status will be
non-zero even if this flag is set. This option is implicitly
set when running "gsutil -m rsync...". Note: -C only applies
to the actual copying operation. If an error occurs while
iterating over the files in the local directory (e.g., invalid
Unicode file name) gsutil will print an error message and
abort.
-d Delete extra files under dst_url not found under src_url. By
default extra files are not deleted. Note: this option can
delete data quickly if you specify the wrong source/destination
combination. See the help section above,
"BE CAREFUL WHEN USING -d OPTION!".
-e Exclude symlinks. When specified, symbolic links will be
ignored. Note that gsutil does not follow directory symlinks,
regardless of whether -e is specified.
-j <ext,...> Applies gzip transport encoding to any file upload whose
extension matches the -j extension list. This is useful when
uploading files with compressible content (such as .js, .css,
or .html files) because it saves network bandwidth while
also leaving the data uncompressed in Google Cloud Storage.
When you specify the -j option, files being uploaded are
compressed in-memory and on-the-wire only. Both the local
files and Cloud Storage objects remain uncompressed. The
uploaded objects retain the Content-Type and name of the
original files.
Note that if you want to use the top-level -m option to
parallelize copies along with the -j/-J options, you should
prefer using multiple processes instead of multiple threads;
when using -j/-J, multiple threads in the same process are
bottlenecked by Python's GIL. Thread and process count can be
set using the "parallel_thread_count" and
"parallel_process_count" boto config options, e.g.:
gsutil -o "GSUtil:parallel_process_count=8" \\
-o "GSUtil:parallel_thread_count=1" \\
-m rsync -j /local/source/dir gs://bucket/path
-J Applies gzip transport encoding to file uploads. This option
works like the -j option described above, but it applies to
all uploaded files, regardless of extension.
Warning: If you use this option and some of the source files
don't compress well (e.g., that's often true of binary data),
this option may result in longer uploads.
-n Causes rsync to run in "dry run" mode, i.e., just outputting
what would be copied or deleted without actually doing any
copying/deleting.
-p Causes ACLs to be preserved when objects are copied. Note that
rsync will decide whether or not to perform a copy based only
on object size and modification time, not current ACL state.
Thus, if the source and destination differ in size or
modification time and you run gsutil rsync -p, the file will be
copied and ACL preserved. However, if the source and
destination don't differ in size or checksum but have different
ACLs, running gsutil rsync -p will have no effect.
Note that this option has performance and cost implications
when using the XML API, as it requires separate HTTP calls for
interacting with ACLs. The performance issue can be mitigated
to some degree by using gsutil -m rsync to cause parallel
synchronization. Also, this option only works if you have OWNER
access to all of the objects that are copied.
You can avoid the additional performance and cost of using
rsync -p if you want all objects in the destination bucket to
end up with the same ACL by setting a default object ACL on
that bucket instead of using rsync -p. See 'gsutil help
defacl'.
-P Causes POSIX attributes to be preserved when objects are
copied. With this feature enabled, gsutil rsync will copy
fields provided by stat. These are the user ID of the owner,
the group ID of the owning group, the mode (permissions) of the
file, and the access/modification time of the file. For
downloads, these attributes will only be set if the source
objects were uploaded with this flag enabled.
On Windows, this flag will only set and restore access time and
modification time. This is because Windows doesn't have a
notion of POSIX uid/gid/mode.
-R, -r The -R and -r options are synonymous. Causes directories,
buckets, and bucket subdirectories to be synchronized
recursively. If you neglect to use this option gsutil will make
only the top-level directory in the source and destination URLs
match, skipping any sub-directories.
-u When a file/object is present in both the source and
destination, if mtime is available for both, do not perform
the copy if the destination mtime is newer.
-U Skip objects with unsupported object types instead of failing.
Unsupported object types are Amazon S3 Objects in the GLACIER
storage class.
-x pattern Causes files/objects matching pattern to be excluded, i.e., any
matching files/objects will not be copied or deleted. Note that
the pattern is a Python regular expression, not a wildcard (so,
matching any string ending in "abc" would be specified using
".*abc$" rather than "*abc"). Note also that the exclude path
is always relative (similar to Unix rsync or tar exclude
options). For example, if you run the command:
gsutil rsync -x "data./.*\.txt$" dir gs://my-bucket
it will skip the file dir/data1/a.txt.
You can use regex alternation to specify multiple exclusions,
for example:
gsutil rsync -x ".*\.txt$|.*\.jpg$" dir gs://my-bucket
NOTE: When using this on the Windows command line, use ^ as an
escape character instead of \ and escape the | character.
""")
# pylint: enable=anomalous-backslash-in-string
_NA = '-'
_OUTPUT_BUFFER_SIZE = 64 * 1024
_PROGRESS_REPORT_LISTING_COUNT = 10000
# Tracks files we need to clean up at end or if interrupted. Because some
# files are passed to rsync's diff iterators, it is difficult to manage when
# they should be closed, especially in the event that we receive a signal to
# exit. Every time such a file is opened, its file object should be appended
# to this list.
_tmp_files = []
# pylint: disable=unused-argument
def _HandleSignals(signal_num, cur_stack_frame):
"""Called when rsync command is killed with SIGINT, SIGQUIT or SIGTERM."""
CleanUpTempFiles()
def CleanUpTempFiles():
"""Cleans up temp files.
This function allows the main (RunCommand) function to clean up at end of
operation, or if gsutil rsync is interrupted (e.g., via ^C). This is necessary
because tempfile.NamedTemporaryFile doesn't allow the created file to be
re-opened in read mode on Windows, so we have to use tempfile.mkstemp, which
doesn't automatically delete temp files.
"""
# First pass: Close all the files. Wrapped iterators result in open file
# objects for the same file, and Windows does not allow removing the file
# at a given path until all its open file handles have been closed.
for fileobj in _tmp_files:
# Windows requires temp files to be closed before unlinking.
if not fileobj.closed:
fileobj.close()
# Second pass: Remove each file, skipping duplicates that have already been
# removed.
for fileobj in _tmp_files:
if os.path.isfile(fileobj.name):
try:
os.unlink(fileobj.name)
except Exception as e: # pylint: disable=broad-except
logging.debug(
'Failed to close and delete temp file "%s". Got an error:\n%s',
fileobj.name, e)
def _DiffToApplyArgChecker(command_instance, diff_to_apply):
"""Arg checker that skips symlinks if -e flag specified."""
if (diff_to_apply.diff_action == DiffAction.REMOVE
or not command_instance.exclude_symlinks):
# No src URL is populated for REMOVE actions.
return True
exp_src_url = StorageUrlFromString(diff_to_apply.src_url_str)
if exp_src_url.IsFileUrl() and os.path.islink(exp_src_url.object_name):
command_instance.logger.info('Skipping symbolic link %s...', exp_src_url)
return False
return True
def _ComputeNeededFileChecksums(logger, src_url_str, src_size, src_crc32c,
src_md5, dst_url_str, dst_size, dst_crc32c,
dst_md5):
"""Computes any file checksums needed by _CompareObjects.
Args:
logger: logging.logger for outputting log messages.
src_url_str: Source URL string.
src_size: Source size
src_crc32c: Source CRC32c.
src_md5: Source MD5.
dst_url_str: Destination URL string.
dst_size: Destination size
dst_crc32c: Destination CRC32c.
dst_md5: Destination MD5.
Returns:
(src_crc32c, src_md5, dst_crc32c, dst_md5)
"""
src_url = StorageUrlFromString(src_url_str)
dst_url = StorageUrlFromString(dst_url_str)
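  # For a file source, compute whichever checksum the destination side can be
  # compared against: prefer CRC32C when the destination already has one (or
  # is itself a local file), otherwise fall back to MD5. The progress message
  # is only logged for files larger than 10 MiB, since smaller files hash
  # quickly.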
if src_url.IsFileUrl():
if dst_crc32c != _NA or dst_url.IsFileUrl():
if src_size > TEN_MIB:
logger.info('Computing CRC32C for %s...', src_url_str)
with open(src_url.object_name, 'rb') as fp:
src_crc32c = CalculateB64EncodedCrc32cFromContents(fp)
elif dst_md5 != _NA or dst_url.IsFileUrl():
if dst_size > TEN_MIB:
logger.info('Computing MD5 for %s...', src_url_str)
with open(src_url.object_name, 'rb') as fp:
src_md5 = CalculateB64EncodedMd5FromContents(fp)
if dst_url.IsFileUrl():
if src_crc32c != _NA:
if src_size > TEN_MIB:
logger.info('Computing CRC32C for %s...', dst_url_str)
with open(dst_url.object_name, 'rb') as fp:
dst_crc32c = CalculateB64EncodedCrc32cFromContents(fp)
elif src_md5 != _NA:
if dst_size > TEN_MIB:
logger.info('Computing MD5 for %s...', dst_url_str)
with open(dst_url.object_name, 'rb') as fp:
dst_md5 = CalculateB64EncodedMd5FromContents(fp)
return (src_crc32c, src_md5, dst_crc32c, dst_md5)
def _ListUrlRootFunc(cls, args_tuple, thread_state=None):
"""Worker function for listing files/objects under to be sync'd.
Outputs sorted list to out_file_name, formatted per _BuildTmpOutputLine. We
sort the listed URLs because we don't want to depend on consistent sort
order across file systems and cloud providers.
Args:
cls: Command instance.
args_tuple: (base_url_str, out_file_name, desc), where base_url_str is
top-level URL string to list; out_filename is name of file to
which sorted output should be written; desc is 'source' or
'destination'.
thread_state: gsutil Cloud API instance to use.
"""
gsutil_api = GetCloudApiInstance(cls, thread_state=thread_state)
(base_url_str, out_filename, desc) = args_tuple
# We sort while iterating over base_url_str, allowing parallelism of batched
# sorting with collecting the listing.
out_file = io.open(out_filename, mode='w', encoding=constants.UTF8)
try:
_BatchSort(_FieldedListingIterator(cls, gsutil_api, base_url_str, desc),
out_file)
except Exception as e: # pylint: disable=broad-except
# Abandon rsync if an exception percolates up to this layer - retryable
# exceptions are handled in the lower layers, so we got a non-retryable
# exception (like 404 bucket not found) and proceeding would either be
# futile or could result in data loss - for example:
# gsutil rsync -d gs://non-existent-bucket ./localdir
# would delete files from localdir.
cls.logger.error(
'Caught non-retryable exception while listing %s: %s' %
(base_url_str, e))
cls.non_retryable_listing_failures = 1
out_file.close()
def _LocalDirIterator(base_url):
"""A generator that yields a BLR for each file in a local directory.
We use this function instead of WildcardIterator for listing a local
directory without recursion, because the glob implementation called
by WildcardIterator skips "dot" files (which we don't want to do when
synchronizing to or from a local directory).
Args:
base_url: URL for the directory over which to iterate.
Yields:
BucketListingObject for each file in the directory.
"""
for filename in os.listdir(base_url.object_name):
filename = os.path.join(base_url.object_name, filename)
if os.path.isfile(filename):
yield BucketListingObject(StorageUrlFromString(filename), None)
def _FieldedListingIterator(cls, gsutil_api, base_url_str, desc):
"""Iterator over base_url_str formatting output per _BuildTmpOutputLine.
Args:
cls: Command instance.
gsutil_api: gsutil Cloud API instance to use for bucket listing.
base_url_str: The top-level URL string over which to iterate.
desc: 'source' or 'destination'.
Yields:
Output line formatted per _BuildTmpOutputLine.
"""
base_url = StorageUrlFromString(base_url_str)
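  # A local, non-recursive listing uses _LocalDirIterator so that "dot" files
  # are included; everything else goes through the wildcard iterator, using
  # '**' for a recursive listing and '*' for a single-level listing.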
if base_url.scheme == 'file' and not cls.recursion_requested:
iterator = _LocalDirIterator(base_url)
else:
if cls.recursion_requested:
wildcard = '%s/**' % base_url_str.rstrip('/\\')
else:
wildcard = '%s/*' % base_url_str.rstrip('/\\')
fields = ['crc32c', 'md5Hash', 'name', 'size', 'timeCreated',
'metadata/%s' % MTIME_ATTR]
if cls.preserve_posix_attrs:
fields.extend(['metadata/%s' % ATIME_ATTR, 'metadata/%s' % MODE_ATTR,
'metadata/%s' % GID_ATTR, 'metadata/%s' % UID_ATTR])
iterator = CreateWildcardIterator(
wildcard, gsutil_api, project_id=cls.project_id,
ignore_symlinks=cls.exclude_symlinks, logger=cls.logger).IterObjects(
# Request just the needed fields, to reduce bandwidth usage.
bucket_listing_fields=fields)
i = 0
for blr in iterator:
# Various GUI tools (like the GCS web console) create placeholder objects
# ending with '/' when the user creates an empty directory. Normally these
# tools should delete those placeholders once objects have been written
# "under" the directory, but sometimes the placeholders are left around.
# We need to filter them out here, otherwise if the user tries to rsync
# from GCS to a local directory it will result in a directory/file
# conflict (e.g., trying to download an object called "mydata/" where the
# local directory "mydata" exists).
url = blr.storage_url
if IsCloudSubdirPlaceholder(url, blr=blr):
# We used to output the message 'Skipping cloud sub-directory placeholder
# object...' but we no longer do so because it caused customer confusion.
continue
if (cls.exclude_symlinks and url.IsFileUrl()
and os.path.islink(url.object_name)):
continue
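    # The -x exclude pattern is matched against the path relative to the base
    # URL, with any leading delimiter stripped (see the OPTIONS text above).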
if cls.exclude_pattern:
str_to_check = url.url_string[len(base_url_str):]
if str_to_check.startswith(url.delim):
str_to_check = str_to_check[1:]
if cls.exclude_pattern.match(str_to_check):
continue
i += 1
if i % _PROGRESS_REPORT_LISTING_COUNT == 0:
cls.logger.info('At %s listing %d...', desc, i)
yield _BuildTmpOutputLine(blr)
def _BuildTmpOutputLine(blr):
"""Builds line to output to temp file for given BucketListingRef.
Args:
blr: The BucketListingRef.
Returns:
The output line, formatted as
_EncodeUrl(URL)<sp>size<sp>time_created<sp>atime<sp>mtime<sp>mode<sp>uid<sp>
gid<sp>crc32c<sp>md5 where md5 will only be present for cloud URLs that
aren't composite objects. A missing field is populated with '-', or -1 in
the case of atime/mtime/time_created.
"""
atime = NA_TIME
crc32c = _NA
gid = NA_ID
md5 = _NA
mode = NA_MODE
mtime = NA_TIME
time_created = NA_TIME
uid = NA_ID
url = blr.storage_url
if url.IsFileUrl():
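    # os.stat returns (st_mode, st_ino, st_dev, st_nlink, st_uid, st_gid,
    # st_size, st_atime, st_mtime, st_ctime); unpack only the fields we need.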
mode, _, _, _, uid, gid, size, atime, mtime, _ = os.stat(url.object_name)
# atime/mtime can be a float, so it needs to be converted to a long.
atime = long(atime)
mtime = long(mtime)
mode = ConvertModeToBase8(mode)
# Don't use atime / mtime with times older than 1970-01-01 UTC.
if atime < 0:
atime = NA_TIME
if mtime < 0:
mtime = NA_TIME
elif url.IsCloudUrl():
size = blr.root_object.size
if blr.root_object.metadata is not None:
found_m, mtime_str = GetValueFromObjectCustomMetadata(blr.root_object,
MTIME_ATTR, NA_TIME)
try:
# The mtime value can be changed in the online console, this performs a
# sanity check and sets the mtime to NA_TIME if it fails.
mtime = long(mtime_str)
if found_m and mtime <= NA_TIME:
WarnNegativeAttribute('mtime', url.url_string)
if mtime > long(time.time()) + SECONDS_PER_DAY:
WarnFutureTimestamp('mtime', url.url_string)
except ValueError:
# Since mtime is a string, catch the case where it can't be cast as a
# long.
WarnInvalidValue('mtime', url.url_string)
mtime = NA_TIME
posix_attrs = DeserializeFileAttributesFromObjectMetadata(blr.root_object,
url.url_string)
mode = posix_attrs.mode.permissions
atime = posix_attrs.atime
uid = posix_attrs.uid
gid = posix_attrs.gid
# Sanitize the timestamp returned, and put it in UTC format. For more
# information see the UTC class in gslib/util.py.
time_created = ConvertDatetimeToPOSIX(blr.root_object.timeCreated)
crc32c = blr.root_object.crc32c or _NA
md5 = blr.root_object.md5Hash or _NA
else:
raise CommandException('Got unexpected URL type (%s)' % url.scheme)
return '%s %d %d %d %d %d %d %d %s %s\n' % (_EncodeUrl(url.url_string), size,
time_created, atime, mtime, mode,
uid, gid, crc32c, md5)
def _EncodeUrl(url_string):
"""Encodes url_str with quote plus encoding and UTF8 character encoding.
We use this for all URL encodings.
Args:
url_string: String URL to encode.
Returns:
encoded URL.
"""
return urllib.quote_plus(url_string.encode(constants.UTF8))
def _DecodeUrl(enc_url_string):
"""Inverts encoding from EncodeUrl.
Args:
enc_url_string: String URL to decode.
Returns:
decoded URL.
"""
return urllib.unquote_plus(enc_url_string).decode(constants.UTF8)
# pylint: disable=bare-except
def _BatchSort(in_iter, out_file):
"""Sorts input lines from in_iter and outputs to out_file.
Sorts in batches as input arrives, so input file does not need to be loaded
into memory all at once. Derived from Python Recipe 466302: Sorting big
files the Python 2.4 way by Nicolas Lehuen.
Sorted format is per _BuildTmpOutputLine. We're sorting on the entire line
when we could just sort on the first record (URL); but the sort order is
identical either way.
Args:
in_iter: Input iterator.
out_file: Output file.
"""
# Note: If chunk_files gets very large we can run out of open FDs. See .boto
# file comments about rsync_buffer_lines. If increasing rsync_buffer_lines
# doesn't suffice (e.g., for someone synchronizing with a really large
# bucket), an option would be to make gsutil merge in passes, never
# opening all chunk files simultaneously.
buffer_size = config.getint('GSUtil', 'rsync_buffer_lines', 32000)
chunk_files = []
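  # Sort the input in buffer_size-line batches, spilling each sorted batch to
  # its own temp chunk file, then stream a k-way merge of the chunk files into
  # out_file via heapq.merge.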
try:
while True:
current_chunk = sorted(islice(in_iter, buffer_size))
if not current_chunk:
break
output_chunk = io.open('%s-%06i' % (out_file.name, len(chunk_files)),
mode='w+', encoding=constants.UTF8)
chunk_files.append(output_chunk)
output_chunk.write(unicode(''.join(current_chunk)))
output_chunk.flush()
output_chunk.seek(0)
out_file.writelines(heapq.merge(*chunk_files))
except IOError as e:
if e.errno == errno.EMFILE:
raise CommandException('\n'.join(textwrap.wrap(
'Synchronization failed because too many open file handles were '
'needed while building synchronization state. Please see the '
'comments about rsync_buffer_lines in your .boto config file for a '
'possible way to address this problem.')))
raise
finally:
for chunk_file in chunk_files:
try:
chunk_file.close()
os.remove(chunk_file.name)
except Exception as e: # pylint: disable=broad-except
logging.debug(
'Failed to remove rsync chunk file "%s". Got an error:\n%s',
chunk_file.name, e)
class _DiffIterator(object):
"""Iterator yielding sequence of RsyncDiffToApply objects."""
def __init__(self, command_obj, base_src_url, base_dst_url):
global _tmp_files
self.command_obj = command_obj
self.compute_file_checksums = command_obj.compute_file_checksums
self.delete_extras = command_obj.delete_extras
self.recursion_requested = command_obj.recursion_requested
self.logger = self.command_obj.logger
self.base_src_url = base_src_url
self.base_dst_url = base_dst_url
self.preserve_posix = command_obj.preserve_posix_attrs
self.skip_old_files = command_obj.skip_old_files
self.logger.info('Building synchronization state...')
# Files to track src and dst state should be created in the system's
# preferred temp directory so that they are eventually cleaned up if our
# cleanup callback is interrupted.
temp_src_file = tempfile.NamedTemporaryFile(
prefix='gsutil-rsync-src-', delete=False)
temp_dst_file = tempfile.NamedTemporaryFile(
prefix='gsutil-rsync-dst-', delete=False)
self.sorted_list_src_file_name = temp_src_file.name
self.sorted_list_dst_file_name = temp_dst_file.name
_tmp_files.append(temp_src_file)
_tmp_files.append(temp_dst_file)
# Close the files, but don't delete them. Because Windows does not allow
# a temporary file to be reopened until it's been closed, we close the
# files before proceeding. This allows each step below to open the file at
# the specified path, perform I/O, and close it so that the next step may
# do the same thing.
temp_src_file.close()
temp_dst_file.close()
# Build sorted lists of src and dst URLs in parallel. To do this, pass
# args to _ListUrlRootFunc as tuple (base_url_str, out_filename, desc)
# where base_url_str is the starting URL string for listing.
args_iter = iter([
(self.base_src_url.url_string, self.sorted_list_src_file_name,
'source'),
(self.base_dst_url.url_string, self.sorted_list_dst_file_name,
'destination')
])
# Contains error message from non-retryable listing failure.
command_obj.non_retryable_listing_failures = 0
shared_attrs = ['non_retryable_listing_failures']
command_obj.Apply(
_ListUrlRootFunc, args_iter, _RootListingExceptionHandler, shared_attrs,
arg_checker=DummyArgChecker,
parallel_operations_override=command_obj.ParallelOverrideReason.SPEED,
fail_on_error=True)
if command_obj.non_retryable_listing_failures:
raise CommandException('Caught non-retryable exception - aborting rsync')
# Note that while this leaves 2 open file handles, we track these in a
# global list to be closed (if not closed in the calling scope) and deleted
# at exit time.
self.sorted_list_src_file = open(self.sorted_list_src_file_name, 'r')
self.sorted_list_dst_file = open(self.sorted_list_dst_file_name, 'r')
_tmp_files.append(self.sorted_list_src_file)
_tmp_files.append(self.sorted_list_dst_file)
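    # When copying from the cloud to the local file system with POSIX
    # attribute preservation requested, make a preliminary pass over the
    # source listing to verify that the recorded uid/gid/mode values are
    # usable on this machine before any data is copied.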
if (base_src_url.IsCloudUrl() and base_dst_url.IsFileUrl() and
self.preserve_posix):
self.sorted_src_urls_it = PluralityCheckableIterator(
iter(self.sorted_list_src_file))
self._ValidateObjectAccess()
# Reset our file pointers to the beginning.
self.sorted_list_src_file.seek(0)
# Wrap iterators in PluralityCheckableIterator so we can check emptiness.
self.sorted_src_urls_it = PluralityCheckableIterator(
iter(self.sorted_list_src_file))
self.sorted_dst_urls_it = PluralityCheckableIterator(
iter(self.sorted_list_dst_file))
def _ValidateObjectAccess(self):
"""Validates that the user won't lose access to the files if copied.
Iterates over the src file list to check if access will be maintained. If at
any point we would orphan a file, a list of errors is compiled and logged
with an exception raised to the user.
"""
errors = collections.deque()
for src_url in self.sorted_src_urls_it:
(src_url_str, _, _, _, _, src_mode, src_uid, src_gid,
_, _) = (self._ParseTmpFileLine(src_url))
valid, err = ValidateFilePermissionAccess(src_url_str, uid=src_uid,
gid=src_gid, mode=src_mode)
if not valid:
errors.append(err)
if errors:
for err in errors:
self.logger.critical(err)
raise CommandException('This sync will orphan file(s), please fix their '
'permissions before trying again.')
def _ParseTmpFileLine(self, line):
"""Parses output from _BuildTmpOutputLine.
Parses into tuple:
(URL, size, time_created, atime, mtime, mode, uid, gid, crc32c, md5)
where crc32c and/or md5 can be _NA and atime/mtime/time_created can be
NA_TIME.
Args:
line: The line to parse.
Returns:
Parsed tuple: (url, size, time_created, atime, mtime, mode, uid, gid,
crc32c, md5)
"""
(encoded_url, size, time_created, atime, mtime, mode, uid, gid, crc32c,
md5) = line.split()
return (_DecodeUrl(encoded_url), int(size), long(time_created), long(atime),
long(mtime), int(mode), int(uid), int(gid), crc32c,
md5.strip())