# duplicate_finder.py
import hashlib, os, pprint, sys, time
# Running totals reported by the 'status' command and again at program exit:
# number of files removed so far, and the bytes reclaimed by removing them.
deleted, bytes_deleted = 0, 0
# Number of duplicate groups found by main(); stays None until a scan finishes.
duplicates = None
def bytes_to_megas(byte_count):
    """Render a byte count as megabytes with two decimals, e.g. '1.50 Mb'.

    One megabyte is taken to be 1024 * 1024 bytes.
    """
    megabytes = byte_count / 1024 / 1024
    return format(megabytes, ".2f") + " Mb"
def print_status():
    """Print how many duplicates were deleted and how much space that freed.

    Only reads the module-level counters ``deleted``, ``bytes_deleted`` and
    ``duplicates``; nothing is modified.
    """
    removed = str(deleted)
    total = str(duplicates) + "."
    print("You deleted", removed, "duplicates out of", total)

    recovered = bytes_to_megas(bytes_deleted)
    print("You recovered " + recovered)
def print_separator():
    """Print an 80-character dashed rule followed by a blank line."""
    rule = "-" * 80
    print(rule + "\n")
def print_iter_help():
    """Print the list of commands accepted at the per-duplicate prompt.

    The command list is framed by a separator above and below.
    """
    help_lines = (
        " del <#,#,#...> - deletes the file associated with the number",
        " you entered. To delete file 2, enter 'del 2'",
        " (without quotes). To delete files 2 and 5,",
        " enter 'del 2,5' (without quotes).",
        " help - prints these options ",
        " quit - exits the program.",
        " skip - skips the current duplicate.",
        " status - prints the number of duplicates removed so far",
    )

    print("ITER COMMANDS")
    print_separator()
    for line in help_lines:
        print(line)
    print_separator()
def get_time_string(start, end):
    """Format the time elapsed between two timestamps as H:MM:SS.

    Both arguments are times in seconds (for example values returned by
    time.time()); fractional parts are truncated before subtracting.

    Minutes and seconds are always zero-padded to two digits. Hours are not
    padded and are not capped, so long runs read e.g. '125:03:09'.

    Returns:
        A string such as '0:00:05', '0:01:00' or '1:01:01'.
    """
    # Clamp so a clock adjustment (end < start) cannot yield a negative field.
    elapsed = max(0, int(end) - int(start))

    # Split total seconds into h/m/s. divmod keeps the quotient and remainder
    # from the same division, so hours are derived from the full minute
    # count rather than from minutes already reduced modulo 60.
    minutes, seconds = divmod(elapsed, 60)
    hours, minutes = divmod(minutes, 60)

    return "{0}:{1:02d}:{2:02d}".format(hours, minutes, seconds)
def handle_item(item):
    """Show one group of likely duplicates and run the command prompt for it.

    Args:
        item: A (key, file_tuples) pair as popped from the results dict.
            file_tuples is a list of (file name, formatted size) tuples.
            The key itself is not used here.

    Commands read from the prompt (case-insensitive):
        help / status  - print information, then prompt again.
        skip           - leave this group untouched and move on.
        del <#,#,...>  - delete the listed files, then move on.
        quit           - stop iterating.
    Anything else is ignored and the prompt is shown again.

    Returns:
        0 when the user enters 'quit', otherwise None.

    Side effects:
        Deletes files from disk and updates the module-level counters
        ``deleted`` and ``bytes_deleted``.
    """
    global bytes_deleted, deleted

    file_tuples = item[1]

    # Show Output
    for i, entry in enumerate(file_tuples):
        print(" ", str(i), "- Name: ", entry[0])
        print(" ", " " * len(str(i)), " Size: ", entry[1], "\n")

    # Highest valid index in this group.
    max_index = len(file_tuples) - 1

    # Get Input
    while True:
        response = input("Option:").strip().lower()

        if response == "help":
            print_iter_help()
        elif response == "skip":
            return None
        elif response == "status":
            print_status()
        elif response == "quit":
            return 0
        elif response.startswith("del"):
            parts = response.split()
            if len(parts) < 2:
                print("No input. Expected file number. Try 'help' for commands list.")
                continue
            if parts[0] != "del":
                # Something like 'delete 2' is not a known command.
                continue

            # Re-join the arguments so 'del 2, 5' is accepted like 'del 2,5'.
            tokens = [tok for tok in "".join(parts[1:]).split(",") if tok]
            if not tokens:
                print("No input. Expected file number. Try 'help' for commands list.")
                continue
            try:
                indices = [int(tok) for tok in tokens]
            except ValueError:
                # Re-prompt instead of crashing on input such as 'del a'.
                print("Invalid file number. Try 'help' for commands list.")
                continue

            for index in indices:
                # Reject negatives explicitly: Python would otherwise treat
                # them as counting from the end and delete the wrong file.
                if index < 0 or index > max_index:
                    print("\nSelected file index is out of range.")
                    print("Was '" + str(index) + "' max allowed value is '" + str(max_index) + "'.\n")
                    continue

                path = file_tuples[index][0]
                try:
                    # Size must be read before the file is removed.
                    to_delete_size = os.path.getsize(path)
                    os.remove(path)
                except OSError as err:
                    # Already gone, permission denied, index listed twice, ...
                    print("\nCould not delete file", str(index) + ":", err, "\n")
                    continue

                print("\nFile", str(index), "deleted.\n\n")
                # Increment 'status' command values
                deleted += 1
                bytes_deleted += to_delete_size

            # A processed 'del' command finishes this group.
            return None
def iterate(dictionary, first_time_here):
    """Hand each entry of the dict to handle_item() until done or told to quit.

    Entries are popped from ``dictionary``, so it is consumed as the user
    works through it. Iteration stops when the dict is empty or when
    handle_item() returns 0 (the user entered 'quit').

    Args:
        dictionary: Mapping of key -> list of (file name, size) tuples.
        first_time_here: When true, the command help is printed before the
            first entry is shown.
    """
    # Nothing to review: return instead of calling popitem() on an empty
    # dict, which raises KeyError.
    if not dictionary:
        return

    # Print instructions before the first entry.
    if first_time_here:
        print_iter_help()

    # A plain loop rather than one recursive call per entry, so a large
    # result set cannot exceed the interpreter's recursion limit.
    while dictionary:
        if handle_item(dictionary.popitem()) == 0:
            return
def main(directory, bytes_to_read, files_per_status, ignored_extensions):
    """Scan a directory tree for likely duplicate files and review them.

    Every file under ``directory`` whose extension is not ignored is keyed
    on (MD5 of its first ``bytes_to_read`` bytes, file size). Files that
    share a key are grouped together; groups with a single file are dropped
    and the remaining groups are handed to iterate() for interactive review.

    Only a prefix of each file is hashed, so a group is a *likely* duplicate
    set, not a guaranteed one.

    Args:
        directory: Root of the tree to walk.
        bytes_to_read: Number of leading bytes of each file to hash.
        files_per_status: Print a progress report every this many files;
            0 (or None) disables progress reports.
        ignored_extensions: Extensions (without the dot, compared
            case-insensitively) of files to leave out of the scan.

    Side effects:
        Sets the module-level ``duplicates`` to the number of groups found.
    """
    global duplicates

    print("Data Dir:", directory)
    print("Bytes to Read", bytes_to_read)
    print("Report status every", files_per_status, "files.")
    print("Ignoring files with the following extensions:\n - ", ignored_extensions)
    print("")

    # Normalise once so the per-file test is a case-insensitive set lookup.
    ignored = set(ext.lower().lstrip(".") for ext in ignored_extensions)

    results = {}  # (md5 signature, size in bytes) -> list of (name, size text)
    files_read = 0
    start_time = time.time()

    # Roll over all of the files, hash them and dump them into a dict
    for path, _dirs, files in os.walk(directory):
        for filename in files:
            # Take the extension from the file name only; dots elsewhere in
            # the path (a leading './' or a dotted directory name) must not
            # count. Files without any extension are scanned.
            _stem, dot, ext = filename.rpartition(".")
            extension = ext.lower() if dot else ""
            if extension in ignored:
                continue

            fullname = os.path.join(path, filename)
            try:
                size = os.path.getsize(fullname)
                with open(fullname, 'rb') as f:
                    digest = hashlib.md5(f.read(bytes_to_read)).hexdigest()
            except OSError as err:
                # Broken symlink, permission denied, file vanished mid-scan:
                # report it and keep scanning instead of aborting the run.
                print("Skipping unreadable file:", fullname, "-", err)
                continue

            # Files of different sizes can never be identical, so the size is
            # part of the key alongside the prefix hash.
            filelist = results.setdefault((digest, size), [])
            filelist.append((fullname, bytes_to_megas(size)))

            files_read += 1
            # Guard against files_per_status == 0 (modulo by zero).
            if files_per_status and files_read % files_per_status == 0:
                print("Files read: ", files_read)
                print("Time elapsed:", get_time_string(start_time, time.time()))
                print_separator()

    # Remove the Files that DON'T have duplicates.
    print("Removing non-duplicates from results.")
    results = dict((key, group) for key, group in results.items() if len(group) >= 2)
    print("Done removing non-duplicates.")
    print_separator()

    # Print out Overall Stats
    duplicates = len(results)
    print("Processing complete.")
    print("Checked ", files_read, "files in ", get_time_string(start_time, time.time()), ",")
    print("Found ", str(duplicates), "files which likely have duplicates.")
    print_separator()

    # BEGIN USER INPUT LOOP
    iterate(results, True)
if __name__ == "__main__":
    # Scan the directory given as the first command-line argument, or the
    # current directory when no argument is supplied.
    directory = sys.argv[1] if len(sys.argv) > 1 else "."
    ignored = ['png', 'jpg', 'jpeg', 'gif', 'mp3', 'css', 'js', 'txt']
    # Hash the first 2048 bytes of each file and
    # print process time/stats every 1000 files.
    main(directory, 2048, 1000, ignored)
    # Final summary of what was deleted during the session.
    print_status()