#!/usr/bin/bash
#set -x
function Usage(){
cat << the_end
Duplicate file finder - Finds duplicate files by comparing file contents and
writes the results to spreadsheet- and scripting-friendly CSV files.
USAGE: $(basename $0) -f 'filter' -s 'source dir' ... -s 'source dir'
-f Optional case-insensitive filter; filters the file list by part of the file name.
The filter does not match on directory names.
Example: filtering by 'rar' matches names containing the word 'LIBRARY' as well
as names with the suffix '.rar'.
-s Source directory. One or more directories can be checked; each must be
preceded by -s.
OPTIONAL maximum file size. Default is 20 GiB.
-k or -K kilobytes (KiB) -m or -M megabytes (MiB) -g or -G gigabytes (GiB)
OUTPUT
Output directory is 'duplicate_chk_<date-time>' containing CSV files suitable for
spreadsheets and scripting. All CSV files have a checksum column.
1. all_files_<date-time>.csv - lists all files, fully pathed.
2. unique_files_<date-time>.csv - lists all unique files.
3. duplicate_files1_<date-time>.csv - one duplicate file per row.
Each row: |checksum|File, fully pathed|Parent Directory|File Size (KiB)|
4. duplicate_files2_<date-time>.csv - one unique file checksum per row.
Each row: |checksum|File Size (KiB)|File, fully pathed|Parent Directory|..
Each unique checksum row lists all files in ..|File|Parent Dir|.. pairs.
5. log_<date-time>.txt - a basic log
NOTES
1. <date-time> is the local date and time when run.
2. Temp files are created in the current directory $PWD
3. All temp files are moved to the output directory upon completion.
4. Checksum = SHA256
5. 'filter' and 'source directory' require single or double quotes when they
contain spaces.
6. If -f is given more than once, the last instance is used.
7. If a maximum size is given more than once, the last instance is used.
MORE : https://github.com/Jim-JMCD/Duplicate-File-Finder/blob/main/README.md
Author: Jim McDonald, Australia.
the_end
exit 1
}
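# A hypothetical example run (paths are illustrative only):
#   ./duplicate_FF -f 'rar' -s '/c/Users/me/Downloads' -s '/d/backup' -G 5
# compares files up to 5 GiB whose names contain 'rar' (case insensitive)
# across both directories.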
#_____________________________________________________
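# Run cleanup on every EXIT so results and the log are gathered into the
# output directory even if the script stops early.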
trap cleanup EXIT
cleanup () {
mkdir "$out_dir" 2>/dev/null
mv "$all_files" "$out_dir" 2>/dev/null
mv "$unique_files" "$out_dir" 2>/dev/null
mv "$dup_files_1" "$out_dir" 2>/dev/null
mv "$dup_files_2" "$out_dir" 2>/dev/null
mv "$log" "$out_dir" 2>/dev/null
echo "Output:" >> "${out_dir}/log_${now}.txt"
echo "$(ls -al "${out_dir}"/*) " >> "${out_dir}/log_${now}.txt"
echo "=== END : $(date) ========" >> "${out_dir}/log_${now}.txt"
rm "$cksum_list" 2>/dev/null
rm "$source_list" 2>/dev/null
rm "$duplicate_tmp" 2>/dev/null
}
#_____________________________________________________
function Is_int(){
if ! [[ $1 =~ ^[0-9]+$ ]] ; then
echo " SIZE requires a positive whole number"
exit 1
fi
}
#_____________________________________________________
function cksum_it(){
[ -z "$1" ] && return
sha256sum "$1" | sed 's/*//g'
# Git Bash prints "<hash> *<file>"; strip the '*' binary-mode marker
# (note: this also removes any '*' appearing in file names)
}
export -f cksum_it
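# Sample output (hypothetical file; hash shown is illustrative only):
#   cksum_it '/tmp/example file.txt'
#   -> e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 /tmp/example file.txt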
#_____________________________________________________
function make_file_list(){
# Make csv file of all filtered files: column1 = checksum , column2 = fully pathed file name
source_dirs=$1
while read -r source_dir ; do
cd "$source_dir" || exit 1
find ~+ -size -${max_size}c -type f -iname "*${filter}*" | grep -v '$RECYCLE.BIN'| \
awk '{ printf "\"%s\"\n",$0 }' | xargs -n1 bash -c 'cksum_it "$1"' - |\
awk '{ printf "%s,",$1
for(i=2;i<=NF;i++)
if(i==2)
printf "\"%s",$i
else
printf " %s",$i
printf "\"\n"}'
done < "$source_dirs"
}
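# Each emitted CSV line pairs the checksum with the quoted, fully pathed
# file name, e.g. (hypothetical):
#   <sha256>,"/d/backup/My Documents/report.pdf"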
######################################################
now=$(date +%y%m%d-%H%M)
filter=""
source_list=$PWD/duptmp_$now
# Get options and check validity
opt_count=$#
[ $opt_count -eq 0 ] && echo "FAILED. For more info run: $(basename $0) -h" && exit 1
opt_pairs=0
max_size=21474836480 # 20G in bytes
while getopts ":f:s:K:k:M:m:G:g:" opt; do
opt_pairs=$(($opt_pairs + 1))
case $opt in
f) filter=${OPTARG} ;;
s) echo "$OPTARG" >> "$source_list" ;;
# +1 because find's '-size -Nc' matches strictly less than N bytes,
# which makes the stated maximum inclusive
k|K) Is_int "$OPTARG" && max_size=$(( (1024 * OPTARG) + 1 )) ;;
m|M) Is_int "$OPTARG" && max_size=$(( (1024 * 1024 * OPTARG) + 1 )) ;;
g|G) Is_int "$OPTARG" && max_size=$(( (1024 * 1024 * 1024 * OPTARG) + 1 )) ;;
:) echo "Option -${OPTARG} requires an argument." ; exit 1 ;;
*) Usage ;;
esac
done
# Check for option arguments that have no preceding -x flag:
# every option here takes an argument, so getopts should consume them in pairs
[ $(($opt_count % 2)) == 1 ] && Usage
[ $(($opt_count / 2)) != $opt_pairs ] && Usage
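# e.g. '-s /data stray' (3 args) fails the odd-count test, and
# '-s /data stray1 stray2' (4 args but getopts parsed only 1 pair)
# fails the pair-count test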
# Test that at least one source dir was provided
[ ! -s "$source_list" ] && Usage
# Check all source dirs are valid.
while read -r dir ; do
if [ ! -d "${dir}" ] ; then
echo "Source directory '${dir}' does not exist"
exit 1
elif [ -z "$(ls -A "${dir}")" ]; then
echo "Source directory '${dir}' is empty"
exit 1
fi
done < "$source_list"
# All input checks are complete
#_____________________________________________________
# The runtime $PWD changes as we cd into source dirs, so set all paths now
all_files=$PWD/all_files_$now.csv
unique_files=$PWD/unique_files_$now.csv
dup_files_1=$PWD/duplicate_files1_$now.csv
dup_files_2=$PWD/duplicate_files2_$now.csv
duplicate_tmp=$PWD/duplicate_tmp_$now.csv
cksum_list=$PWD/temp_file_$now
out_dir=$PWD/duplicate_chk_$now
log=$PWD/log_$now.txt
echo "=== START : $(date) =========" > "$log"
make_file_list "$source_list" >> "$all_files"
if [ ! -s "$all_files" ] || [ $(wc -l < "$all_files") -lt 2 ] ; then
echo "Insufficient files found (require a minimum of 2) - all filtered out or source empty" >> "$log"
echo "Insufficient files found (require a minimum of 2) - all filtered out or source empty"
rm "$all_files" 2>/dev/null
exit
fi
# We now have the list of all files; split it into
# duplicate and unique file lists
awk 'BEGIN { FS="," } { print $1 }' "$all_files" | sort -u > "$cksum_list"
while read -r file_cksum ; do
if [ $(grep -c "$file_cksum" "$all_files") -gt 1 ] ; then
grep "$file_cksum" "$all_files" >> "$duplicate_tmp"
else
grep "$file_cksum" "$all_files" >> "$unique_files"
fi
done < "$cksum_list"
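# Note: the grep-per-checksum loop above rescans the whole list for every
# checksum; an untested single-pass awk alternative (sketch, output names
# are placeholders) would be:
#   awk -F, 'NR==FNR { n[$1]++; next }
#            n[$1] > 1 { print > "dups.csv"; next }
#                      { print > "uniq.csv" }' "$all_files" "$all_files"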
if [ ! -f "$duplicate_tmp" ]; then
echo "No duplicate files found" > $dup_files_1
echo "No duplicate files found" > $dup_files_2
echo "No duplicate files found" >> $log
exit 0
fi
## Make a spreadsheet-friendly file listing all the duplicate files - one file per row
echo "SHA256,File,Directory,\"Size (KB)\"" > $dup_files_1
while IFS=, read -r ck_sum file_name; do
fname=$(sed -e 's/^"//' -e 's/"$//' <<<"${file_name}")
file_size=$(du -b "$fname" 2>> $log | awk '{print $1}')
dir_name_size=$(dirname "${file_name}" | awk -v size=$file_size '{printf "%s\",%.1f\n",$0,size/1024}')
echo "${ck_sum},${file_name},${dir_name_size}" >> $dup_files_1
done < $duplicate_tmp
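# Resulting rows look like (hypothetical):
#   <sha256>,"/d/backup/My Documents/report.pdf","/d/backup/My Documents",12.5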
## Make a spreadsheet-friendly file listing all the duplicate files - one checksum per row
awk 'BEGIN { FS="," } { print $1 }' "$dup_files_1" | grep -v SHA256 | sort -u > "$cksum_list"
echo "SHA256,\"Size (KB)\",File,Directory,File,Directory,File,Directory,File,Directory" > "$dup_files_2"
while read -r ck_sum ; do
size=$(grep "$ck_sum" "$dup_files_1" | awk -F"," '{ print $4 }' | sort -u)
files_dirs=$(grep "$ck_sum" "$dup_files_1" | awk -F"," '{ printf ",%s,%s",$2,$3 }')
echo "${ck_sum},${size}${files_dirs}" >> "$dup_files_2"
done < "$cksum_list"
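# Resulting rows look like (hypothetical), one checksum with all of its copies:
#   <sha256>,12.5,"/d/backup/report.pdf","/d/backup","/c/docs/report.pdf","/c/docs"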
## All done