In [None]:
from file_functions import read_list, save_list, count_urls
from data_cleansing_functions import remove_url_duplicates, remove_all_but_twitter_urls
from web_scraping_functions import query_vehicle_list, twitter_scrape, oryx_scrape, warspotting_scrape

In [None]:
# Twitter scrape: run the scraper once per vehicle type and add up how many
# new URLs it reports.
print('Scraping Twitter:')
counter = 0
for vehicle_type, _ in query_vehicle_list:
    new_item_amount, total_amount = twitter_scrape(vehicle_type)
    # Debug helpers, kept disabled:
    # print(f'Added {new_item_amount} media URLs of {vehicle_type}s to the list. Total number: {total_amount}')
    # remove_url_duplicates(vehicle_type)
    counter = counter + new_item_amount

print(f'\n\tTotal amount of URLs added: {counter}')

In [None]:
# Oryx scrape: run the scraper once per vehicle type. The counts it returns
# are only needed by the (disabled) debug print below.
print('Scraping Oryx:')
for vehicle_type, _ in query_vehicle_list:
    new_item_amount, total_amount = oryx_scrape(vehicle_type)
    # Debug helpers, kept disabled:
    # print(f'Added {new_item_amount} media URLs of {vehicle_type}s to the list. Total number: {total_amount}')
    # remove_url_duplicates(vehicle_type)

# oryx_scrape('T-62')

In [None]:
# Warspotting scrape (async): every vehicle type is scraped in its own worker process.
import concurrent.futures

# Custom list for testing purposes.
# NOTE: this rebinds the imported query_vehicle_list, so every later cell that
# loops over it only sees these four vehicles.
query_vehicle_list = [['T-90',1],['M113',2],['2S1',1],['2S3',2]]

processes = []

print('Scraping Warspotting:')
with concurrent.futures.ProcessPoolExecutor() as executor:
    if __name__ == "__main__":
        # Submit every job first so the scrapes overlap ...
        processes.extend(
            [executor.submit(warspotting_scrape, vehicle_type), vehicle_type]
            for vehicle_type, _ in query_vehicle_list
        )
        # ... then collect the results in submission order.
        for process, vehicle_type in processes:
            new_item_amount, total_amount = process.result()
            print(f'Added {new_item_amount} media URLs of {vehicle_type}s to the list. Total number: {total_amount}')

# Warspotting Scrape (sync)
# warspotting_scrape('T-62')

In [None]:
# Drop duplicated URLs from the stored list of every vehicle type.
for vehicle_type, _ in query_vehicle_list:
    remove_url_duplicates(vehicle_type)
    # Optional extra passes, kept disabled:
    #count_urls(vehicle_type)
    #remove_all_but_twitter_urls(vehicle_type)

In [None]:
import pandas as pd

# Load the stored T-90 records into a table, then print how many URLs each
# source contributed followed by the non-null count of every column.
url_list = read_list('T-90')
df = pd.DataFrame(data=url_list, columns=['URL', 'Status', 'Media Type', 'Source'])
print(df['Source'].value_counts(), '===', df.count(), sep='\n')


In [None]:
import requests
import os

# Downloads the image behind a URL record and names the file after the URL itself,
# so an image that is already on disk is never downloaded twice.
def download_img(url, vehicle_type, timeout=30):
    """Download one image record into ``./dataset/<vehicle_type>/<status>/<source>/``.

    Args:
        url: A record ``[url, status, media_type, source]``, or ``None``.
        vehicle_type: Vehicle name, used as the first folder level below ``./dataset/``.
        timeout: Seconds to wait for the server. Without a timeout a stalled
            connection would block the notebook forever.

    Returns:
        ``'Video'`` for video records, ``'Downloaded'`` when a new file was
        written, ``'Exists'`` when the target file is already on disk,
        ``'Failed'`` when the request raised or the server answered with an
        error status, and ``None`` for a ``None`` record or any other media
        type (e.g. 'website', 'unknown').
    """
    if url is None:
        return None

    # url received = [url, status, media_type, source]
    link, status, media_type, source = url
    if media_type == 'video':
        return 'Video'
    if media_type != 'image':
        # 'website', 'unknown', ...: nothing to download.
        return None

    url_parts = link.split('/')
    if source == 'twitter':
        file_name = url_parts[-1]
    else:
        # For other sources the last URL segment alone is not unique, so the
        # two last segments are concatenated.
        file_name = url_parts[-2] + url_parts[-1]

    full_path = os.path.join('./dataset/', vehicle_type, status, source, file_name)
    if os.path.exists(full_path):
        # Picture already exists
        return 'Exists'

    try:
        r = requests.get(link, timeout=timeout)
    except requests.RequestException:
        # One dead link must not abort the whole download run.
        return 'Failed'
    if not r.ok:
        # Do not store a 4xx/5xx error page as if it were the picture.
        return 'Failed'

    os.makedirs(os.path.dirname(full_path), exist_ok=True)  # Creates the folder if it doesn't exist yet
    with open(full_path, 'wb') as f:
        f.write(r.content)
    return 'Downloaded'


In [None]:
# Download the pictures of a hand-picked set of vehicles and print a
# per-vehicle summary of what happened to each stored URL.
query_vehicle_list2 = [
    ['M113',1],
    ['T-64',2]
]
for item in query_vehicle_list2:
    counter_downloaded = 0
    counter_pic_exists = 0
    counter_tw_videos = 0
    counter_skipped = 0

    vehicle_type, _ = item
    remove_url_duplicates(vehicle_type)
    url_list = read_list(vehicle_type)
    print('+ Out of', len(url_list), vehicle_type, 'URLs:')
    for url in url_list:
        outcome = download_img(url, vehicle_type)
        if outcome == 'Downloaded': # Downloaded successfully
            counter_downloaded += 1
        elif outcome == 'Exists':   # Picture already exists
            counter_pic_exists += 1
        elif outcome == 'Video':    # Link belongs to a video
            counter_tw_videos += 1
        else:
            # download_img returns None for website/unknown records; those
            # used to be counted as videos by mistake.
            counter_skipped += 1

    print('   -', counter_downloaded, 'new pictures were downloaded')
    print('   -', counter_pic_exists, 'pictures already existed')
    print('   -', counter_tw_videos, 'URLs belonged to Twitter videos')
    print('   -', counter_skipped, 'URLs were skipped (not an image or video, or the download failed)')

#download_img('https://twitter.com/UAWeapons/status/1564984095591088129','2S3')

In [None]:
from PIL import Image
from pathlib import Path
import imagehash
import os
import numpy as np
IMAGE_EXTENSIONS = ['.jpg','.jpeg','.bmp','.png', '.gif', '.tiff']

def find_duplicates(folder_path,delete_duplicates,verbose=False):
    """Find, and optionally delete, images in a folder that share an average hash.

    Only the entries directly inside ``folder_path`` are inspected. A file is
    considered when its name contains one of ``IMAGE_EXTENSIONS`` and it is a
    non-empty regular file. The first file seen with a given hash is kept and
    every later file with the same hash is reported as a duplicate.

    Args:
        folder_path: Folder to scan, with or without a trailing slash.
        delete_duplicates: When true, the duplicates are removed from disk.
        verbose: When true, print a report of what was found.

    Returns:
        The number of duplicates found.
    """
    hash_size = 8
    hashes = {}       # average hash -> name of the first file seen with it
    duplicates = []
    unreadable = []

    # os.listdir returns names in arbitrary order; sorting makes the choice of
    # which copy survives reproducible between runs.
    for image in sorted(os.listdir(folder_path)):
        if not any(x in image for x in IMAGE_EXTENSIONS):
            continue
        image_file = os.path.join(folder_path, image)
        # Skip sub-folders and empty files, which cannot be opened as images.
        if not os.path.isfile(image_file) or os.path.getsize(image_file) == 0:
            continue
        try:
            with Image.open(image_file) as img:
                temp_hash = imagehash.average_hash(img, hash_size)
        except OSError:
            # Corrupt or truncated file: leave it alone instead of aborting the scan.
            unreadable.append(image)
            continue
        if temp_hash in hashes:
            duplicates.append(image)
        else:
            hashes[temp_hash] = image

    # normpath strips a trailing slash, which would otherwise leave the name empty.
    vehicle_name = os.path.basename(os.path.normpath(folder_path))
    if verbose:
        print('\t',vehicle_name,'- Duplicates:')
        for image in unreadable:
            print('\t\tSkipped unreadable file:', image)

    if duplicates:
        if delete_duplicates:
            for img in duplicates:
                os.remove(os.path.join(folder_path,img))
            if verbose:
                print('\t\t',len(duplicates),'deleted')
        elif verbose:
            for img in duplicates:
                print('\t\t',img)
    elif verbose:
        print('\t\tNo duplicates found')

    return len(duplicates)

def find_similar(image_path,folder_path,delete_duplicates,similarity,verbose=False):
    """Find, and optionally delete, images in a folder that closely resemble a reference image.

    Two images count as similar when their 8x8 average hashes differ in at
    most ``int((1 - similarity/100) * 64)`` bits. The folder entry that has
    the same file name as the reference image is never reported.

    Args:
        image_path: Path of the reference image.
        folder_path: Folder whose direct entries are compared to the reference.
        delete_duplicates: When true, the similar images are removed from disk.
        similarity: Required similarity as a percentage (e.g. 97).
        verbose: When true, print a report of what was found.

    Returns:
        The number of similar images found. 0 when the reference image is
        empty or cannot be read.
    """
    hash_size = 8
    threshold = 1 - similarity/100
    diff_limit = int(threshold*(hash_size**2))

    # basename copes with both '/' and the OS-native separator; splitting on
    # '/' alone could fail to recognise the reference image inside the folder
    # and report (and delete) it as its own duplicate.
    image_name = os.path.basename(image_path)
    duplicates = []

    # Hash the reference once. An empty or unreadable reference leaves hash1
    # as None and simply has nothing to be compared against.
    hash1 = None
    if os.path.getsize(image_path) > 0:
        try:
            with Image.open(image_path) as img:
                hash1 = imagehash.average_hash(img, hash_size).hash
        except OSError:
            hash1 = None

    if hash1 is not None:
        for image in sorted(os.listdir(folder_path)):
            # Makes sure the original picture is not added to the list of duplicates
            if image == image_name:
                continue
            if not any(x in image for x in IMAGE_EXTENSIONS):
                continue
            candidate = os.path.join(folder_path, image)
            if not os.path.isfile(candidate) or os.path.getsize(candidate) == 0:
                continue
            try:
                with Image.open(candidate) as img:
                    hash2 = imagehash.average_hash(img, hash_size).hash
            except OSError:
                # Corrupt or truncated file: skip it instead of aborting the scan.
                continue
            if np.count_nonzero(hash1 != hash2) <= diff_limit:
                duplicates.append(image)

    if verbose:
        print('\tOriginal:')
        print('\t\t',image_name)
        print('\tDuplicates:')

    if duplicates:
        if delete_duplicates:
            for img in duplicates:
                os.remove(os.path.join(folder_path,img))
            if verbose:
                print('\t\t',len(duplicates))
        elif verbose:
            for img in duplicates:
                print('\t\t',img)
    elif verbose:
        print('\t\tNo close duplicates found')

    return len(duplicates)
    
#find_similar('./dd/2S1 - Copy/R0r6NcvQ5577.png','./dd/2S1 - Copy/',False,97,True)

In [None]:
#find_duplicates('./dd/orig1/',True)

#for item in query_vehicle_list:

# Remove the near-duplicates (97% similar) of every picture in a scratch folder.
folder_path = './dd/c3/'
fnames = os.listdir(folder_path)
for image in fnames:
    # Only pass image files on: find_similar opens the reference with PIL,
    # which raises on any other kind of file.
    if not any(x in image for x in IMAGE_EXTENSIONS):
        continue
    image_path = os.path.join(folder_path,image)
    if Path(image_path).is_file(): # Checks if the file exists, since it could've been deleted inside the function
        find_similar(image_path,folder_path,True,97)

In [None]:
# Delete exact and near (97% similar) duplicate pictures of every vehicle type.
for item in query_vehicle_list:
    vehicle_type, _ = item
    amount_duplicates = 0
    total_close_duplicates = 0

    # download_img stores pictures in ./dataset/<vehicle>/<status>/<source>/,
    # so listing only ./dataset/<vehicle> never sees an image. Walk the whole
    # tree and clean each folder on its own; duplicates that live in
    # different folders are not compared with each other.
    for dirpath, _subdirs, _files in os.walk('./dataset/' + vehicle_type):
        amount_duplicates += find_duplicates(dirpath, True)

        # Trailing '/' keeps the file name as the last '/'-separated part of image_path.
        folder_path = dirpath + '/'
        fnames = os.listdir(folder_path)
        for image in fnames:
            if any(x in image for x in IMAGE_EXTENSIONS):
                image_path = os.path.join(folder_path,image)
                if Path(image_path).is_file(): # Checks if the file exists, since it could've been deleted inside the function
                    amount_close_duplicates = find_similar(image_path,folder_path,True,97)
                    total_close_duplicates += amount_close_duplicates
    print(vehicle_type, '-', amount_duplicates, 'duplicates and', total_close_duplicates, 'close duplicates deleted.')

