# Initializing Scraping

##Mounting Google Drive



In [None]:
# Mount Google Drive at /content/drive; the finished archive is copied
# to "/content/drive/My Drive/" at the end of the notebook
from google.colab import drive
drive.mount('/content/drive')

##Including necessary packages

In [None]:
# Packages for Fetching and parsing web data
from bs4 import BeautifulSoup as bs
import requests

# Packages for managing files
import os
import shutil
import json

# Packages for time management
from datetime import datetime as dati
from pytz import timezone    
import time

# Packages for multiprocessing
from multiprocessing.dummy import Pool
from tqdm.notebook import tqdm

# Packages for image handling
import io
from PIL import Image
from PIL import ImageFile

# Package for sorting list
from operator import itemgetter

##Initiating Necessary variables

In [None]:
# Start of the wall-clock measurement for the whole notebook run
execution_time_full = time.time()

# Folder names: the root folder carries the start time (Asia/Kolkata) so
# every run writes into its own "Docs <timestamp>/" tree
Today_now = dati.now(timezone('Asia/Kolkata')).strftime('%Y-%m-%d_%H-%M-%S')
Root_folder_Today = "Docs " + Today_now + '/'
Data_folder_Today = Root_folder_Today + 'Data/'

# One bucket folder per initial letter, plus '1' (titles not starting with
# A-Z) and 'The' (titles starting with "The "), as used by VG_Scrape
alphabetfolderlist = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
alphabetfolderlist.extend(['1', 'The'])
for bucket_name in alphabetfolderlist:
  # exist_ok avoids the check-then-create race and makes re-running the cell safe
  os.makedirs(Data_folder_Today + bucket_name, exist_ok=True)

# Session-based scraper: environment proxies are ignored (trust_env = False)
# and both schemes retry failed connections up to MAX_RETRIES times
MAX_RETRIES = 20
adapter = requests.adapters.HTTPAdapter(max_retries=MAX_RETRIES)

Scraper = requests.Session()
Scraper.trust_env = False

Scraper.mount('https://', adapter)
Scraper.mount('http://', adapter)


# Domain of target site (kept with its trailing slash; relative links are appended to it)
domain_name = 'https://www.creativeuncut.com/'

# Letters a-z followed by A-Z
# NOTE(review): this name shadows the builtin ascii(); it is kept only for
# backward compatibility with cells that may still reference it
ascii = [chr(code) for code in range(97, 123)]
ascii.extend([chr(code) for code in range(65, 91)])

# Let Pillow load images whose data is truncated instead of raising
ImageFile.LOAD_TRUNCATED_IMAGES = True

# Number of worker threads used for scraping
no_of_cores = 2
thread_multiplier = 2
no_of_threads = no_of_cores * thread_multiplier

In [None]:
# Fetch the gallery index page and parse it when the site answers with 200
with Scraper.get("https://www.creativeuncut.com/game-art-galleries.html") as scr_response:
  if scr_response.status_code == 200:
    # Name the parser explicitly: without it BeautifulSoup guesses one
    # depending on what is installed (and warns about it), whereas the
    # per-game pages in VG_Scrape are always parsed with 'html.parser'
    scr_bs = bs(scr_response.content, 'html.parser')
    print('Finished Getting Main Page')
  else:
    # scr_bs is left untouched here, so the following cells cannot run
    # on fresh data until this request succeeds
    print('Main Link is not working')

In [None]:
# Every game on the index page sits in a <div class="ag">; the first two
# matches are plain text blocks rather than games, so they are sliced away
all_VGs_divs = scr_bs.find_all('div', attrs={'class': 'ag'})[2:]

# Number of videogames available
no_of_VGs = len(all_VGs_divs)

print('Number Of VideoGames : ', no_of_VGs)

In [None]:
# Creating one record [id, title, url] per videogame
# id    - 1-based position on the index page, zero-padded to 3 digits
# title - text of the game's div
# url   - absolute url built from the link inside the div

all_VG_urls_and_title = [
  ["{:03d}".format(position), vg_div.text, domain_name + vg_div.a['href']]
  for position, vg_div in enumerate(all_VGs_divs, start=1)
]

# Preview exactly 10 records from each end (the head slice used to show 11)
preview_size = 10
print('Number of VideoGames records = {}\n\nFirst 10 : \n{}\n\nLast 10 :\n{}'.format(
  len(all_VG_urls_and_title),
  '\n'.join([str(i) for i in all_VG_urls_and_title[:preview_size]]),
  '\n'.join([str(i) for i in all_VG_urls_and_title[-preview_size:]])))

In [None]:
# Case-insensitive search for one game in the title and url of every record
check_specific_VG = 'SINoALICE'.lower()
for record in all_VG_urls_and_title:
  searched_fields = (record[1], record[2])
  if any(check_specific_VG in field.lower() for field in searched_fields):
    print('Found in ', record)

In [None]:
# Dump every record to all_posts.txt: the id on its own line, then the
# title and the url each on a tab-indented line
all_posts_text = '\n'.join('\n\t'.join(record) for record in all_VG_urls_and_title)
with open(Root_folder_Today + 'all_posts.txt', 'w', encoding='utf-8') as f1:
  f1.write(all_posts_text)

In [None]:

# Shared progress record, filled by VG_Scrape and read by the final report.
# Slots are addressed by position, so their order must not change.
current_r = [       # Current progress status
                [], # 0 - Pages that failed to download: [title, url]
                [], # 1 - Images that failed to download: [title, game url, image url]
                [], # 2 - Games without a gallery container: [id, images so far, pages, url, title]
                [], # 3 - Every image url that was found
                 0, # 4 - Total image count
                 0, # 5 - Total page count
                 0, # 6 - Finished game count
]

# Headers used for every request.
# The site only serves an image when the GET request carries a "Referer"
# that points to that image's own page.
# Ex :
# IMG URL               : https://www.creativeuncut.com/art/gallery-32/1bh-logo.jpg
# IMG Page URL (Referer): https://www.creativeuncut.com/gallery-32/1bh-logo.html

# Requesting the IMG URL directly returns the image's page (html) instead.
# With "Referer" set to the image's page, the actual image (jpg, png, gif) is returned.
# The 'Referer' below is only a starting value; VG_Scrape replaces it for each image.
Headers = {
            'User-agent' : 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.76 Safari/537.36',
            'Accept' : 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Encoding' : 'gzip,deflate,sdch',
            'Referer' : 'https://www.creativeuncut.com/gallery-32/1bh-logo.html'
          }

#Scraping

In [None]:
# Function to Scrape every video game's images
def VG_Scrape(VG):
  """Download all gallery images of one video game into its own folder.

  The function is mapped over ``all_VG_urls_and_title`` by the thread pool,
  so it only reads the shared ``Headers`` dict and never mutates it.

  Args:
    VG: one record ``[id, title, url]``.

  Returns:
    ``False`` when a gallery page answers with a non-200 status or contains
    neither a ``glry`` nor a ``gbox`` container; ``None`` otherwise (the game
    was finished, or skipped because its ``Details.txt`` already exists).

  Side effects:
    Creates the game's folder, saves the images and ``Details.txt`` into it,
    updates the shared ``current_r`` and ``have`` lists and advances
    ``tqdm_bar`` once per processed game.
  """

  # Recording time took for each Video Game
  vg_scrape_time = time.time()

  vg_id, vg_title, vg_url = VG[0], VG[1], VG[2]

  # Bucket folder: 'The' for titles starting with "The ", the upper-cased
  # first letter for A-Z titles, '1' for everything else.
  # Slicing ([:1]) instead of indexing keeps an empty title from raising.
  first_char = vg_title[:1].upper()
  if first_char in alphabetfolderlist:
    VG_Initial = 'The' if vg_title[:4].upper() == 'THE ' else first_char
  else:
    VG_Initial = '1'

  # Replace every character that Windows forbids in folder names with '_'
  valid_VG_Title_for_windows = vg_title.translate(
    str.maketrans({char: '_' for char in '?:/\\*"<>|'}))

  VG_folder = Data_folder_Today + VG_Initial + '/' + valid_VG_Title_for_windows + '/'
  os.makedirs(VG_folder, exist_ok=True)

  # Details.txt is written last, so its presence marks a finished game
  if os.path.exists(VG_folder + 'Details.txt'):
    have[0] += 1
    have[1].append(vg_url)
    print('Have Count = ',have[0])

    # Updating progressbar by 1
    tqdm_bar.update(1)
    return

  # Initializing counters
  img_no = 0
  current_page_no = 0
  img_url_list = []

  # Requesting the first gallery page
  Scr_response = Scraper.get(vg_url, headers=Headers)
  if Scr_response.status_code != 200:
    # Error requesting page, so add it to error list and give up on this game
    current_r[0].append([vg_title, vg_url])
    tqdm_bar.update(1)
    return False

  # Parsing the page content
  Scr_bs = bs(Scr_response.content, 'html.parser')

  # Links to the following gallery pages; a page without the pagination
  # container is treated as a single-page gallery instead of crashing
  pages_list = Scr_bs.find('div', attrs={'class': 'r_float'})
  next_pages = [] if pages_list is None else pages_list.find_all('a', attrs={'class': 'gn'})
  no_of_pages = len(next_pages) + 1

  # Traversing through each page
  for _ in range(no_of_pages):

    # Galleries come in two layouts: thumbnails in div.glry > div.th, or in
    # div.gbox > div.gl. When both exist, the gbox thumbnails are used.
    current_page_img_table = Scr_bs.find('div', attrs={'class': 'glry'})
    current_page_img_table_box = Scr_bs.find('div', attrs={'class': 'gbox'})

    # If both divs are not available then add it to "Empty VG" list
    if current_page_img_table is None and current_page_img_table_box is None:
      current_r[2].append([vg_id, img_no, no_of_pages, vg_url, vg_title])
      print('Empty Posts Found')
      tqdm_bar.update(1)
      return False

    all_img_in_current_page = []
    if current_page_img_table is not None:
      all_img_in_current_page = current_page_img_table.find_all('div', attrs={'class': 'th'})
    if current_page_img_table_box is not None:
      all_img_in_current_page = current_page_img_table_box.find_all('div', attrs={'class': 'gl'})

    # Traversing each image
    for each_img in all_img_in_current_page:
      thumb_src = each_img.div.a.img['src']

      # The "next page" arrow is laid out like a thumbnail but is not an image of the game
      if thumb_src in ('imgs/next-page.gif', 'imgs/next-page.jpg', 'imgs/next-page.png'):
        continue

      # The full-size image has the thumbnail's name without the "_s" suffix
      current_img_url = domain_name + thumb_src.replace('_s.jpg','.jpg').replace('_s.png','.png').replace('_s.gif','.gif')

      # Incrementing image counter
      img_no += 1

      # The page that shows this image; the site expects it as Referer
      current_img_page_url = domain_name + each_img.div.a['href']

      # Adding image's url to list
      img_url_list.append(current_img_url)
      current_r[3].append(current_img_url)

      # Per-request copy of the headers: writing the Referer into the shared
      # dict would let another thread overwrite it before this request is sent
      img_headers = dict(Headers, Referer=current_img_page_url)

      # Requesting image
      res = Scraper.get(current_img_url, headers=img_headers)

      # File name: <running image number>.P-<page number>.<original file name>
      VG_img_name = VG_folder + "{:03d}.{}.{}".format(img_no, 'P-{:03d}'.format(current_page_no), current_img_url[current_img_url.rfind('/')+1:])

      # Check if given response is actually an image
      if 'image' not in res.headers.get("content-type", ''):
        # Not an image's response, so adding to error list
        current_r[1].append([vg_title, vg_url, current_img_url])
        print(current_img_url,'Image NOt found for ',VG_img_name,' ',VG[1])
      else:
        # Decode the downloaded bytes with Pillow and save them to file
        image = Image.open(io.BytesIO(res.content))
        image.save(VG_img_name)

    # Incrementing page counter
    current_page_no += 1

    # Fetch the next page while there is one left. The original comparison
    # was inverted and never true, so the first page was parsed again
    # for every following page.
    if current_page_no < no_of_pages:
      next_page_url = domain_name + next_pages[current_page_no - 1]['href']

      # Requesting next page
      Scr_response = Scraper.get(next_page_url, headers=Headers)
      if Scr_response.status_code != 200:
        # Error requesting page, so add it to error list and give up on this game
        current_r[0].append([vg_title, next_page_url])
        tqdm_bar.update(1)
        return False

      # Parsing current response
      Scr_bs = bs(Scr_response.content, 'html.parser')

  # Incrementing page, image and finished-game counters
  # NOTE(review): these read-modify-write updates are not protected by a lock
  # although the function runs in several threads - confirm this is acceptable
  current_r[4] += img_no
  current_r[5] += current_page_no
  current_r[6] += 1

  # Calculating time took for this Video Game
  vg_scrape_time = time.time() - vg_scrape_time

  # Creating a report for Scraping VG
  detail_content_raw = [
    'Title    : {}'.format(vg_title),
    'URL      : {}'.format(vg_url),
    'Post ID  : {}'.format(vg_id),
    'No IMGs  : {}'.format(img_no),
    'No Pages : {}'.format(current_page_no),
    'Time Took: {:.2f} Seconds'.format(vg_scrape_time),
    '\n',
    'IMG URLS :\n{}'.format('\n'.join(img_url_list)),
  ]
  # Convert report from list to string
  detail_content = '\n'.join(detail_content_raw)

  # Writing report to file; utf-8 because titles may contain non-ASCII characters
  with open(VG_folder + 'Details.txt', 'w', encoding='utf-8') as f1:
    f1.write(detail_content)

  # Print the progress line (status also advances the progress bar)
  status(current_r[6], vg_id, img_no, current_page_no, '{:.2f}'.format(vg_scrape_time), vg_title, vg_url)
 
# Function to update status of scraping
def status(a, b, c, d, e, f, g):
  """Print one progress line and advance the progress bar by one step.

  As called from VG_Scrape the arguments are: ``a`` finished-game count,
  ``b`` game id (string), ``c`` image count, ``d`` page count, ``e`` elapsed
  seconds already formatted as a string, ``f`` title and ``g`` url.
  """
  line_template = '{:03d} - ID:{:3s} - IMGs:{:03d} - Dirs:{:03d} - {:6s}Sec - {} - {}'
  fields = (a, b, c, d, e, f, g)
  print(line_template.format(*fields))

  # One finished game means one step on the shared progress bar
  tqdm_bar.update(1)

#### Retry these 2 cells if a connection error occurred

In [None]:
# Thread pool (multiprocessing.dummy.Pool) with no_of_threads workers
pool1 = Pool(no_of_threads)

# Progress bar with one step per game record
# ("tqdm" is short for 'taqaddum', which means 'progress' in Arabic)
tqdm_bar = tqdm(total=len(all_VG_urls_and_title))


In [None]:

# Games skipped because their Details.txt already exists
have = [
  0,   # Already have count
  [],  # Already have post (urls)
]

# Recording Scraping time
Scraping_imgs_time = time.time()

try:
  # Run VG_Scrape for every record on the worker threads
  _ = pool1.map(VG_Scrape, all_VG_urls_and_title)
finally:
  # Always shut the pool down, even when a worker raised (e.g. a connection
  # error): otherwise its threads keep working in the background while the
  # cells are re-run with a fresh pool. join() waits for the tasks that
  # were already queued before the error is re-raised.
  pool1.close()
  pool1.join()

  # Closing progress bar
  tqdm_bar.close()

# Calculating Scraping time
Scraping_imgs_time = time.time() - Scraping_imgs_time

In [None]:

# Summary of the whole scraping run, built from current_r and have
detail_content_raw = [
  'Scraping took     : {:.2f} Seconds ({})'.format(Scraping_imgs_time, time.strftime('%H-%M-%S', time.gmtime(Scraping_imgs_time))),
  'Total IMGs        : {}'.format(current_r[4]),
  'Total Pages       : {}'.format(current_r[5]),
  'Total Posts       : {}'.format(current_r[6]),
  'Empty Posts {:03d}  : {}'.format(len(current_r[2]), current_r[2]),
  'Error Posts {:03d}  : {}'.format(len(current_r[0]), current_r[0]),
  'Error IMG   {:03d}  : {}'.format(len(current_r[1]), current_r[1]),
  'Have Posts  {:03d}  : {}'.format(len(have[1]), have[1]),
]
detail_content = '\n'.join(detail_content_raw)
print(detail_content)

# Writing Report to file. The error lists contain game titles, so the file
# is written as utf-8 (like all_posts.txt) instead of the platform default.
with open(Data_folder_Today + 'Project-Details.txt', 'w', encoding='utf-8') as f1:
  f1.write(detail_content)

##Directory indexing using JSON


In [None]:
# Function to return file size in human understandable format
def file_size(size):
  """Format a byte count with a binary unit, e.g. ``1536`` -> ``'1.50 KB'``.

  Args:
    size: number of bytes (int or float).

  Returns:
    The size with two decimals and the largest unit (B, KB, MB, GB, TB, PB)
    that keeps the number below 1024; sizes beyond the PB range stay in PB.
  """
  power = 2**10
  power_labels = ('B', 'KB', 'MB', 'GB', 'TB', 'PB')
  n = 0
  # ">=" so that exactly 1024 becomes "1.00 KB" instead of "1024.00 B";
  # the bound on n keeps huge values from running past the last label
  while size >= power and n < len(power_labels) - 1:
    size /= power
    n += 1
  return '{:.2f} {}'.format(size, power_labels[n])


# Function to return a directory index in dictionary format
def dirs_to_dict(i='', previous=''):
  """Recursively index the directory ``previous + i + '/'``.

  Args:
    i: name of the directory to index.
    previous: path of its parent, ending with '/' (or '' for the top level).

  Returns:
    A dict keyed by the entries of the global ``keys_list``:
    [0] directory name, [1] directory path, [2] number and [3] size of the
    files directly inside, [4] number of files, [5] number of folders and
    [6] size including all sub-directories, [7] sorted file paths,
    [8] list of the sub-directories' dicts, [9] total size as readable text.

  Side effects:
    Appends ``[path, path-without-slash + ' [ files - size ]']`` to the global
    ``rename_list`` for every path without a '[' in it. The append happens
    after the recursion, so children are listed before their parents.
  """
  # Creating current directory path
  fp = previous + i + '/'

  # A single scan yields both the sub-directories and the files
  _, dir_names, file_names = next(os.walk(fp))
  sorted_dir = sorted(dir_names)
  file_paths = [fp + file_name for file_name in sorted(file_names)]
  current_files_size = sum(os.path.getsize(path) for path in file_paths)

  # Key order matters: the summary cell prints the dict in insertion order.
  # The totals start with this directory's own numbers and grow below.
  temp = {
    keys_list[0]: i,                   # Current Directory Name
    keys_list[1]: fp,                  # Current Directory Path
    keys_list[7]: file_paths,          # Files directly inside, full paths
    keys_list[2]: len(file_paths),     # Number of files directly inside
    keys_list[3]: current_files_size,  # Size of files directly inside
    keys_list[8]: [],                  # Sub-directory indexes
    keys_list[4]: len(file_paths),     # Total files (incl. sub-directories)
    keys_list[5]: 0,                   # Total folders (incl. sub-directories)
    keys_list[6]: current_files_size,  # Total size (incl. sub-directories)
  }

  # Index every sub-directory and fold its totals into this directory's
  for sub_dir in sorted_dir:
    temp_dict = dirs_to_dict(sub_dir, fp)
    temp[keys_list[8]].append(temp_dict)
    temp[keys_list[4]] += temp_dict[keys_list[4]]
    # "+ 1" counts the sub-directory itself
    temp[keys_list[5]] += temp_dict[keys_list[5]] + 1
    temp[keys_list[6]] += temp_dict[keys_list[6]]

  readable_total_size = file_size(temp[keys_list[6]])

  # Paths that already contain '[' are not queued for renaming
  if '[' not in fp:
    rename_list.append([fp, '{} [ {} - {} ]'.format(fp[:-1], temp[keys_list[4]], readable_total_size)])

  # Adding a key that stores file size in readable format
  temp[keys_list[9]] = readable_total_size

  # Returning index dictionary of current directory
  return temp

# Dictionary keys used by dirs_to_dict, packed into one '#'-separated string.
# Entries 0-9 are the keys themselves; the last entry (index 10) is the
# column width used when the summary is printed.
use_short_keys = 0

_short_keys = 'FoN#P#_CFi#_CS #_TFi#_TFo#_TS #__Fi #__Fo #_TSR#4'
_long_keys = 'Folder_Name   #Path          #_Current_Files#_Current_Size #_Total_Files  #_Total_Folders#_Total_Size   #__Files       #__Folders     #_TS_Readable  #14'

# 1 selects the short keys, anything else the longer meaningful ones
keys_list = (_short_keys if use_short_keys == 1 else _long_keys).split('#')


In [None]:
# List of folders to rename, filled by dirs_to_dict while it indexes
# 1st value in each item is the "Original Folder Name" (path ending with '/')
# 2nd value in each item is the "Renamed  Folder Name"
rename_list = []

# Directory to index: the root folder without its trailing '/'
dir_name = Root_folder_Today[:-1]

# Build the directory index as a dictionary (it is written to a JSON file
# later, after the folders have been renamed)
dir_data = dirs_to_dict(dir_name, '')

print('JSON data Created')

##Renaming Folders

  Ex:

    Original Folder name : God of War

    Renamed  Folder name : God of War [ 35 - 3.67 MB ]

In [None]:
# The root folder itself and Jupyter's checkpoint folder keep their names
checkpoints_folder = Root_folder_Today[:-1] + '/.ipynb_checkpoints/'
excluded_rename_list = [Root_folder_Today, checkpoints_folder]
rename_list = [pair for pair in rename_list if pair[0] not in excluded_rename_list]

In [None]:
# Displaying first 10 items (the slice used to show 11)
rename_list[:10]

In [None]:
# Function to rename folders
def rename_folders(option):
  """Apply (``option=1``) or undo (``option=0``) the renames in ``rename_list``.

  Each item of the global ``rename_list`` is ``[original path, renamed path]``
  and both paths are spelled with the ORIGINAL names of the parent folders.
  dirs_to_dict lists children before their parents, so:

  * renaming walks the list front to back (a child is renamed while its
    parents still carry their original names);
  * undoing walks it back to front (parents get their original names back
    first, which makes the recorded child paths valid again). Undoing in
    list order fails on the first item, because that path no longer exists.
  """
  target = int(option)
  source = 1 - target
  ordered_pairs = rename_list if target == 1 else reversed(rename_list)
  for pair in ordered_pairs:
    os.rename(pair[source], pair[target])

# option = 1 : rename every folder in rename_list (original -> renamed)
# option = 0 : undo the rename (renamed -> original)
option = 1
rename_folders(option)
print('Folders renamed successfully')

##Writing JSON file

In [None]:
# Index the tree again so the JSON contains the renamed folder paths;
# rename_list is reset because dirs_to_dict appends to it while indexing
rename_list = []

# Calling indexing function and get the directory index
dir_data = dirs_to_dict(dir_name, '')

# Summary of the root directory: every key except the two list-valued ones
# (files, folders). keys_list[10] holds the column width for the key names.
temp = '{:' + keys_list[10] + 's} -> {}'
for i in dir_data:
  if i in (keys_list[7], keys_list[8]):
    continue
  if i == keys_list[6]:
    # The total size is shown in readable form instead of raw bytes
    print(temp.format(i, file_size(dir_data[i])))
  else:
    print(temp.format(i, dir_data[i]))

# Assigning JSON filename (stored inside the root folder)
json_file_name = '{}/{} - Index.json'.format(dir_name, dir_name)

# Writing dictionary in JSON format with sorted keys and indentation.
# ensure_ascii=False writes non-ASCII folder names as they are, so the file
# must be opened as utf-8 rather than with the platform default encoding.
with open(json_file_name, 'w', encoding='utf-8') as f1:
  json.dump(dir_data, f1, sort_keys=True, ensure_ascii=False, indent=4)

print('Directory Index created successfully')

#Archiving Data

In [None]:

# Recording Archiving time
archive_time = time.time()

# Archive the root folder with 7-Zip in store mode (-m0=Copy, no compression).
# "output_archieve_name" here is a literal placeholder file name, not the
# Python variable; the resulting "output_archieve_name.7z" is renamed in a later cell.
!7z a -m0=Copy "output_archieve_name" "$Root_folder_Today"

# Calculating Archiving time
archive_time = time.time() - archive_time

print('\n\nArchieving took :\n{:.2f} Seconds\n{}'.format(archive_time, time.strftime('%H-%M-%S', time.gmtime(archive_time))))

In [None]:

# Turn the start timestamp into the elapsed run time in seconds
execution_time_full = time.time() - execution_time_full

# Same duration rendered as hours / minutes / seconds
execution_clock = time.strftime('%HH-%MM-%SS', time.gmtime(execution_time_full))
print('Full program took :\n{:.2f} Seconds\n{}'.format(execution_time_full, execution_clock))

In [None]:
# Final archive name: <current time> - CreativeUncut (<finished games>) (<images>) (<run time>).7z
current_time = dati.now(timezone('Asia/Kolkata')).strftime('%Y-%m-%d_%H-%M-%S')
run_time_label = time.strftime('%HH-%MM-%SS', time.gmtime(execution_time_full))
output_archieve_name = '{} - CreativeUncut ({}) ({}) ({}).7z'.format(
  current_time, current_r[6], current_r[4], run_time_label)

# Give the placeholder-named archive produced by 7z its final name
os.rename('output_archieve_name.7z', output_archieve_name)
print(output_archieve_name)

#Copying Data

In [None]:

# Recording time required to copy archive to Google drive
copy_time = time.time()

# Copy the archive into the mounted Google Drive with rsync, which shows progress
!rsync -ah --progress "$output_archieve_name" "/content/drive/My Drive/"

# Calculating copying time
copy_time = time.time() - copy_time
print('Copying took :\n{:.2f} Seconds\n{}'.format(copy_time, time.strftime('%H-%M-%S', time.gmtime(copy_time))))


# Other Test Features

##Program to Download single image

In [None]:
#####################Program to Download Single image from creativeuncut.com#########################

# import requests as rq
# import io
# from PIL import Image

# Headers = {
#             'User-agent' : 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.76 Safari/537.36',
#             'Accept' : 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#             'Accept-Encoding' : 'gzip,deflate,sdch',
#             'Referer' : 'https://www.creativeuncut.com/gallery-39/got-heavy-mongol-rank-4.html'
#           }
# llink = 'https://www.creativeuncut.com/gallery-39/art/got-heavy-mongol-rank-4.jpg'

# rres = rq.get(llink, headers=Headers)
# i = Image.open(io.BytesIO(rres.content))
# i.save('1.jpg')


Sort by file sizes

In [None]:
##########################List all files with respective file size#########################

# all_files_with_size = []
# total_size = 0
# for path, dirs, files in os.walk(start_path):
#   for f in files:
#     fp = os.path.join(path, f)
#     total_size += os.path.getsize(fp)
#     all_files_with_size.append([os.path.getsize(fp),f,fp])
# all_files_with_size = sorted(all_files_with_size, key=itemgetter(0), reverse=True)

# n = 10
# print('First {} :\n{}'.format(n, '\n'.join(['{:10s}{}'.format(file_size(i[0]), i[1]) for i in all_files_with_size[:n+1]])))
# print('Last {} :\n{}'.format(n, '\n'.join(['{:10s}{}'.format(file_size(i[0]), i[1]) for i in all_files_with_size[-n:]])))

In [None]:
########################Function to get total files and total size of given folder ##########################

# def get_size(start_path = '.'):
#   total_size = 0
#   total_count = 0
#   for path, dir, files in os.walk(start_path):
#     for f in files:
#       total_count += 1
#       total_size += os.path.getsize(os.path.join(path, f))
#   print('Total Files Size  = {}\nTotal Files Count = {}'.format(file_size(total_size), total_count))

# get_size(Data_folder_Today)

Copy Speed comparison

In [None]:
############################Speed comparison of !cp vs !rsync##################################


# !rm "/content/2020-09-11_15-46-29 - CreativeUncut (949) (52401).7z"

# test2 = time.time()
# !cp "/content/drive/My Drive/My Files/Project_Data/2020-09-11_15-46-29 - CreativeUncut (949) (52401).7z" "/content/2020-09-11_15-46-29 - CreativeUncut (949) (52401).7z"
# test2 = time.time() - test2

# !rm "/content/2020-09-11_15-46-29 - CreativeUncut (949) (52401).7z"

# test1 = time.time()
# !rsync -ah --progress "/content/drive/My Drive/My Files/Project_Data/2020-09-11_15-46-29 - CreativeUncut (949) (52401).7z" "/content/2020-09-11_15-46-29 - CreativeUncut (949) (52401).7z"
# test1 = time.time() - test1

# print('rsync took {:.2f} Sec\ncp    took {:.2f} Sec'.format(test1, test2))


##########################Test Output###################################
# sending incremental file list
# 2020-09-11_15-46-29 - CreativeUncut (949) (52401).7z
#           5.15G 100%   57.60MB/s    0:01:25 (xfr#1, to-chk=0/1)
# rsync took 85.52 Sec
# cp    took 77.65 Sec


###########################Test result:################################
#  !cp was faster than !rsync by approximately 8 seconds (77.65 s vs 85.52 s)
#  That is not much of a gap, and !rsync shows progress
#
#  Between being ~8 seconds faster and showing visual progress, I prefer visual progress, so I am using rsync from now on

