# Download files from Google Drive in bulk

In [None]:
%pip install -q requests

In [None]:
import os

import requests

In [None]:
# Text file with one Google Drive URL per line (blank lines are dropped on load)
URL_DUMP = 'data/drive_links.txt'
# Directory where folder downloads are written as .zip files
TARGET_FOLDER = './tmp/folders'
# Directory where individual file downloads are written
TARGET_FILES = './tmp/files'

In [None]:
# Read the URL dump, trimming whitespace and discarding blank lines
with open(URL_DUMP, 'r') as file:
    trimmed = (raw.strip() for raw in file)
    lines = [entry for entry in trimmed if entry]

# Preview the first few entries
lines[:5]

In [None]:
# Extract access keys and types (file/folder) from URLs

def extract_key(url: str) -> tuple[str,str] | None:
    url = url.strip()
    
    url_type = None
    if url.startswith('https://drive.google.com/drive/folders/'):
        url_type = 'folder'
    elif url.startswith('https://drive.google.com/file/d/'):
        url_type = 'file'
    else:
        return None
    
    # Remove prefixes
    url = url.removeprefix('https://drive.google.com/drive/folders/')
    url = url.removeprefix('https://drive.google.com/file/d/')
    # Remove query
    url = url.split('?')[0]
    # Remove postifxes
    url = url.removesuffix('/view')
    url = url.removesuffix('/edit')
    return url_type, url

# Parse every loaded line; entries are (type, id) tuples or None for unrecognized URLs
keys = list(map(extract_key, lines))
keys

In [None]:
def download_file(file_id, file_name, *, timeout=30):
    """Download a single Google Drive file into ``TARGET_FILES``.

    Args:
        file_id: Drive ID of the file.
        file_name: Name of the output file inside ``TARGET_FILES``.
        timeout: Seconds to wait for the server to connect/send data before
            ``requests`` gives up, so a stalled connection cannot hang the run.

    Returns:
        None. On an HTTP error status nothing is written and a failure
        message is printed instead of the success message.
    """
    # The target directory may not exist on a fresh checkout.
    os.makedirs(TARGET_FILES, exist_ok=True)
    url = f'https://drive.google.com/uc?export=download&id={file_id}'

    with requests.Session() as session:
        response = session.get(url, stream=True, timeout=timeout)

        # Handle large files confirmation token
        for key, value in response.cookies.items():
            if key.startswith('download_warning'):
                # Release the first streamed response before re-requesting.
                response.close()
                url = url + "&confirm=" + value
                response = session.get(url, stream=True, timeout=timeout)
                break

        with response:
            # Do not save an error page under the requested file name.
            if not response.ok:
                print(f'Failed to download file: {file_name} '
                      f'(HTTP {response.status_code})')
                return

            with open(os.path.join(TARGET_FILES, file_name), 'wb') as f:
                for chunk in response.iter_content(32768):
                    if chunk:
                        f.write(chunk)
    print(f'Downloaded file: {file_name}')

def download_folder_zip(folder_id, zip_name, *, timeout=30):
    """Request a Google Drive folder and save the response into ``TARGET_FOLDER``.

    Args:
        folder_id: Drive ID of the folder.
        zip_name: Name of the output file inside ``TARGET_FOLDER``.
        timeout: Seconds to wait for the server to connect/send data before
            ``requests`` gives up, so a stalled connection cannot hang the run.

    Returns:
        None. On an HTTP error status nothing is written and a failure
        message is printed instead of the success message.
    """
    # The target directory may not exist on a fresh checkout.
    os.makedirs(TARGET_FOLDER, exist_ok=True)

    # NOTE(review): this assumes the folder URL with ``export=download``
    # responds with a zip archive -- confirm against a real folder; the body
    # is saved as-is without checking that it is a zip.
    url = f'https://drive.google.com/drive/folders/{folder_id}?export=download'

    with requests.Session() as session:
        response = session.get(url, stream=True, timeout=timeout)

        # No attachment header on the first answer: retry once with the
        # confirmation flag appended.
        if 'Content-Disposition' not in response.headers:
            # Release the first streamed response before re-requesting.
            response.close()
            response = session.get(url + '&confirm=t', stream=True,
                                   timeout=timeout)

        with response:
            # Do not save an error page under the requested zip name.
            if not response.ok:
                print(f'Failed to download folder: {zip_name} '
                      f'(HTTP {response.status_code})')
                return

            with open(os.path.join(TARGET_FOLDER, zip_name), 'wb') as f:
                for chunk in response.iter_content(32768):
                    if chunk:
                        f.write(chunk)
    print(f'Downloaded folder as zip: {zip_name}')

def scrape(tokens):
    """Download every Drive item described by ``tokens``.

    Args:
        tokens: Iterable of ``(url_type, token)`` pairs, where ``url_type`` is
            ``'file'`` or ``'folder'`` and ``token`` is the Drive ID. ``None``
            entries (URLs that could not be parsed) are skipped.
    """
    for entry in tokens:
        # Unparseable URLs are represented as None; unpacking None would
        # raise TypeError and abort the remaining downloads.
        if entry is None:
            print('Skipping unrecognized URL')
            continue

        url_type, token = entry
        if url_type == 'file':
            download_file(token, f'{token}.file')
        elif url_type == 'folder':
            download_folder_zip(token, f'{token}.zip')
        else:
            print(f'Unknown type for token: {token}')

# Run the bulk download for every key extracted from the URL dump
scrape(keys)