In [1]:
!gdown https://drive.google.com/uc?id=1F_WDXotjwawHL2HO4FEAKungLN-rPGN6 -O data.tsv

Downloading...
From: https://drive.google.com/uc?id=1F_WDXotjwawHL2HO4FEAKungLN-rPGN6
To: /kaggle/working/data.tsv
100%|███████████████████████████████████████| 16.7M/16.7M [00:00<00:00, 207MB/s]


In [2]:
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None) 

import warnings
warnings.filterwarnings('ignore') 


In [3]:
df = pd.read_csv('data.tsv', sep='\t')
df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
df['date'] = pd.to_datetime(df['date'], errors='coerce')

# Sort by date descending
df = df.sort_values(by='date', ascending=False)

In [4]:
# df

In [5]:
df['doc_type_name'].value_counts()

doc_type_name
extra-gazettes    34770
gazettes           6275
acts               1647
bills              1351
Name: count, dtype: int64

In [6]:
import os
import requests
from urllib.parse import urlparse

def raw_download(url):
    try:
        path = urlparse(url).path
        relative_path = path.split('/view/')[-1]  # e.g., "acts/2021/10/24-2021_E.pdf"
        parts = relative_path.strip('/').split('/')

        folder_name = parts[0]  # e.g., 'acts'
        filename = '-'.join(parts[1:])  # e.g., "2021-10-24-2021_E.pdf"

        folder_path = os.path.join('pdf', folder_name)
        os.makedirs(folder_path, exist_ok=True)

        filepath = os.path.join(folder_path, filename)

        # Skip if file already exists
        if os.path.exists(filepath):
            print(f"[SKIPPED] Already exists: {filepath}")
            return

        response = requests.get(url)
        response.raise_for_status()

        with open(filepath, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)

        print(f"[DOWNLOADED] {filepath}")

    except Exception as e:
        print(f"[ERROR] Failed to download {url} — {type(e).__name__}: {e}")


In [7]:
filtered_df = df[
    (
        df['source_url_en'].isna() | 
        (df['source_url_en'].str.strip() == '')
    ) &
    df['source_url_si'].notna() &
    (df['source_url_si'].str.strip() != '')
].sort_values(by='date', ascending=False).reset_index(drop=True)

filtered_df['doc_type_name'].value_counts()

doc_type_name
extra-gazettes    1512
acts               412
gazettes           287
Name: count, dtype: int64

In [8]:
import time
import pandas as pd

def download_range(doc_type='acts', start=0, end=9):
    # Filter and sort
    filtered_df = df[
        (df['doc_type_name'] == doc_type) &
        (
            df['source_url_en'].isna() | 
            (df['source_url_en'].str.strip() == '')
        ) &
        df['source_url_si'].notna() &
        (df['source_url_si'].str.strip() != '')
    ].sort_values(by='date', ascending=False).reset_index(drop=True)

    # Adjust slice range
    sliced_df = filtered_df.iloc[start:end]

    total_start = time.time()

    for idx, url in enumerate(sliced_df['source_url_en'], start=start + 1):
        start_time = time.time()
        raw_download(url)
        end_time = time.time()
        print(f"  [{idx}] Download time: {end_time - start_time:.2f} seconds")

    total_end = time.time()
    print(f"\n✅ Total time for {len(sliced_df)} downloads: {total_end - total_start:.2f} seconds")


In [9]:
download_range(doc_type='acts', start=0, end=99999)


[ERROR] Failed to download nan — AttributeError: 'float' object has no attribute 'decode'
  [1] Download time: 0.00 seconds
[ERROR] Failed to download nan — AttributeError: 'float' object has no attribute 'decode'
  [2] Download time: 0.00 seconds
[ERROR] Failed to download nan — AttributeError: 'float' object has no attribute 'decode'
  [3] Download time: 0.00 seconds
[ERROR] Failed to download nan — AttributeError: 'float' object has no attribute 'decode'
  [4] Download time: 0.00 seconds
[ERROR] Failed to download nan — AttributeError: 'float' object has no attribute 'decode'
  [5] Download time: 0.00 seconds
[ERROR] Failed to download nan — AttributeError: 'float' object has no attribute 'decode'
  [6] Download time: 0.00 seconds
[ERROR] Failed to download nan — AttributeError: 'float' object has no attribute 'decode'
  [7] Download time: 0.00 seconds
[ERROR] Failed to download nan — AttributeError: 'float' object has no attribute 'decode'
  [8] Download time: 0.00 seconds
[ERROR] 

In [10]:
download_range(doc_type='gazettes', start=0, end=99999)


[ERROR] Failed to download nan — AttributeError: 'float' object has no attribute 'decode'
  [1] Download time: 0.00 seconds
[ERROR] Failed to download nan — AttributeError: 'float' object has no attribute 'decode'
  [2] Download time: 0.00 seconds
[ERROR] Failed to download nan — AttributeError: 'float' object has no attribute 'decode'
  [3] Download time: 0.00 seconds
[ERROR] Failed to download nan — AttributeError: 'float' object has no attribute 'decode'
  [4] Download time: 0.00 seconds
[ERROR] Failed to download nan — AttributeError: 'float' object has no attribute 'decode'
  [5] Download time: 0.00 seconds
[ERROR] Failed to download nan — AttributeError: 'float' object has no attribute 'decode'
  [6] Download time: 0.00 seconds
[ERROR] Failed to download nan — AttributeError: 'float' object has no attribute 'decode'
  [7] Download time: 0.00 seconds
[ERROR] Failed to download nan — AttributeError: 'float' object has no attribute 'decode'
  [8] Download time: 0.00 seconds
[ERROR] 

In [11]:
!du -sh pdf/*/

du: cannot access 'pdf/*/': No such file or directory


In [12]:
import zipfile
import os

def zip_folder(folder_path, output_zip):
    with zipfile.ZipFile(output_zip, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, _, files in os.walk(folder_path):
            for file in files:
                full_path = os.path.join(root, file)
                rel_path = os.path.relpath(full_path, folder_path)
                zipf.write(full_path, arcname=rel_path)

zip_folder('pdf', 'pdf.zip')