In [46]:
import glob

# Gather every PDF under countries/. glob makes no promise about ordering, so
# the paths are sorted to keep the listing stable from one run to the next.
pdf_files = sorted(glob.glob('countries/*.pdf'))
print(pdf_files)

['countries\\Afghanistan-1.pdf', 'countries\\Afghanistan-2.pdf', 'countries\\Afghanistan-3.pdf', 'countries\\Afghanistan-4.pdf', 'countries\\Afghanistan-5.pdf', 'countries\\Albania-1.pdf', 'countries\\Albania-2.pdf', 'countries\\Albania-3.pdf', 'countries\\Albania-4.pdf', 'countries\\Albania-5.pdf', 'countries\\Algeria-1.pdf', 'countries\\Algeria-2.pdf', 'countries\\Algeria-3.pdf', 'countries\\Algeria-4.pdf', 'countries\\Algeria-5.pdf', 'countries\\Andorra-1.pdf', 'countries\\Andorra-2.pdf', 'countries\\Andorra-3.pdf', 'countries\\Andorra-4.pdf', 'countries\\Andorra-5.pdf', 'countries\\Angola-1.pdf', 'countries\\Angola-2.pdf', 'countries\\Angola-3.pdf', 'countries\\Angola-4.pdf', 'countries\\Angola-5.pdf', 'countries\\Antigua and Barbuda-1.pdf', 'countries\\Antigua and Barbuda-2.pdf', 'countries\\Antigua and Barbuda-3.pdf', 'countries\\Antigua and Barbuda-4.pdf', 'countries\\Antigua and Barbuda-5.pdf', 'countries\\Argentina-1.pdf', 'countries\\Argentina-2.pdf', 'countries\\Argentina-3.

In [47]:
import os

# Sanity check on the naming scheme: collect the stem of every PDF that does
# NOT end in one of the expected '-1' ... '-5' suffixes. An empty list means
# every file conforms.
expected_suffixes = ('-1', '-2', '-3', '-4', '-5')
invalid_countries = [
    stem
    for stem in (os.path.splitext(os.path.basename(path))[0] for path in pdf_files)
    if not stem.endswith(expected_suffixes)
]

print(invalid_countries)

[]


In [48]:
import re
import pandas as pd

# Collapse the numbered PDFs into one base name each: a trailing '-1' ... '-5'
# is dropped from the file stem, and stems without such a suffix are kept as-is.
pattern = re.compile(r'^(.*)-[1-5]$')

base_names = set()
for stem in (os.path.splitext(os.path.basename(path))[0] for path in pdf_files):
    hit = pattern.match(stem)
    base_names.add(hit.group(1) if hit else stem)

print(f"Number of unique base file names: {len(base_names)}")

# One-column frame of the base names, alphabetically ordered with a fresh index.
name_df = (
    pd.DataFrame(list(base_names), columns=['country'])
    .sort_values(by='country')
    .reset_index(drop=True)
)
name_df

Number of unique base file names: 193


Unnamed: 0,country
0,Afghanistan
1,Albania
2,Algeria
3,Andorra
4,Angola
...,...
188,Venezuela (Bolivarian Republic of)
189,Viet Nam
190,Yemen
191,Zambia


In [49]:
# Load the CSV whose 'Member State' column is cross-checked against the
# names derived from the PDF files.
auths_df = pd.read_csv('member_state_auths_2025-03-14.csv')

# Strip surrounding whitespace on both sides so stray padding cannot cause
# false mismatches, then reduce each column to a set of names.
csv_countries = set(auths_df['Member State'].str.strip())
pdf_countries = set(name_df['country'].str.strip())

# Set difference in each direction; two empty sets mean the sources agree.
in_csv_not_pdf = csv_countries - pdf_countries
in_pdf_not_csv = pdf_countries - csv_countries

print("Countries in CSV but not in PDF files:", in_csv_not_pdf)
# The label previously read just " not in CSV:", which did not say what the set held.
print("Countries in PDF files but not in CSV:", in_pdf_not_csv)

Countries in CSV but not in PDF files: set()
 not in CSV: set()


In [50]:
# Count distinct countries by dropping the trailing '-1' ... '-5' from each stem.
# Only a recognised numeric suffix is removed, so a hyphenated name that lacks
# one (e.g. 'Guinea-Bissau') is kept whole instead of being cut at its hyphen.
def _country_of(path):
    """Return the file stem of ``path`` without a trailing '-1' ... '-5'."""
    stem = os.path.splitext(os.path.basename(path))[0]
    head, sep, tail = stem.rpartition('-')
    return head if sep and tail in {'1', '2', '3', '4', '5'} else stem


country_names = {_country_of(f) for f in pdf_files}
print(f"Number of countries: {len(country_names)}")

Number of countries: 193


In [51]:
from collections import Counter

# Tally files by the text after the last hyphen of each stem; stems that
# contain no hyphen at all are left out of the tally.
suffix_counts = Counter()
for path in pdf_files:
    stem = os.path.splitext(os.path.basename(path))[0]
    if '-' in stem:
        suffix_counts[stem.rsplit('-', 1)[-1]] += 1

# Report the five expected suffixes in order; a missing suffix prints as 0.
for suffix in '12345':
    print(f"-{suffix}: {suffix_counts[suffix]}")

-1: 193
-2: 193
-3: 193
-4: 193
-5: 193


In [52]:
# Split the countries into those that have a '-1' PDF and those that have at
# least one of '-2' ... '-5', then report any that appear only in the first set.
group_1 = set()
group_2 = set()
for path in pdf_files:
    base_name = os.path.splitext(os.path.basename(path))[0].rsplit('-', 1)[0]
    if path.endswith('-1.pdf'):
        group_1.add(base_name)
    elif path.endswith(('-2.pdf', '-3.pdf', '-4.pdf', '-5.pdf')):
        group_2.add(base_name)

only_1 = group_1.difference(group_2)
print("Countries with only '-1' PDF and none of the others:", only_1)

Countries with only '-1' PDF and none of the others: set()


In [53]:
import shutil

# Create the target directories if they don't exist
# (countries1 ... countries5, one per numeric suffix).
for i in range(1, 6):
    os.makedirs(f'countries{i}', exist_ok=True)

# Copy each file to the corresponding countriesN directory and remove the suffix in the copy,
# e.g. a stem 'X-3' is copied into countries3/ as 'X' plus its original extension.
# The source files under countries/ are left in place.
for file in pdf_files:
    base = os.path.basename(file)
    name, ext = os.path.splitext(base)
    if '-' in name:
        # Split on the last hyphen only, so hyphens inside the name itself survive.
        country, suffix = name.rsplit('-', 1)
        # NOTE(review): stems without a hyphen, or with a suffix outside 1-5, are skipped silently.
        if suffix in {'1', '2', '3', '4', '5'}:
            target_dir = f'countries{suffix}'
            target_path = os.path.join(target_dir, f"{country}{ext}")
            # copyfile replaces an existing destination, so re-running the cell overwrites earlier copies.
            shutil.copyfile(file, target_path)