In [1]:
import requests
from bs4 import BeautifulSoup

In [46]:
def extract_information(div_element):
    country_info = {}

    # Extract country name
    try:
        country_name = div_element.find('a').text
        country_info['country_name'] = country_name
    except AttributeError:
        country_info['country_name'] = None

    # Extract total distance
    try:
        note_element = div_element.find('strong', text='note:').find_next_sibling(text=True).strip()
        country_info['note'] = note_element
    except AttributeError:
        country_info['note'] = None

    # Extract Note element
    try:
        total_distance = div_element.find('strong', text='total:').find_next_sibling(text=True).strip()
        country_info['total_distance'] = total_distance
    except AttributeError:
        country_info['total_distance'] = None

    # Extract bordering countries and their distances
    try:
        border_info = div_element.find('strong', text='border countries (6):').find_next_sibling(text=True).strip()
        border_info = border_info.split(';')
        border_countries = {}
        for border in border_info:
            border_parts = border.strip().split(' ')
            border_country = border_parts[0]
            border_distance = ' '.join(border_parts[1:]).strip()
            border_countries[border_country] = border_distance
        country_info['border_countries'] = border_countries
    except AttributeError:
        country_info['border_countries'] = None

    return country_info


In [9]:
## get the page:
def get_page(url):
    response = requests.get(url)
    if response.status_code == 200:
        return BeautifulSoup(response.content, 'html.parser')

In [10]:
def scrape_webpage(soup):
    contry_data=[]
    selected_elements = soup.select('#index-content-section > div > div.col-lg-9.col-md-12.col-sm-12 div')
    for element in selected_elements:
        contry_data.append(extract_information(element))
    return contry_data

In [11]:
url = "https://www.cia.gov/the-world-factbook/field/land-boundaries/"
soup=get_page(url)

In [47]:
elements=scrape_webpage(soup)
elements

  note_element = div_element.find('strong', text='note:').find_next_sibling(text=True).strip()
  total_distance = div_element.find('strong', text='total:').find_next_sibling(text=True).strip()
  border_info = div_element.find('strong', text='border countries (6):').find_next_sibling(text=True).strip()


[{'country_name': 'Afghanistan',
  'note': None,
  'total_distance': '5,987 km',
  'border_countries': {'China': '91 km',
   'Iran': '921 km',
   'Pakistan': '2,670 km',
   'Tajikistan': '1,357 km',
   'Turkmenistan': '804 km',
   'Uzbekistan': '144 km'}},
 {'country_name': 'Akrotiri',
  'note': None,
  'total_distance': '48 km',
  'border_countries': None},
 {'country_name': 'Albania',
  'note': None,
  'total_distance': '691 km',
  'border_countries': None},
 {'country_name': 'Algeria',
  'note': None,
  'total_distance': '6,734 km',
  'border_countries': {'Libya': '989 km',
   'Mali': '1,359 km',
   'Mauritania': '460 km',
   'Morocco': '1,941 km',
   'Niger': '951 km',
   'Tunisia': '1,034 km'}},
 {'country_name': 'American Samoa',
  'note': None,
  'total_distance': '0 km',
  'border_countries': None},
 {'country_name': 'Andorra',
  'note': None,
  'total_distance': '118 km',
  'border_countries': None},
 {'country_name': 'Angola',
  'note': None,
  'total_distance': '5,369 km',
 

In [48]:
def print_country_info(country_info):
    if 'country_name' in country_info:
        print("Country Name:", country_info['country_name'])
    else:
        print("Country Name: Not Available")

    if 'total_distance' in country_info:
        print("Total Distance:", country_info['total_distance'])
    else:
        print("Total Distance: Not Available")

    if 'border_countries' in country_info:
        print("Border Countries:")
        border_countries = country_info['border_countries']
        if border_countries is None:
            print('NO border countries')
        else:
            for country, distance in border_countries.items():
                print(f"- {country}: {distance}")
    else:
        print("Border Countries: Not Available")
    if 'note' in country_info:
        print("Note:", country_info['note'])
    else:
        print("Note: Not Available")



In [49]:
for element in elements:
    print_country_info(element)
    print("--------------------")

Country Name: Afghanistan
Total Distance: 5,987 km
Border Countries:
- China: 91 km
- Iran: 921 km
- Pakistan: 2,670 km
- Tajikistan: 1,357 km
- Turkmenistan: 804 km
- Uzbekistan: 144 km
Note: None
--------------------
Country Name: Akrotiri
Total Distance: 48 km
Border Countries:
NO border countries
Note: None
--------------------
Country Name: Albania
Total Distance: 691 km
Border Countries:
NO border countries
Note: None
--------------------
Country Name: Algeria
Total Distance: 6,734 km
Border Countries:
- Libya: 989 km
- Mali: 1,359 km
- Mauritania: 460 km
- Morocco: 1,941 km
- Niger: 951 km
- Tunisia: 1,034 km
Note: None
--------------------
Country Name: American Samoa
Total Distance: 0 km
Border Countries:
NO border countries
Note: None
--------------------
Country Name: Andorra
Total Distance: 118 km
Border Countries:
NO border countries
Note: None
--------------------
Country Name: Angola
Total Distance: 5,369 km
Border Countries:
NO border countries
Note: None
--------------

In [22]:
len(elements)

255

In [50]:
import csv
def save_data_2_csv(country_info_list, filename):
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['Country Name', 'Total Distance', 'Border Countries']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        writer.writeheader()
        
        for country_info in country_info_list:
            writer.writerow({
                'Country Name': country_info.get('country_name', 'Not Available'),
                'Border Countries': ', '.join(country_info.get('border_countries', {}).keys()) if country_info.get('border_countries') else 'No border countries',
                'Total Distance': country_info.get('total_distance', 'Not Available')
            })

In [51]:
save_data_2_csv(elements,'country_info.csv')

In [52]:
import xml.etree.ElementTree as ET

def save_country_info_to_xml(country_info_list, filename):
    root = ET.Element("countries")

    for country_info in country_info_list:
        country_element = ET.SubElement(root, "country")
        
        country_name_element = ET.SubElement(country_element, "country_name")
        country_name_element.text = country_info.get('country_name', 'Not Available')

        total_distance_element = ET.SubElement(country_element, "total_distance")
        total_distance_element.text = country_info.get('total_distance', 'Not Available')

        note_element = ET.SubElement(country_element, "notes")
        country_name_element.text = country_info.get('note', 'Not Available')        
    tree = ET.ElementTree(root)
    tree.write(filename)

In [54]:
save_country_info_to_xml(elements, 'country_info.xml')