In [67]:
# Import libraries

import pandas as pd
import time
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import matplotlib.pyplot as plt
import os
import logging
import requests
import bs4

### Scraping country lists

In [68]:
# Let webdriver_manager install the Chrome driver and wrap its path in a Service
service = Service(ChromeDriverManager().install())

# Browser options. The window stays visible by default; to hide it, enable:
# chrome_options.add_argument("--headless")
chrome_options = Options()
chrome_options.add_argument("--no-sandbox")

# Start Chrome from the service + options pair
driver = webdriver.Chrome(options=chrome_options, service=service)

In [77]:
# Open the simple.m.wikipedia.org "List of countries" page in the Selenium-driven browser

page_url = "https://simple.m.wikipedia.org/wiki/List_of_countries"
driver.get(page_url)

In [78]:
# Collect the page elements that carry the CSS class "mf-section-1"
# (the first match holds the alphabetical country listing).

character_elems = driver.find_elements(by=By.CLASS_NAME, value="mf-section-1")

# find_elements returns an empty list instead of raising when nothing matches;
# fail here with a clear message rather than with an IndexError in a later cell.
if not character_elems:
    raise RuntimeError(
        f"No elements with class 'mf-section-1' found on {driver.current_url}"
    )

In [79]:
# Show the text of the first matched element: letter headings and country rows in one string
character_elems[0].text

"A\n Afghanistan –  Albania –  Algeria –  Andorra –  Angola –  Antigua and Barbuda –  Argentina –  Armenia –  Australia –  Austria –  Azerbaijan\nB\n Bahamas –  Bahrain –  Bangladesh –  Barbados –  Belarus –  Belgium –  Belize –  Benin –  Bhutan –  Bolivia –  Bosnia and Herzegovina –  Botswana –  Brazil –  Brunei –  Bulgaria –  Burkina Faso –  Burundi\nC\n Cabo Verde –  Cambodia –  Cameroon –  Canada –  Central African Republic –  Chad –  Chile –  China –  Colombia –  Comoros –  Costa Rica –  Côte d'Ivoire –  Croatia –  Cuba –  Cyprus –  Czechia\nD\n Democratic Republic of the Congo –  Denmark –  Djibouti –  Dominica –  Dominican Republic\nE\n Ecuador –  Egypt –  El Salvador –  Equatorial Guinea –  Eritrea –  Estonia –  Eswatini –  Ethiopia\nF\n Fiji –  Finland –  France\nG\n  Gabon –   Gambia –   Georgia –   Germany –   Ghana –   Greece –   Grenada –   Guatemala –   Guinea –   Guinea-Bissau –   Guyana\nH\n  Haiti –   Honduras –   Hungary\nI\n  Iceland –   India –   Indonesia –   Iran 

In [81]:
# Break the section text on newlines: letter headings and country rows alternate
section_text = character_elems[0].text
list_char = section_text.split("\n")
list_char

['A',
 ' Afghanistan –  Albania –  Algeria –  Andorra –  Angola –  Antigua and Barbuda –  Argentina –  Armenia –  Australia –  Austria –  Azerbaijan',
 'B',
 ' Bahamas –  Bahrain –  Bangladesh –  Barbados –  Belarus –  Belgium –  Belize –  Benin –  Bhutan –  Bolivia –  Bosnia and Herzegovina –  Botswana –  Brazil –  Brunei –  Bulgaria –  Burkina Faso –  Burundi',
 'C',
 " Cabo Verde –  Cambodia –  Cameroon –  Canada –  Central African Republic –  Chad –  Chile –  China –  Colombia –  Comoros –  Costa Rica –  Côte d'Ivoire –  Croatia –  Cuba –  Cyprus –  Czechia",
 'D',
 ' Democratic Republic of the Congo –  Denmark –  Djibouti –  Dominica –  Dominican Republic',
 'E',
 ' Ecuador –  Egypt –  El Salvador –  Equatorial Guinea –  Eritrea –  Estonia –  Eswatini –  Ethiopia',
 'F',
 ' Fiji –  Finland –  France',
 'G',
 '  Gabon –   Gambia –   Georgia –   Germany –   Ghana –   Greece –   Grenada –   Guatemala –   Guinea –   Guinea-Bissau –   Guyana',
 'H',
 '  Haiti –   Honduras –   Hungary',

In [89]:
# Build a tidy DataFrame with one row per country and its letter category

records = []

# Drop blank lines first: a stray empty line would otherwise shift the
# heading/countries pairing for every line after it.
lines = [line for line in list_char if line.strip()]

# Lines alternate heading, countries-string. zip() pairs them and stops at a
# trailing heading without a countries line, so an odd number of lines cannot
# cause an IndexError (such a heading contributes no rows).
for heading, countries_str in zip(lines[0::2], lines[1::2]):
    category = heading.strip()
    # Countries on one line are separated by an en dash; skip empty fragments
    countries = [c.strip() for c in countries_str.split("–") if c.strip()]
    records.extend({"Category": category, "Country": country} for country in countries)

# Create dataframe (columns named explicitly so an empty result keeps its schema)
df = pd.DataFrame(records, columns=["Category", "Country"])


In [90]:
# Preview the first 20 rows (bare last expression so Jupyter renders the table)
df.head(20)

   Category              Country
0         A          Afghanistan
1         A              Albania
2         A              Algeria
3         A              Andorra
4         A               Angola
5         A  Antigua and Barbuda
6         A            Argentina
7         A              Armenia
8         A            Australia
9         A              Austria
10        A           Azerbaijan
11        B              Bahamas
12        B              Bahrain
13        B           Bangladesh
14        B             Barbados
15        B              Belarus
16        B              Belgium
17        B               Belize
18        B                Benin
19        B               Bhutan


In [None]:
# Put the raw scraped lines into their own dataframe.
# Kept under a separate name: assigning this to `df` would replace the
# Category/Country table, and the CSV export would then save the unparsed
# lines instead of the parsed countries.

df_raw_lines = pd.DataFrame(list_char, columns=["character"])

In [91]:
# Write the dataframe to country_list.csv; the row index is kept as the first column
df.to_csv('country_list.csv', index=True)

### Scraping Wikipedia content with requests and bs4

In [2]:
import requests
from bs4 import BeautifulSoup

In [39]:
# Download the Wikipedia article

# Add a User-Agent header so Wikipedia returns the real page
headers = {
    "User-Agent": "Mozilla/5.0 (compatible; MyWikiScraper/1.0; +https://example.com/mybot)"
}

# requests applies no timeout by default, so set one to keep the cell from hanging
page = requests.get(
    "https://en.wikipedia.org/wiki/Key_events_of_the_20th_century",
    headers=headers,
    timeout=30,
)

# Stop on an HTTP error (4xx/5xx) instead of parsing and saving an error page
page.raise_for_status()


In [40]:
# Parse the downloaded HTML with Python's built-in parser and print the <title> tag
soup = BeautifulSoup(page.text, features='html.parser')
print(soup.title)

<title>Key events of the 20th century - Wikipedia</title>


In [41]:
# Extract the text content of the parsed page from the soup
text = soup.get_text()

In [42]:
# Encode the page text as UTF-8 bytes for the binary file write.
# The isinstance guard keeps the cell safe to re-run: once `text` already
# holds bytes, calling .encode() again would raise AttributeError.
if isinstance(text, str):
    text = text.encode('utf-8')

In [43]:
# Save the encoded page text to disk; binary mode because `text` is already bytes
with open('20th_century_Wiki.txt', mode='wb') as out_file:
    out_file.write(text)