In [1]:
import json
import os
import pickle
import requests
import time
import bs4 as bs
import pandas as pd

from collections import defaultdict

In [2]:
# HELPERS
from datetime import datetime

def get_time():
    """Return the current local time as a log prefix of the form ``HH:MM:SS: ``."""
    # datetime honours strftime directives in a format spec, so the timestamp
    # and the trailing ": " separator can be produced in one f-string.
    return f"{datetime.now():%H:%M:%S}: "

In [3]:
# Request headers for the BGS World Mineral Statistics site.
# NOTE(review): the Access-Control-* entries are CORS *response* headers and
# presumably have no effect when sent by a client; only User-Agent is likely
# to matter. They are kept unchanged here.
headers = {
    'Access-Control-Allow-Origin': '*',
    'Access-Control-Allow-Methods': 'GET',
    'Access-Control-Allow-Headers': 'Content-Type',
    'Access-Control-Max-Age': '3600',
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
    }

# Search form page; its <select> elements list the available activities,
# commodities, years and countries.
url = "https://www2.bgs.ac.uk/mineralsuk/statistics/wms.cfc?method=searchWMS"
# The dict must be passed by keyword: the second positional argument of
# requests.get() is `params`, so a positional dict is appended to the query
# string instead of being sent as HTTP headers.
req = requests.get(url, headers=headers, timeout=60)
# Fail here on an HTTP error rather than parsing an error page further down.
req.raise_for_status()
soup = bs.BeautifulSoup(req.content, 'html.parser')

In [4]:
#print(soup.prettify())

In [5]:
# Every <select> on the search form, in document order; the first one holds
# the activity choices.
soup_options = soup.find_all('select')
# Drop the first <option> of the activity dropdown and keep the rest.
soup_activities = soup_options[0].find_all('option')[1:]
# Collect the "value" attribute of each remaining option.
activities = [option_tag.get("value") for option_tag in soup_activities]

In [6]:
# Sanity check: list the activity values scraped from the first <select>.
print(activities)

['Imports', 'Exports', 'Production']


In [7]:
# NOTE(review): this cell performs the same extraction as the earlier
# activities cell; it re-binds soup_options, soup_activities and activities
# to the same values.
soup_options = soup.find_all('select')
# Drop the first <option> of the activity dropdown and keep the rest.
soup_activities = soup_options[0].find_all('option')[1:]
activities = [option_tag.get("value") for option_tag in soup_activities]

In [8]:
# Second <select> on the form: commodity choices, minus the first <option>.
soup_minerals = soup_options[1].find_all('option')[1:]
# Map each option's "value" attribute to its visible text.
minerals = {
    option_tag.get("value"): option_tag.text
    for option_tag in soup_minerals
}

In [9]:
# Show the scraped option-value -> option-text mapping as the cell output.
minerals

{'177': 'aggregates, primary',
 '2': 'alumina',
 '3': 'aluminium, primary',
 '8': 'antimony',
 '6': 'antimony, mine',
 '11': 'arsenic',
 '9': 'arsenic, white',
 '12': 'asbestos',
 '14': 'asbestos, unmanufactured',
 '17': 'barytes',
 '1': 'bauxite',
 '5': 'bauxite, alumina and aluminium',
 '20': "bentonite and fuller's earth",
 '160': 'beryl',
 '23': 'bismuth',
 '21': 'bismuth, mine',
 '161': 'borates',
 '26': 'bromine',
 '29': 'cadmium',
 '179': 'cement',
 '174': 'cement  clinker',
 '175': 'cement, finished',
 '32': 'chromium',
 '30': 'chromium ores and concentrates',
 '35': 'coal',
 '39': 'cobalt',
 '36': 'cobalt, mine',
 '37': 'cobalt, refined',
 '44': 'copper',
 '40': 'copper, mine',
 '42': 'copper, refined',
 '41': 'copper, smelter',
 '47': 'diamond',
 '50': 'diatomite',
 '53': 'feldspar',
 '74': 'ferro-alloys',
 '56': 'fluorspar',
 '737': 'gallium, primary',
 '743': 'gemstones',
 '162': 'germanium metal',
 '59': 'gold',
 '57': 'gold, mine',
 '62': 'graphite',
 '65': 'gypsum and pl

In [10]:
# with open('minerals.json', 'w') as fp:
#     json.dump(minerals, fp)

In [11]:
# Third <select> on the form: "from" year choices, minus the first <option>.
soup_from = soup_options[2].find_all('option')[1:]
# Year values are kept as the strings found in the "value" attribute.
years_from = [option_tag.get("value") for option_tag in soup_from]

In [12]:
# Fourth <select> on the form: "to" year choices, minus the first <option>.
soup_to = soup_options[3].find_all('option')[1:]
# Year values are kept as the strings found in the "value" attribute.
years_to = [option_tag.get("value") for option_tag in soup_to]

In [13]:
# Fifth <select> on the form: country choices, minus the first <option>.
soup_countries = soup_options[4].find_all('option')[1:]
# Map each option's "value" attribute to its visible text.
countries = {
    option_tag.get("value"): option_tag.text
    for option_tag in soup_countries
}

In [14]:
file_name = "countries.pkl"

# Persist only the country display names (the dict values) as a pickled list.
# The context manager closes the file even if pickle.dump raises, which the
# previous manual open()/close() pair did not guarantee.
with open(file_name, "wb") as open_file:
    pickle.dump(list(countries.values()), open_file)

In [15]:
# One independent three-level store per activity. The outer two levels are
# created on first access and the innermost level is a plain dict.
measures_imports, measures_exports, measures_production = (
    defaultdict(lambda: defaultdict(dict)) for _ in range(3)
)

In [16]:
# Scrape every (activity, commodity) pair in ten-year windows. For each window
# that has results, record the text of the element following the results
# heading (stored as the "measure") and download the matching spreadsheet.

# Explicit lookup of the per-activity store, instead of resolving the variable
# name through globals().
measures_by_activity = {
    "imports": measures_imports,
    "exports": measures_exports,
    "production": measures_production,
}

REQUEST_TIMEOUT = 60  # seconds; stops a stalled connection from hanging the run
last_year = int(years_to[-1])  # loop-invariant upper bound for the window start

for activity in activities:
    print(f"{get_time()}Looping through {activity}")
    # Create the target directory up front so the first download cannot fail
    # on a missing folder.
    output_dir = f"/app/mineral_resources/raw_{activity.lower()}"
    os.makedirs(output_dir, exist_ok=True)
    for mineral_id, mineral_name in minerals.items():
        print(f"{get_time()}Looping through {mineral_id} ({mineral_name})")
        start_year = years_from[0]
        while int(start_year) <= last_year:
            # Pause before every results query.
            time.sleep(15)
            # NOTE(review): the final window is capped at a hard-coded 2020;
            # confirm this still matches the last year offered by the form.
            if int(start_year) <= 2011:
                end_year = str(int(start_year) + 9)
            else:
                end_year = "2020"
            print(f"{get_time()}Start year: {start_year}")
            print(f"{get_time()}End year: {end_year}")
            url_structure = f"https://www2.bgs.ac.uk/mineralsUK/statistics/wms.cfc?method=listResults&dataType={activity}&commodity={mineral_id}&dateFrom={start_year}&dateTo={end_year}&country=&agreeToTsAndCs=agreed"
            # headers must be passed by keyword; positionally the dict is
            # treated as query-string params and never sent as headers.
            req_query = requests.get(url_structure, headers=headers, timeout=REQUEST_TIMEOUT)
            soup_query = bs.BeautifulSoup(req_query.content, 'html.parser')
            heading = soup_query.find("h1")
            if heading is None:
                # An error page without an <h1> used to raise AttributeError
                # and abort the whole run; log it and move to the next window.
                print(f"{get_time()}Unexpected response (HTTP {req_query.status_code}, no <h1>) for range {start_year} to {end_year}")
            elif heading.text == "No results":
                print(f"{get_time()}No results for range {start_year} to {end_year}")
            else:
                print(f"{get_time()}Results for {start_year} to {end_year}: {heading.text}")
                # The element right after the heading holds the measure text;
                # an absent or empty element is recorded as "missing".
                measure_tag = heading.find_next_sibling()
                measure = measure_tag.text if measure_tag is not None else ""
                if measure == "":
                    measure = "missing"
                measures_by_activity[activity.lower()][activity][mineral_id + "-" + mineral_name][start_year + "-" + end_year] = measure
                spreadsheet_url = url_structure + "&exportToSpreadsheet=Yes"
                r = requests.get(spreadsheet_url, headers=headers, timeout=REQUEST_TIMEOUT)
                file_name = f"{activity}_{mineral_name}_{start_year}_{end_year}.xlsx".lower().replace("/", "_")
                if r.ok:
                    file_path = os.path.join(output_dir, file_name)
                    # The context manager closes the file even if the write fails.
                    with open(file_path, 'wb') as file:
                        file.write(r.content)
                    print(f"{get_time()}File saved as <{file_name}>")
                else:
                    # Do not store an HTTP error body under an .xlsx name.
                    print(f"{get_time()}Download failed (HTTP {r.status_code}) for <{file_name}>")
            # Advance to the next ten-year window whatever the outcome.
            start_year = str(int(start_year) + 10)


13:43:33: Looping through Imports
13:43:33: Looping through 177 (aggregates, primary)
13:43:48: Start year: 1970
13:43:48: End year: 1979
13:43:48: No results for range 1970 to 1979
13:44:03: Start year: 1980
13:44:03: End year: 1989
13:44:03: No results for range 1980 to 1989
13:44:18: Start year: 1990
13:44:18: End year: 1999
13:44:18: Results for 1990 to 1999: Imports of aggregates, primary
13:44:18: File saved as <imports_aggregates, primary_1990_1999.xlsx>
13:44:33: Start year: 2000
13:44:33: End year: 2009
13:44:34: Results for 2000 to 2009: Imports of aggregates, primary
13:44:34: File saved as <imports_aggregates, primary_2000_2009.xlsx>
13:44:49: Start year: 2010
13:44:49: End year: 2019
13:44:49: Results for 2010 to 2019: Imports of aggregates, primary
13:44:49: File saved as <imports_aggregates, primary_2010_2019.xlsx>
13:45:04: Start year: 2020
13:45:04: End year: 2020
13:45:05: Results for 2020 to 2020: Imports of aggregates, primary
13:45:05: File saved as <imports_aggreg

In [None]:
#for activity in activities:
#    if activity != "Production":
#        continue
#    print(f"Looping through {activity}")
#    for mineral_id in minerals.keys():
#        print(f"Looping through {mineral_id} ({minerals[mineral_id]})")
#        start_year = years_from[0]
#        time.sleep(60)
#        while int(start_year) <= int(years_to[-1]):
#            if int(start_year) <= 2011:
#                end_year = str(int(start_year) + 9)
#            else:
#                end_year = "2020"
#            print(f"Start year: {start_year}")
#            print(f"End year: {end_year}")
#            #country = "All countries" 
#            url_structure = f"https://www2.bgs.ac.uk/mineralsUK/statistics/wms.cfc?method=listResults&dataType={activity}&commodity={mineral_id}&dateFrom={start_year}&dateTo={end_year}&country=&agreeToTsAndCs=agreed"
#            req_query = requests.get(url_structure, headers)
#            soup_query = bs.BeautifulSoup(req_query.content, 'html.parser')
#            if soup_query.find("h1").text == "No results":
#                print(f"No results for range {start_year} to {end_year}")
#                start_year = str(int(start_year) + 1)
#                continue
#            else:
#                print(f"Results for {start_year} to {end_year}: {soup_query.find('h1').text}")
#                spreadsheet_url = url_structure + "&exportToSpreadsheet=Yes"
#                r = requests.get(spreadsheet_url)
#                file_name = f"{activity}_{minerals[mineral_id]}_{start_year}_{end_year}.xlsx".lower().replace("/", "_")
#                file_path = os.path.join("/analytics_env/resources/mineral_resources/" + activity.lower(), file_name)
#                file = open(file_path, 'wb')
#                file.write(r.content)
#                file.close()
#                print(f"File saved as <{file_name}>")
#                start_year = str(int(start_year) + 10)
#

In [18]:
# Serialise the imports measures to JSON text, then write it out in one call.
with open('measures_imports.json', 'w') as fp:
    fp.write(json.dumps(measures_imports))

In [19]:
# Serialise the exports measures to JSON text, then write it out in one call.
with open('measures_exports.json', 'w') as fp:
    fp.write(json.dumps(measures_exports))

In [20]:
# Serialise the production measures to JSON text, then write it out in one call.
with open('measures_production.json', 'w') as fp:
    fp.write(json.dumps(measures_production))