In [17]:
from bs4 import BeautifulSoup as bs
import requests
import pandas as pd
import re
import folium

#Values used to standardize column naming variations
age,diameter = 'Age','Diameter (Km)'

#Maps every header spelling seen in the source tables onto one standard column name.
#NOTE: 'Age(ka)' and the '(million years)' headers are folded into the same 'Age' column
#without any unit conversion, so values in that column are not all on the same scale.
mapping = {
           'Age(ka)':age,
           'Age(Million years)':age,
           'Age  (million years)':age,
           'Age (million years)':age,
           'Diameter(km)': diameter,
           'Diameter  (km)': diameter,
           'Diameter (km)': diameter
          }

#Matches bracketed numeric markers such as [12]; raw string so the backslashes reach the
#regex engine untouched instead of being treated as (invalid) string escapes
link_regex = re.compile(r'\[[0-9]+\]')

#Substrings stripped from values before they are used
bad_chars = ['+','<','>','~','(disputed)','(uncertain, estimated to be Late Palaeozoic)']

In [18]:
def gen_df(table):
    '''
        Parses an HTML table into a Pandas Dataframe
        @param table: the html table to parse into a dataframe
        @return parsed dataframe whose columns are renamed according to the standard mapping
    '''

    #Some parsers do not synthesize a <tbody>; fall back to the table itself in that case
    body = table.tbody if table.tbody is not None else table

    #Retrieve column headers
    ths = [heading.text.strip() for heading in body.find_all('th')]

    #Convert each data row into a dict keyed by column header
    records = []
    for row in body.find_all('tr'):

        #Rows without any <td> (e.g. the header row) carry no data
        cells = row.find_all('td')
        if not cells:
            continue

        #zip stops at the shorter sequence, so surplus cells are ignored and
        #missing cells are left out of the record (they become NaN in the dataframe)
        records.append({key: cell.text.strip() for key,cell in zip(ths,cells)})

    #Build the dataframe once instead of appending row by row
    #(DataFrame.append is quadratic in a loop and no longer exists in pandas 2.x)
    df = pd.DataFrame(records,columns=ths)

    #Rename dataframe to match standard mapping
    return df.rename(columns=mapping)

def validate(value, link_pattern=None, bad_tokens=None):
    '''
        This is a very specialized validation function to remove known bad characters in this one specific source
        and normalize date ranges and variations in age. Due to the nature of the source, date/age are not 100% and are
        often represented in terms of x-x years old or x years old +/- x years. For the sake of the project, ranges are
        replaced by their average and variations are dropped.

        Processing order: bracketed link markers are removed, anything from the first '±' on is dropped, the known bad
        substrings are removed, surrounding whitespace is trimmed, and finally an 'a–b' numeric range is replaced by
        the average of a and b.

        @param value: value to validate; non-string values are returned unchanged
        @param link_pattern: regex (compiled or string) of markers to delete; defaults to the module-level link_regex
        @param bad_tokens: iterable of substrings to delete; defaults to the module-level bad_chars
        @return cleaned value as a string (or the original object if it was not a string)
    '''

    #Nothing to clean on non-string cells (e.g. NaN for a missing cell)
    if not isinstance(value, str):
        return value

    if link_pattern is None:
        link_pattern = link_regex
    if bad_tokens is None:
        bad_tokens = bad_chars

    #Remove Wikipedia Link Scripts
    value = re.sub(link_pattern, '', value)

    #Remove variation factors (+/- number of years since dates are not 100%)
    value = value.split('±')[0]

    #Remove bad characters
    for bad_token in bad_tokens:
        value = value.replace(bad_token, '')
    value = value.strip()

    #Take avg value of ranges; done last so qualifiers such as '~' or '(disputed)' cannot break float()
    range_vals = value.split('–')
    if len(range_vals) > 1:
        try:
            low, high = float(range_vals[0]), float(range_vals[1])
        except ValueError:
            #Not a numeric range (e.g. a name that merely contains an en dash): keep the text as is
            pass
        else:
            #Parentheses matter: without them only the upper bound would be halved
            value = str((low + high) / 2)

    return value

def _dms_to_decimal(component):
    '''
        Converts a single coordinate component such as 27°55′S or 37°18′30″N into signed decimal degrees
        @param component: degrees°minutes′[seconds″]direction string, direction being one of N, E, S, W
        @return decimal degrees rounded to 5 places, negative for S and W
        @raise ValueError if the component has an unexpected shape or a non-numeric part
        @raise KeyError if the direction letter is not one of N, E, S, W
    '''
    dir_factors = {
                   'N':1,
                   'E':1,
                   'S':-1,
                   'W':-1
                  }

    #Split into degrees, minutes, optional seconds, and direction
    parts = re.split('[°′″"]', component)
    if len(parts) == 3:
        degrees,minutes,direction = parts
        seconds = 0
    elif len(parts) == 4:
        degrees,minutes,seconds,direction = parts
    else:
        raise ValueError('unexpected coordinate component: %r' % (component,))

    magnitude = float(degrees) + (float(minutes)/60) + (float(seconds)/3600)
    return round(magnitude * dir_factors[direction],5)


def parse_coodinates(value):
    '''
        This function takes a DMS format from the source and returns decimal coordinates in a tuple
        @param value: value to parse; only the part before the first '/' is used and it must hold
                      a latitude and a longitude separated by a single space
        @return (lat,lon) tuple of decimal degrees, or the string 'FAIL' when the value cannot be parsed
    '''
    try:

        #Split value into lat/lon and remove excess value
        lat,lon = value.split('/')[0].strip().split(' ')

        #Zero-width no-break spaces are not removed by strip(), so drop them explicitly
        lat = lat.replace('\ufeff','')
        lon = lon.replace('\ufeff','')

        return (_dms_to_decimal(lat),_dms_to_decimal(lon))
    except (AttributeError,KeyError,TypeError,ValueError):
        #If it fails, we return this so it can be filtered out.
        #Only parsing errors are caught so that interrupts and unrelated bugs still surface.
        return 'FAIL'
def build_marker(row,map):
    '''
        Used to create Circles on a Folium map
        @param row: data frame row; its 'Lat', 'Lon', 'Diameter (Km)' and 'Name' entries are read
        @param map: folium map to add marker to (shadows the builtin map inside this function;
                    the name is kept so existing callers keep working)
    '''
    #The Circle is sized by its radius in meters, while the row stores a diameter in km:
    #convert km -> m and halve it so the drawn circle matches the crater's real extent
    radius_m = float(row['Diameter (Km)']) * 1000 / 2

    folium.Circle(
        location=[row['Lat'],row['Lon']],
        radius=radius_m,
        popup=row['Name'],
        fill=True,
    ).add_to(map)

In [19]:
craters_wiki = 'https://en.wikipedia.org/wiki/List_of_impact_craters_on_Earth'

#Retrieve HTML; use a timeout so the cell cannot hang forever and fail loudly on an
#HTTP error instead of silently parsing an error page into zero tables
response = requests.get(craters_wiki,timeout=30)
response.raise_for_status()
content = response.text
soup = bs(content,'html.parser')

#Parse all Sortable Wikitables from HTML
tables = [table for table in soup.find_all('table') if table.get('class') == ['wikitable', 'sortable']]

#Create dataframe from each table, ignore last one since it doesn't fit our goal
dfs = [gen_df(table) for table in tables[:-1]]

#Concatenate all dataframes together
master_df = pd.concat(dfs,ignore_index=True)

#Drop the date column since we don't need it
master_df = master_df.drop('Date',axis=1)

#Validate values needed to map
master_df['Name'] = master_df['Name'].apply(validate)
master_df[diameter] = master_df[diameter].apply(validate)
master_df['Age'] = master_df['Age'].apply(validate)

#Parse coordinates into a Lat,Lon column
master_df['Coordinates'] = master_df['Coordinates'].apply(parse_coodinates)

#Keep only rows whose coordinates parsed; copy so the column assignments below
#write to an independent frame rather than a slice of the unfiltered one
master_df = master_df[master_df['Coordinates'] != 'FAIL'].copy()
master_df[['Lat','Lon']] = master_df['Coordinates'].tolist()
master_df = master_df.drop('Coordinates',axis=1)

#Build map and add markers
#NOTE: the name `map` shadows the Python builtin for the rest of the notebook;
#it is kept because other cells may refer to it
map = folium.Map(location=[0,0],zoom_start=2)
for _,crater in master_df.iterrows():
    build_marker(crater,map)

#Display map
map