In [1]:
import string
# string provides us with a list of ascii lowercase letters (the english alphabet)
from io import StringIO
# StringIO wraps an in-memory HTML string in a file-like object for pd.read_html

import pandas as pd
# pandas provides an R-like DataFrame object, ideal for working with tabular data
import requests as re
# requests is a library to make HTTP requests
# note: the alias `re` shadows the name of the stdlib regex module; in this notebook `re` always means requests
from bs4 import BeautifulSoup
# beautiful soup is a library for parsing html

In [2]:
"""
First, we make a HTTP GET request to the wikipedia page, and extract its text (html).
The request carries a timeout so the cell cannot hang forever, and the HTTP status is
checked so that an error page is never parsed as if it were the article.
We then point beautiful soup at our html, and get it ready for parsing.
"""

response = re.get('https://en.wikipedia.org/wiki/Letter_frequency', timeout=30)
# note: `re` is the requests library here (see the import cell), not the stdlib regex module
response.raise_for_status()
# note: raises requests.HTTPError on a 4xx/5xx response instead of silently continuing

page = response.text
soup = BeautifulSoup(page, 'lxml')

In [3]:
"""
The table we want on the wikipedia page has a class of 'wikitable sortable'
(found via inspect element in a browser).
So we tell our parser to collect every <table> element with that class.
"""

table = soup.find_all('table', {'class': 'wikitable sortable'})
# note: find_all is the current name of the deprecated findAll alias; the result is list-like
# note: from inspection, the 3rd table with this class is the one we want; hence using table[2] later

assert len(table) >= 3, (
    "expected at least 3 tables with class 'wikitable sortable'; "
    "found {} - the page layout may have changed".format(len(table))
)
# note: fail here with a clear message rather than with a bare IndexError when table[2] is used

In [4]:
"""
Having obtained the html of the table we want, we now convert it to a pandas dataframe,
tidy the column names, and turn every percentage column into a float fraction
(i.e. the parsed number divided by 100).
"""

df = pd.read_html(StringIO(str(table[2])), header=0)[0]
# note: we take the first element of pd.read_html() as it returns a *list* of dataframes
# note: the html is wrapped in StringIO because passing a literal html string is deprecated in newer pandas

df.columns = [i.split()[0] for i in df.columns]
# note: this keeps only the first whitespace-separated token, to remove the citations in some column names

annoying_symbols = '%~()*'
# some entries contain tildes, asterisks, or brackets, and almost all end in '%'
# we need to get rid of these to be able to interpret the entries as numerical types

for col in df.columns[1:]:
    cleaned = df[col].astype(str)
    # note: casting to str first keeps .str usable even if read_html already parsed the column as numeric
    for s in annoying_symbols:
        cleaned = cleaned.str.replace(s, '', regex=False)
        # note: regex=False because '(', ')' and '*' are regex metacharacters and must be matched literally
    df[col] = cleaned.astype('float')/100

In [5]:
"""
Finally, we drop any rows corresponding to letters outside the english alphabet and then normalise the probabilities
such that each column adds up to 1.
See report for implications of this re-normalisation.
"""

df = df.loc[df['Letter'].isin(list(string.ascii_lowercase))].copy()
# note: .copy() makes the filtered frame independent of the original, so the column
# assignment below does not trigger pandas' SettingWithCopyWarning

freq_cols = df.columns[1:]
# note: every column after the first ('Letter') holds frequencies

df[freq_cols] = df[freq_cols] / df[freq_cols].sum()
# note: .sum() gives one total per column; the division aligns on column names,
# so each column is divided by its own total

In [6]:
"""
With the cleaning finished, we write the dataframe out to a .csv file so that it can be reused later
"""

df.to_csv(path_or_buf='LetterFrequenciesFromWiki.csv')
# note: the dataframe index is written too, as to_csv does by default

In [12]:
# note: print a LaTeX tabular of the listed language columns, leaving out the dataframe index
print(
    df.to_latex(
        columns=['Letter','English','French','Italian','German'],
        index=False,
    )
)

\begin{tabular}{lrrrr}
\toprule
Letter &   English &    French &   Italian &    German \\
\midrule
     a &  0.081671 &  0.078537 &  0.118743 &  0.066708 \\
     b &  0.014920 &  0.009267 &  0.009372 &  0.019308 \\
     c &  0.027820 &  0.033529 &  0.045506 &  0.027969 \\
     d &  0.042530 &  0.037736 &  0.037771 &  0.051966 \\
     e &  0.127021 &  0.151345 &  0.119218 &  0.167856 \\
     f &  0.022280 &  0.010964 &  0.011657 &  0.016953 \\
     g &  0.020150 &  0.008907 &  0.016621 &  0.030805 \\
     h &  0.060941 &  0.007580 &  0.006430 &  0.046858 \\
     i &  0.069661 &  0.077437 &  0.102547 &  0.067056 \\
     j &  0.001530 &  0.006305 &  0.000111 &  0.002744 \\
     k &  0.007720 &  0.000761 &  0.000091 &  0.014507 \\
     l &  0.040250 &  0.056116 &  0.065817 &  0.035187 \\
     m &  0.024060 &  0.030526 &  0.025397 &  0.025942 \\
     n &  0.067491 &  0.072973 &  0.069588 &  0.100083 \\
     o &  0.075071 &  0.059612 &  0.099402 &  0.026556 \\
     p &  0.019290 &  0.025929 