In [None]:
# Needed for the stopwords and for the word_tokenize
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet') # download for lemmatization

In [29]:
# Get data from servers on the web
import requests

# Parse HTML data
from bs4 import BeautifulSoup

# Deal with regular expressions
import re

# Tools to wrangle natural language
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer

import pickle

# Extract data from Wikipedia
# import wikipedia

In [35]:
def get_paragraphs(link):
    """Get all the text from paragraphs from a webpage, clean it by removing every character but letters. Tokenize the text, remove stopwords and return a list of unique words in the page.
    
    Args:
        link (str): url link for the webpage.
    
    Returns:
        glossary (set): set of unique words in the page's text."""
    
    response = requests.get(link)

    # Make the soup out of the HTML text
    soup = BeautifulSoup(response.text, 'lxml')

    # Extract the plain text from paragraphs
    text = ''
    for paragraph in soup.find_all('p'):
        text += paragraph.text

    ######## NORMALIZATION ########
After 
    # Consider only letters and numbers and make the whole text lower case
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
    text = re.sub('\w*\d\w*', '', text)
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('\ufeff', '', text)
    text = re.sub('\u200b', '', text)
    text = re.sub('[‘’“”…]', '', text)
    text = text.replace("–", "")
    # Get rid of numbers
    text = re.sub('\d*', '', text)

    ######## TOKENIZATION AND STOPWORDS REMOVAL ########

    # Tokenize the text, which means to make each word a token. It is basically
    # a more fancy split
    glossary = word_tokenize(text)

    # Remove stop words
    glossary = [w for w in glossary if w not in stopwords.words("english")]
    
    # Reduce words to their root form
    lemmed_glossary = [WordNetLemmatizer().lemmatize(w) for w in glossary]

    # Since we want to have a glossary of words it is reasonable to consider
    # only unique words. Therefore I'll make glossary a set.
    lemmed_glossary = set(lemmed_glossary)
    
    return lemmed_glossary

In [36]:
link_wikipedia = 'https://en.wikipedia.org/wiki/Financial_technology'
link_investopedia = 'https://www.investopedia.com/terms/f/fintech.asp'

gloss_wiki = get_paragraphs(link_wikipedia)
gloss_inv = get_paragraphs(link_investopedia)

full_glossary = sorted(list(set.union(gloss_wiki, gloss_inv)))
len(full_glossary), full_glossary

(1070,
 ['abbreviated',
  'ability',
  'able',
  'accept',
  'access',
  'accessed',
  'accessible',
  'accion',
  'according',
  'accordingly',
  'account',
  'accurate',
  'accurately',
  'act',
  'acting',
  'active',
  'actively',
  'activity',
  'actor',
  'ad',
  'adaptor',
  'adding',
  'addition',
  'additional',
  'address',
  'adhere',
  'administration',
  'adopt',
  'adoption',
  'advance',
  'advent',
  'advice',
  'adviser',
  'affair',
  'affected',
  'affirm',
  'afternoon',
  'agenda',
  'aggressive',
  'ai',
  'aim',
  'aiming',
  'algorithm',
  'alliance',
  'allow',
  'allowed',
  'allowing',
  'allows',
  'alone',
  'alphabet',
  'also',
  'alternative',
  'america',
  'among',
  'amount',
  'amsterdam',
  'analytics',
  'analyze',
  'annual',
  'another',
  'answer',
  'apace',
  'apis',
  'app',
  'application',
  'applied',
  'applies',
  'apply',
  'applying',
  'approach',
  'approval',
  'apps',
  'april',
  'area',
  'around',
  'art',
  'artificial',
  'asi

In [37]:
# pickle the glossary for later use
# path_linux = '/home/marcelo/Dropbox/Marcelo/Mestrado/NCI/Data_Mining_and_Machine_Learning_2/project/pickles'
# path_windows = "C:\\Users\\marce\\Dropbox\\Marcelo\\Mestrado\\NCI\\Data_Mining_and_Machine_Learning_2\\project\\pickles"
# with open(path_windows + '\\glossary_lemmed.pkl', 'wb') as f:
#     pickle.dump(full_glossary, f)

# Some alternative ways

## <center>Wikipedia Content</center>

In [2]:
# Easy way using the wikipedia package, but it works only 
# for Wikipedia

######## WEB SCRAPING ########
# Get a page from it's title
page = wikipedia.page(title='Financial technology - Wikipedia')
text_wikipedia = page.content
text_wikipedia

######## NORMALIZATION ########

# Consider only letters and numbers and make the whole text lower case
text_wikipedia = re.sub(r"[^a-zA-Z0-9]", " ", text_wikipedia.lower())
# Get rid of numbers
text_wikipedia = re.sub('\d*', '', text_wikipedia)

######## TOKENIZATION AND STOPWORDS REMOVAL ########
# Tokenize the text, which means to make each word a token
wikipedia_glossary = word_tokenize(text_wikipedia)

# Remove stop words
wikipedia_glossary = [w for w in wikipedia_glossary if w not in stopwords.words("english")]

# Since we want to have a glossary of words it is reasonable to consider
# only unique words. Therefore I'll make wiki_glossary a set.
wikipedia_glossary = set(wikipedia_glossary)

In [5]:
######## WEB SCRAPING ########
link_wikipedia = 'https://en.wikipedia.org/wiki/Financial_technology'

response_wiki = requests.get(link_wikipedia)

# Make the soup out of the HTML text
soup = BeautifulSoup(response_wiki.text, 'lxml')
# print(soup.prettify())

# Go through each text (string) in the whole soup. For each
# of them get the parent name and store in a set. The result is
# a set of all the tags in the soup. Our objective is to get the
# paragraphs, tags 'p'.
# print(set([text.parent.name for text in soup.find_all(string=True)]))

# Extract the plain text from paragraphs
text_requests_wiki = ''
for paragraph in soup.find_all('p'):
    text_requests_wiki += paragraph.text
    
# print(text_requests_wiki)

######## NORMALIZATION ########

# Consider only letters and numbers and make the whole text lower case
text_requests_wiki = re.sub(r"[^a-zA-Z0-9]", " ", text_requests_wiki.lower())
# Get rid of numbers
text_requests_wiki = re.sub('\d*', '', text_requests_wiki)

######## TOKENIZATION AND STOPWORDS REMOVAL ########

# Tokenize the text, which means to make each word a token. It is basically
# a more fancy split
wiki_glossary = word_tokenize(text_requests_wiki)

# Remove stop words
wiki_glossary = [w for w in wiki_glossary if w not in stopwords.words("english")]

# Since we want to have a glossary of words it is reasonable to consider
# only unique words. Therefore I'll make wiki_glossary a set.
wiki_glossary = set(wiki_glossary)

In [6]:
# Make the union of both sets created earlier
combined_glossary_wikipedia = wikipedia_glossary.union(wiki_glossary)

## <center>Investopedia Content</center>

In [7]:
######## WEB SCRAPING ########
link_investopedia = 'https://www.investopedia.com/terms/f/fintech.asp'

response_inv = requests.get(link_investopedia)

# Make the soup out of the HTML text
soup = BeautifulSoup(response_inv.text, 'lxml')
# print(soup.prettify())

# Go through each text (string) in the whole soup. For each
# of them get the parent name and store in a set. The result is
# a set of all the tags in the soup. Our objective is to get the
# paragraphs, tags 'p'.
# print(set([text.parent.name for text in soup.find_all(string=True)]))

# Extract the plain text from paragraphs
text_requests_inv = ''
for paragraph in soup.find_all('p'):
    text_requests_inv += paragraph.text
    
# print(text_requests_inv)

######## NORMALIZATION ########

# Consider only letters and numbers and make the whole text lower case
text_requests_inv = re.sub(r"[^a-zA-Z0-9]", " ", text_requests_inv.lower())
# Get rid of numbers
text_requests_inv = re.sub('\d*', '', text_requests_inv)

######## TOKENIZATION AND STOPWORDS REMOVAL ########

# Tokenize the text, which means to make each word a token. It is basically
# a more fancy split
inv_glossary = word_tokenize(text_requests_inv)

# Remove stop words
inv_glossary = [w for w in inv_glossary if w not in stopwords.words("english")]

# Since we want to have a glossary of words it is reasonable to consider
# only unique words. Therefore I'll make inv_glossary a set.
inv_glossary = set(inv_glossary)

In [8]:
# Make the union of all the sets created so far to make the final one
complete_glossary_wikipedia = set.union(wiki_glossary, inv_glossary, wikipedia_glossary)

In [9]:
len(complete_glossary_wikipedia), len(combined_glossary_wikipedia), len(wiki_glossary), len(wikipedia_glossary), len(inv_glossary)

(1191, 676, 640, 674, 726)