In [1]:
import regex as re
import requests
import wikipediaapi
from difflib import Differ 
import spacy
from spacy.lang.en import English
vocab = English()
from nltk.tokenize import sent_tokenize, word_tokenize 
import nltk
from nltk.stem.porter import *
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ID\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Parse and clean

In [2]:
def clear_html_tags(text):
    """Strip a handful of inline HTML tags from ``text``.

    Applied in this order:

    1. ``<a href=...>...</a>``  -> the link text is kept.
    2. ``<span ...>...</span>`` -> the inner text is kept.
    3. ``<sup ...>...</sup>``   -> dropped together with its content.
    4. ``<b>...</b>`` and ``<i>...</i>`` -> the inner text is kept.

    Every pattern only matches an element whose content holds no nested tag,
    so the order matters: links are unwrapped first, which lets a ``<span ...>``
    or ``<sup ...>`` that wrapped a link match in the later steps.

    Args:
        text: HTML fragment as a string.

    Returns:
        The string with the tags listed above removed.
    """
    # unwrap links, keeping the link text
    text = re.sub(r"<a href=[^>]+>([^<]+)</a>", r"\1", text)
    # unwrap containers; the attribute list is consumed up to '>' on its own so
    # that it is not captured (and kept) together with the inner text
    text = re.sub(r"<span\s[^>]*>([^<]*)</span>", r"\1", text)
    # drop superscripts that carry attributes, content included
    text = re.sub(r"<sup\s([^<]+)</sup>", "", text)
    # unwrap bold and italic tags
    text = re.sub(r"<b>([^<]+)</b>", r"\1", text)
    text = re.sub(r"<i>([^<]+)</i>", r"\1", text)

    return text

In [3]:
def parse_wiki(page="Machine_learning", max_paragraphs=40, timeout=30):
    """Download an English Wikipedia article and return its first paragraphs as text.

    The page HTML is fetched over HTTP, the remainder of every line that
    contains ``<p>`` is captured, the first ``max_paragraphs`` captures are
    passed through ``clear_html_tags`` and the results are concatenated
    without any separator.

    Args:
        page: Article title as it appears in the URL path.
        max_paragraphs: Maximum number of ``<p>`` captures to keep.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The cleaned paragraphs joined into one string.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.Timeout: if the server does not answer within ``timeout``.
    """
    response = requests.get(url=f"https://en.wikipedia.org/wiki/{page}", timeout=timeout)
    response.raise_for_status()  # fail loudly instead of parsing an error page

    # '.' does not match a newline, so each capture is the rest of the line after '<p>'
    paragraphs = re.findall(r"<p>(.*)\n", response.text)[:max_paragraphs]

    return "".join(clear_html_tags(paragraph) for paragraph in paragraphs)

In [4]:
# Fetch the default article ("Machine_learning") and strip its inline HTML tags
parsed_text = parse_wiki()

In [5]:
# Save the cleaned text. UTF-8 is requested explicitly because the platform
# default codec (e.g. cp1252 on Windows) cannot encode every character that
# occurs in the article, such as 'ℵ'.
with open("parsed_text.txt", "w", encoding="utf-8") as f:
    f.write(parsed_text)

### Compare with wiki

In [6]:
def find_wiki_sections(sections, text, level=0):
    """Append the text of every section and subsection to ``text``.

    Sections are visited depth-first: a section's own text comes first,
    followed by the text of its subsections, then the next sibling. Nothing is
    inserted between the pieces.

    Args:
        sections: Iterable of objects exposing ``.text`` and ``.sections``.
        text: String the collected section text is appended to.
        level: Nesting depth; only passed along to the recursive calls.

    Returns:
        ``text`` followed by all collected section text.
    """
    pieces = [text]
    for section in sections:
        pieces.append("%s" % section.text)
        # Collect the subtree on its own and splice it in right after its parent
        pieces.append(find_wiki_sections(section.sections, "", level + 1))
    return "".join(pieces)

In [7]:
def create_wiki_text(page_name = "Machine learning"):
    """Build the article text through the ``wikipediaapi`` client.

    The page summary is taken first and the text of every section and
    subsection is appended to it with ``find_wiki_sections``. Afterwards every
    span that starts with ':' plus a hair space (U+200A), continues with
    characters other than '.', and ends with another hair space is removed.

    Args:
        page_name: Title of the English Wikipedia page to load.

    Returns:
        The assembled article text as one string.
    """
    client = wikipediaapi.Wikipedia(
        user_agent='Wiki parser',
        language='en',
        extract_format=wikipediaapi.ExtractFormat.WIKI,
    )
    page = client.page(page_name)

    summary = page.summary
    full_text = find_wiki_sections(page.sections, summary)

    # remove references
    return re.sub(":\u200a([^.]+)\u200a", "", full_text)

In [8]:
def split_by_sentences(text):
    """Split ``text`` into sentences with a single regular expression.

    A split happens at a whitespace character that directly follows '.', '?'
    or '!'. The negative lookbehind suppresses the split when the five
    characters before that whitespace are: word character, '.', whitespace,
    word character, any character -- the shape of the tail of 'e. g.'.

    Args:
        text: String to split.

    Returns:
        List of sentence strings; the whitespace at each split point is dropped.
    """
    boundary = re.compile(r"(?<!\w\.\s\w.)(?<=\.|\?|\!)\s")
    return boundary.split(text)

In [9]:
def compare_texts(parsed_text, real_text, max_sentences=40):
    """Print a line-by-line diff of the leading sentences of two texts.

    Each text gets a space inserted after every '.' or ',' that is directly
    followed by a non-whitespace character, is split with
    ``split_by_sentences`` and cut to ``max_sentences`` sentences. The results
    are written one sentence per line to ``parsed_text.txt`` and
    ``real_text.txt`` (existing files are overwritten), read back, and
    compared with ``difflib.Differ``.

    Args:
        parsed_text: First text (shown as the '-' side of the diff).
        real_text: Second text (shown as the '+' side of the diff).
        max_sentences: Number of leading sentences of each text to compare.

    Returns:
        None. The diff is printed and the two files are left on disk.
    """
    def first_sentences(text):
        # add space after '.' or ',', except if there is one already
        text = re.sub(r"(?<=[.,])(?=[^\s])", r" ", text)
        return "\n".join(split_by_sentences(text)[:max_sentences])

    parsed_text = first_sentences(parsed_text)
    real_text = first_sentences(real_text)

    # UTF-8 is requested explicitly: the platform default codec may be unable
    # to encode characters that occur in the article text.
    with open("parsed_text.txt", "w", encoding="utf-8") as f:
        f.write(parsed_text)
    with open("real_text.txt", "w", encoding="utf-8") as f:
        f.write(real_text)

    with open("parsed_text.txt", encoding="utf-8") as file_1, \
            open("real_text.txt", encoding="utf-8") as file_2:
        differ = Differ()
        for line in differ.compare(file_1.readlines(), file_2.readlines()):
            # Diff lines keep the newline read from the file; strip it so that
            # print() does not emit an empty line after every entry.
            print(line.rstrip("\n"))

In [10]:
# Diff the scraped text against the text assembled through the wikipediaapi client
compare_texts(parsed_text, create_wiki_text())

  Machine learning (ML) is a field of study in artificial intelligence concerned with the development and study of statistical algorithms that can learn from data and generalize to unseen data, and thus perform tasks without explicit instructions.

  Recently, artificial neural networks have been able to surpass many previous approaches in performance.

  Machine learning approaches have been applied to many fields including natural language processing, computer vision, speech recognition, email filtering, agriculture, and medicine.

  ML is known in its application across business problems under the name predictive analytics.

  Although not all machine learning is statistically based, computational statistics is an important source of the field's methods.

  The mathematical foundations of ML are provided by mathematical optimization (mathematical programming) methods.

  Data mining is a related (parallel) field of study, focusing on exploratory data analysis (EDA) through unsupervi

### Splitting sentences

In [11]:
# Insert a space after every '.' or ',' that is directly followed by a
# non-whitespace character, except when the '.' comes right after '(' (as in
# "C(.)"). This rebinds parsed_text in place; running the cell again changes
# nothing further because the pattern no longer matches.
parsed_text = re.sub(r"(?<!\(\.)(?<=[.,])(?=[^\s])", r" ", parsed_text)

In [12]:
# Load the "en_core_web_sm" spaCy pipeline and run it over the whole cleaned text;
# `doc` is reused by the sentence, token and lemma cells below.
nlp = spacy.load("en_core_web_sm")
doc = nlp(parsed_text)

In [13]:
# Sentences found by the regex-based splitter, and how many there are
sent_regex = split_by_sentences(parsed_text)
print(len(sent_regex))

123


In [14]:
# Sentences found by NLTK's sent_tokenize, and how many there are
sent_nltk = sent_tokenize(parsed_text)
print(len(sent_nltk))

122


In [15]:
# Sentences found by the spaCy pipeline (converted to plain strings), and how many there are
sent_spacy = [str(sent) for sent in doc.sents]
print(len(sent_spacy))

121


#### Differences

In [16]:
# spaCy: in the recorded output the sentence with the two quoted questions stays in one piece.
# The index is specific to the page text fetched for this run.
sent_spacy[21:22]

['This follows Alan Turing\'s proposal in his paper "Computing Machinery and Intelligence", in which the question "Can machines think?" is replaced with the question "Can machines do what we (as thinking entities) can do?".']

In [17]:
# regex splitter: the same sentence, also kept in one piece in the recorded output
sent_regex[22:23]

['This follows Alan Turing\'s proposal in his paper "Computing Machinery and Intelligence", in which the question "Can machines think?" is replaced with the question "Can machines do what we (as thinking entities) can do?".']

In [18]:
# NLTK: in the recorded output the same sentence is split after the first quoted question
sent_nltk[20:22]

['This follows Alan Turing\'s proposal in his paper "Computing Machinery and Intelligence", in which the question "Can machines think?"',
 'is replaced with the question "Can machines do what we (as thinking entities) can do?".']

In [19]:
# NLTK: in the recorded output the quoted definition and the sentence after it form one item
sent_nltk[19:20]

['Tom M. Mitchell provided a widely quoted, more formal definition of the algorithms studied in the machine learning field: "A computer program is said to learn from experience E with respect to some class of tasks T and performance measure P if its performance at tasks in T, as measured by P,  improves with experience E. " This definition of the tasks in which machine learning is concerned offers a fundamentally operational definition rather than defining the field in cognitive terms.']

In [20]:
# spaCy: in the recorded output the sentence ends at the closing quote of the definition
sent_spacy[19:20]

['Tom M. Mitchell provided a widely quoted, more formal definition of the algorithms studied in the machine learning field: "A computer program is said to learn from experience E with respect to some class of tasks T and performance measure P if its performance at tasks in T, as measured by P,  improves with experience E. "']

In [21]:
# regex splitter: in the recorded output it breaks after the initial in "Tom M."
sent_regex[19:21]

['Tom M.',
 'Mitchell provided a widely quoted, more formal definition of the algorithms studied in the machine learning field: "A computer program is said to learn from experience E with respect to some class of tasks T and performance measure P if its performance at tasks in T, as measured by P,  improves with experience E.']

In [22]:
# regex splitter: in the recorded output the sentence containing "C(.)" stays whole
sent_regex[46]

'For each compressor C(.) we define an associated vector space ℵ, such that C(.) maps an input string x, corresponding to the vector norm ||~x||.'

In [23]:
# NLTK: in the recorded output the sentence is cut right after "C(.)"
sent_nltk[45]

'For each compressor C(.)'

In [24]:
# spaCy: in the recorded output the sentence is cut right after "C(.)" as well
sent_spacy[45]

'For each compressor C(.)'

### Tokenization

In [25]:
def split_by_words(text):
    """Return every run of word characters in ``text``, in order of appearance.

    Punctuation, brackets and whitespace act as separators and are not
    returned, so "field's" yields "field" and "s".

    Args:
        text: String to tokenize.

    Returns:
        List of word strings; empty if ``text`` has no word characters.
    """
    word_pattern = re.compile(r"([\w]+)")
    return word_pattern.findall(text)

In [26]:
# Tokenize the first regex-split sentence with the regex tokenizer (punctuation is dropped)
word_regex = split_by_words(sent_regex[0])
print(word_regex) 
print(len(word_regex))

['Machine', 'learning', 'ML', 'is', 'a', 'field', 'of', 'study', 'in', 'artificial', 'intelligence', 'concerned', 'with', 'the', 'development', 'and', 'study', 'of', 'statistical', 'algorithms', 'that', 'can', 'learn', 'from', 'data', 'and', 'generalize', 'to', 'unseen', 'data', 'and', 'thus', 'perform', 'tasks', 'without', 'explicit', 'instructions']
37


In [27]:
# Tokenize the first NLTK sentence with NLTK's word_tokenize; in the recorded
# output brackets, commas and the final period appear as separate tokens
word_nltk = word_tokenize(sent_nltk[0])
print(word_nltk) 
print(len(word_nltk))

['Machine', 'learning', '(', 'ML', ')', 'is', 'a', 'field', 'of', 'study', 'in', 'artificial', 'intelligence', 'concerned', 'with', 'the', 'development', 'and', 'study', 'of', 'statistical', 'algorithms', 'that', 'can', 'learn', 'from', 'data', 'and', 'generalize', 'to', 'unseen', 'data', ',', 'and', 'thus', 'perform', 'tasks', 'without', 'explicit', 'instructions', '.']
41


In [28]:
# NOTE(review): this rebinds sent_spacy from the list of strings built in the
# sentence-splitting section to a list of spaCy sentence objects, so cells that
# index sent_spacy show different values depending on execution order.
sent_spacy = list(doc.sents)
first_sentence = sent_spacy[0]
word_spacy = [tok.text for tok in first_sentence]
print(word_spacy)
print(len(word_spacy))

['Machine', 'learning', '(', 'ML', ')', 'is', 'a', 'field', 'of', 'study', 'in', 'artificial', 'intelligence', 'concerned', 'with', 'the', 'development', 'and', 'study', 'of', 'statistical', 'algorithms', 'that', 'can', 'learn', 'from', 'data', 'and', 'generalize', 'to', 'unseen', 'data', ',', 'and', 'thus', 'perform', 'tasks', 'without', 'explicit', 'instructions', '.']
41


### Text normalization with spaCy

In [29]:
# Display the processed document; the recorded output shows it rendered as the article text
doc

Machine learning (ML) is a field of study in artificial intelligence concerned with the development and study of statistical algorithms that can learn from data and generalize to unseen data, and thus perform tasks without explicit instructions. Recently, artificial neural networks have been able to surpass many previous approaches in performance. Machine learning approaches have been applied to many fields including natural language processing, computer vision, speech recognition, email filtering, agriculture, and medicine. ML is known in its application across business problems under the name predictive analytics. Although not all machine learning is statistically based, computational statistics is an important source of the field's methods. The mathematical foundations of ML are provided by mathematical optimization (mathematical programming) methods. Data mining is a related (parallel) field of study, focusing on exploratory data analysis (EDA) through unsupervised learning. From a

In [30]:
# Tokenized text: iterating over `doc` yields one token per word or punctuation mark
print(list(doc))

[Machine, learning, (, ML, ), is, a, field, of, study, in, artificial, intelligence, concerned, with, the, development, and, study, of, statistical, algorithms, that, can, learn, from, data, and, generalize, to, unseen, data, ,, and, thus, perform, tasks, without, explicit, instructions, ., Recently, ,, artificial, neural, networks, have, been, able, to, surpass, many, previous, approaches, in, performance, ., Machine, learning, approaches, have, been, applied, to, many, fields, including, natural, language, processing, ,, computer, vision, ,, speech, recognition, ,, email, filtering, ,, agriculture, ,, and, medicine, ., ML, is, known, in, its, application, across, business, problems, under, the, name, predictive, analytics, ., Although, not, all, machine, learning, is, statistically, based, ,, computational, statistics, is, an, important, source, of, the, field, 's, methods, ., The, mathematical, foundations, of, ML, are, provided, by, mathematical, optimization, (, mathematical, prog

In [31]:
# Lemmatized text: the lemma string of every token in the document
lemmas = [tok.lemma_ for tok in doc]
print(lemmas)

['machine', 'learning', '(', 'ML', ')', 'be', 'a', 'field', 'of', 'study', 'in', 'artificial', 'intelligence', 'concern', 'with', 'the', 'development', 'and', 'study', 'of', 'statistical', 'algorithm', 'that', 'can', 'learn', 'from', 'datum', 'and', 'generalize', 'to', 'unseen', 'datum', ',', 'and', 'thus', 'perform', 'task', 'without', 'explicit', 'instruction', '.', 'recently', ',', 'artificial', 'neural', 'network', 'have', 'be', 'able', 'to', 'surpass', 'many', 'previous', 'approach', 'in', 'performance', '.', 'machine', 'learning', 'approach', 'have', 'be', 'apply', 'to', 'many', 'field', 'include', 'natural', 'language', 'processing', ',', 'computer', 'vision', ',', 'speech', 'recognition', ',', 'email', 'filtering', ',', 'agriculture', ',', 'and', 'medicine', '.', 'ML', 'be', 'know', 'in', 'its', 'application', 'across', 'business', 'problem', 'under', 'the', 'name', 'predictive', 'analytic', '.', 'although', 'not', 'all', 'machine', 'learning', 'be', 'statistically', 'base', ',

In [32]:
# Text without stop words: keep a lemma only if its vocabulary entry is not flagged as a stop word
normalized_tokens = [lemma for lemma in lemmas if not nlp.vocab[lemma].is_stop]
print(normalized_tokens)

['machine', 'learning', '(', 'ML', ')', 'field', 'study', 'artificial', 'intelligence', 'concern', 'development', 'study', 'statistical', 'algorithm', 'learn', 'datum', 'generalize', 'unseen', 'datum', ',', 'perform', 'task', 'explicit', 'instruction', '.', 'recently', ',', 'artificial', 'neural', 'network', 'able', 'surpass', 'previous', 'approach', 'performance', '.', 'machine', 'learning', 'approach', 'apply', 'field', 'include', 'natural', 'language', 'processing', ',', 'computer', 'vision', ',', 'speech', 'recognition', ',', 'email', 'filtering', ',', 'agriculture', ',', 'medicine', '.', 'ML', 'know', 'application', 'business', 'problem', 'predictive', 'analytic', '.', 'machine', 'learning', 'statistically', 'base', ',', 'computational', 'statistic', 'important', 'source', 'field', 'method', '.', 'mathematical', 'foundation', 'ML', 'provide', 'mathematical', 'optimization', '(', 'mathematical', 'programming', ')', 'method', '.', 'datum', 'mining', 'relate', '(', 'parallel', ')', '

In [33]:
# Text after stemming: Porter-stem every remaining (non-stop-word) lemma
stemmer = PorterStemmer()
stems = [stemmer.stem(lemma) for lemma in normalized_tokens]
print(stems)

['machin', 'learn', '(', 'ml', ')', 'field', 'studi', 'artifici', 'intellig', 'concern', 'develop', 'studi', 'statist', 'algorithm', 'learn', 'datum', 'gener', 'unseen', 'datum', ',', 'perform', 'task', 'explicit', 'instruct', '.', 'recent', ',', 'artifici', 'neural', 'network', 'abl', 'surpass', 'previou', 'approach', 'perform', '.', 'machin', 'learn', 'approach', 'appli', 'field', 'includ', 'natur', 'languag', 'process', ',', 'comput', 'vision', ',', 'speech', 'recognit', ',', 'email', 'filter', ',', 'agricultur', ',', 'medicin', '.', 'ml', 'know', 'applic', 'busi', 'problem', 'predict', 'analyt', '.', 'machin', 'learn', 'statist', 'base', ',', 'comput', 'statist', 'import', 'sourc', 'field', 'method', '.', 'mathemat', 'foundat', 'ml', 'provid', 'mathemat', 'optim', '(', 'mathemat', 'program', ')', 'method', '.', 'datum', 'mine', 'relat', '(', 'parallel', ')', 'field', 'studi', ',', 'focu', 'exploratori', 'datum', 'analysi', '(', 'eda', ')', 'unsupervis', 'learn', '.', 'theoret', 'vi