In [59]:
import spacy
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
import re
from bs4 import BeautifulSoup
import unicodedata

#nlp = spacy.load('en_core', parse=True, tag=True, entity=True)
#nlp_vec = spacy.load('en_vecs', parse = True, tag=True, entity=True)
tokenizer = ToktokTokenizer()
stopword_list = stopwords.words('english')
stopword_list.remove('no')
stopword_list.remove('not')

In [68]:
def normalize_corpus(corpus, html_stripping=True, contraction_expansion=True,
                     accented_char_removal=True, text_lower_case=True, 
                     text_lemmatization=True, special_char_removal=True, 
                     stopword_removal=True, remove_digits=True):
    
    normalized_corpus = []
    # normalize each document in the corpus
    for doc in corpus:
        # strip HTML
        if html_stripping:
            doc = strip_html_tags(doc)
        # remove accented characters
        if accented_char_removal:
            doc = remove_accented_chars(doc)
        # lowercase the text    
        if text_lower_case:
            doc = doc.lower()
        # remove extra newlines
        doc = re.sub(r'[\r|\n|\r\n]+', ' ',doc)
        # remove special characters and\or digits    
        if special_char_removal:
            # insert spaces between special characters to isolate them    
            special_char_pattern = re.compile(r'([{.(-)!}])')
            doc = special_char_pattern.sub(" \\1 ", doc)
            doc = remove_special_characters(doc, remove_digits=remove_digits)  
        # remove extra whitespace
        doc = re.sub(' +', ' ', doc)
        # remove stopwords
        if stopword_removal:
            doc = remove_stopwords(doc, is_lower_case=text_lower_case)
            
        normalized_corpus.append(doc)
        
    return normalized_corpus

In [69]:
from urllib import request
url = "https://www.uva.nl/binaries/content/assets/programmas/information-studies/txt-for-assignment-data-science.txt"
response = request.urlopen(url)
raw = response.read().decode('utf8')
type(raw)

str

In [70]:
def strip_html_tags(text):
    soup = BeautifulSoup(text, "html.parser")
    stripped_text = soup.get_text()
    return stripped_text

strip_html_tags('<html><h2>Some important text</h2></html>')

'Some important text'

In [71]:
def remove_accented_chars(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text

remove_accented_chars('Sómě Áccěntěd těxt')

'Some Accented text'

In [72]:
def remove_special_characters(text, remove_digits=False):
    pattern = r'[^a-zA-z0-9\s]' if not remove_digits else r'[^a-zA-z\s]'
    text = re.sub(pattern, '', text)
    return text

remove_special_characters("Well this was fun! What do you think? 123#@!", 
                          remove_digits=True)

'Well this was fun What do you think '

In [73]:
def simple_stemmer(text):
    ps = nltk.porter.PorterStemmer()
    text = ' '.join([ps.stem(word) for word in text.split()])
    return text

simple_stemmer("My system keeps crashing his crashed yesterday, ours crashes daily")

'My system keep crash hi crash yesterday, our crash daili'

In [66]:
def lemmatize_text(text):
    text = nlp(text)
    text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
    return text

lemmatize_text("My system keeps crashing! his crashed yesterday, ours crashes daily")

NameError: name 'nlp' is not defined

In [75]:
def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text

remove_stopwords("The, and, if are stopwords, computer is not")

', , stopwords , computer not'

In [81]:
articles = normalize_corpus(raw)
len(articles)

18613

In [103]:
text = []
collection = open('uva_text.txt', 'r')
text = collection.read()
collection.close()

In [104]:
text = re.sub('<doc>', ' ', text)
text = text.split('</doc>')[:-1]

In [105]:
text[1]

'\n \n<docno> LA010189-0013 </docno>\n<docid> 31 </docid>\n<date>\n<p>\nJanuary 1, 1989, Sunday, Home Edition \n</p>\n</date>\n<section>\n<p>\nBook Review; Page 10; Book Review Desk \n</p>\n</section>\n<length>\n<p>\n146 words \n</p>\n</length>\n<headline>\n<p>\nCURRENT PAPERBACKS: WAITING FOR CHILDHOOD BY SUMNER LOCKE ELLIOTT (PERENNIAL \nLIBRARY/ HARPER &amp; ROW: $7.95) \n</p>\n</headline>\n<byline>\n<p>\nBy ELENA BRUNET \n</p>\n</byline>\n<text>\n<p>\nSet in Australia at the turn of the 20th Century, "Waiting for Childhood" is \nthe story of seven children left to cope for themselves after their parents \ndie. Their father, The Rev. William Lord, expires at the breakfast table one \nmorning. After the family leaves for a ramshackle house owned by a wealthy \ncousin, the mother loses her mind and then her life in an accident. \n</p>\n<p>\nThe eldest daughter, Lily, takes charge of the entire household, as Jess \nbecomes a favorite of her rich cousin Jackie and watches her rival for 

In [106]:
for i in range(len(text)):
	# Seperate article items based on tags
	text[i] = re.sub('<.?p>|</.*?>', ' ', text[i])
	text[i] = re.sub('\n', '', text[i])
	text[i] = re.sub('> ', ' <', text[i])
	text[i] = text[i][2:]
	text[i] = text[i].split(' <')

In [101]:
text

[[''], [''], ['']]