In [4]:
import pandas as pd
import numpy as np
import spacy
import os
import nltk
from nltk.tokenize.treebank import TreebankWordDetokenizer
import tqdm

Load the language model.

In [5]:
# Load the spaCy pipeline named 'en_core_web_lg'; every later cell that
# tokenises, lemmatises or touches stop words goes through this `nlp` object.
nlp = spacy.load('en_core_web_lg')

Add the custom stop words to spaCy's default stop-word set.

In [None]:
def find(name, path):
    """Locate a file called `name` somewhere below the directory `path`.

    The tree is traversed with os.walk and the search stops at the first
    directory that contains the file.

    Returns the joined path of that first match, or None when no directory
    under `path` holds a file with that name.
    """
    for folder, _subfolders, filenames in os.walk(path):
        if name not in filenames:
            continue
        return os.path.join(folder, name)
    return None

In [6]:
# Search the C: drive for the custom stop-word list and load it.
# `find` returns None when the file is not found, so stop here with a clear
# message instead of letting pd.read_csv fail on a None path.
path = find('customstopwords.csv','C:/')
if path is None:
    raise FileNotFoundError("customstopwords.csv was not found under C:/")
stopwords = pd.read_csv(path)

In [7]:
# Keep only the 'words' column, as a plain Python list.
# NOTE(review): this rebinds `stopwords` from the DataFrame read in the
# previous cell to a list, so re-running this cell on its own fails
# (a list cannot be indexed with 'words').
stopwords = list(stopwords['words'])

In [8]:
# Add the custom words to the pipeline's default stop-word collection.
# NOTE(review): any non-string entries in the list (e.g. empty CSV cells) are
# added as well — presumably why the next cell checks for str; confirm.
nlp.Defaults.stop_words.update(stopwords)

In [9]:
# Mark the vocabulary entry of every default stop word as a stop word.
# Entries whose type is not exactly `str` are skipped.
for word in nlp.Defaults.stop_words:
    if type(word) is not str:
        continue
    nlp.vocab[word].is_stop = True

Load the website data that has been matched to the deal metadata.

In [10]:
# Search the C: drive for the matched website data and load it.
# `find` returns None when the workbook is not found, so stop here with a
# clear message instead of passing None on to pd.read_excel.
path = find('ProcessedMatchedAll.xlsx', 'C:/')
if path is None:
    raise FileNotFoundError("ProcessedMatchedAll.xlsx was not found under C:/")
scraper = pd.read_excel(path)

In [11]:
# Discard rows whose 'text' is missing, then renumber the index from 0.
scraper = scraper.dropna(subset=['text']).reset_index(drop=True)

In [12]:
# Display (rows, columns) of the frame after rows with missing text were dropped.
scraper.shape

(105264, 25)

Drop rows without text.

In [13]:
# NOTE(review): rows with a missing 'text' were already removed by the earlier
# dropna(subset=['text']) cell, so this filter is expected to change nothing.
scraper = scraper[scraper['text'].notna()]

Get the text column as a list.

In [14]:
# Copy the 'text' column into a plain Python list; the cleaning loop iterates over it.
texts = list(scraper['text'])

Function that removes words that start with a capital letter but are not at the beginning of a sentence, such as street names, cities, first names and last names. It also drops non-alphabetic tokens and lower-cases the remaining words.

In [15]:
def cap_words(sentence):
    """Lower-case a sentence and drop capitalised words that are not its first word.

    The sentence is split with nltk.word_tokenize. A token is kept only when
    it is purely alphabetic and either is the first token or starts with a
    lower-case letter. The kept tokens are lower-cased.

    Parameters
    ----------
    sentence : str
        A single sentence.

    Returns
    -------
    str
        The kept tokens, lower-cased and joined by single spaces. This is an
        empty string when no token is kept.
    """
    tokens = nltk.word_tokenize(sentence)
    # Decide "first word" by position. list.index() returns the first
    # occurrence of a value, so a capitalised word that repeats the opening
    # token would wrongly count as sentence-initial. It also costs a linear
    # scan per token.
    kept = [
        tok.lower()
        for pos, tok in enumerate(tokens)
        if tok.isalpha() and (pos == 0 or tok[0].islower())
    ]
    return " ".join(kept)

Function that checks if the token is not a space, a stopword or punctuation.

In [16]:
def is_token_allowed(token):
    """Tell whether a spaCy token should be kept in the cleaned text.

    Parameters
    ----------
    token : spacy token (or any object exposing `text`, `is_stop`, `is_punct`)

    Returns
    -------
    bool
        False when the token is falsy, consists only of whitespace, is a stop
        word or is punctuation. True otherwise.
    """
    # `Token.string` no longer exists in spaCy v3. `.text` holds the same
    # characters without the trailing whitespace, which strip() would discard
    # anyway, so the emptiness test is unchanged on v2 and works on v3.
    if not token or not token.text.strip():
        return False
    return not (token.is_stop or token.is_punct)

Function that returns a lower case lemma of the word that has been stripped of leading and trailing spaces.

In [17]:
def preprocess_token(token):
    """Return the token's lemma with surrounding whitespace removed, in lower case."""
    lemma = token.lemma_
    trimmed = lemma.strip()
    return trimmed.lower()

Function that checks if the token does not contain a . in the middle of the token, which would indicate that it is a URL.

In [18]:
def valid_word(token):
    """Return `token` unless it has a '.' before its last character.

    The token is returned unchanged when it contains no '.', or when its
    first '.' is its final character. In every other case the result is None.
    """
    first_dot = token.find('.')
    if first_dot in (-1, len(token) - 1):
        return token
    return None

Loop over all texts. First split the text into sentences and, within each sentence, remove every capitalised word except the first one. Then tokenize the result into words and drop every token that contains a . before its last character. Finally run the text through spaCy and replace each token that is not whitespace, a stop word or punctuation with its lower-cased lemma.

In [19]:
# One detokenizer serves every document; creating it inside the loop (twice
# per document) was loop-invariant work.
detokenizer = TreebankWordDetokenizer()

scraperdocs = list()
for text in tqdm.tqdm(texts):
    # 1. Split into sentences and clean each one with cap_words.
    sentences = [cap_words(s) for s in nltk.tokenize.sent_tokenize(text)]
    # 2. Re-tokenise the joined sentences and drop tokens with a '.' before
    #    their last character. valid_word returns None for exactly those, so
    #    the helper is reused instead of repeating its test inline.
    words = [w for w in nltk.word_tokenize(" ".join(sentences)) if valid_word(w) is not None]
    # 3. Run spaCy on the rebuilt text and keep the lower-cased lemma of every
    #    token accepted by is_token_allowed.
    doc = nlp(detokenizer.detokenize(words))
    lemmas = [preprocess_token(token) for token in doc if is_token_allowed(token)]
    # 4. Store the cleaned document as a single string.
    scraperdocs.append(detokenizer.detokenize(lemmas))

100%|████████████████████████████████████████████████████████████████████████| 105264/105264 [1:09:07<00:00, 25.38it/s]


Add the cleaned documents as a new column.

In [37]:
# Attach the cleaned documents as a new 'clean_text' column.
# `scraperdocs` holds one entry per element of `texts`, which was built from
# scraper['text'], so the list lines up with the rows as long as `scraper`
# has not been changed since `texts` was created.
scraper['clean_text'] = scraperdocs

Drop empty values and reset the indices. Then save to Excel.

In [38]:
# A document with no surviving tokens comes out of the cleaning loop as an
# empty string, not NaN, so dropna alone would keep it. Remove both missing
# and blank 'clean_text' values, then renumber the index from 0.
clean_text = scraper['clean_text']
has_content = clean_text.notna() & clean_text.astype(str).str.strip().ne('')
scraper = scraper[has_content].reset_index(drop=True)

In [39]:
# Write the cleaned frame to 'ProcessedCleanedScraper.xlsx' (relative path, so
# it lands in the current working directory).
# NOTE(review): `index=False` is not passed, so the DataFrame index is written
# as an extra first column — confirm that downstream readers expect it.
scraper.to_excel('ProcessedCleanedScraper.xlsx')