# The Second Data Preprocessing

In [7]:
# Built-in packages
import string
import os
import time as t
import warnings
import sys

# Basic Packages for Data Wrangling
import spacy
import pandas as pd
import numpy as np
from numpy import array
import datetime as dt

# NLTK for processing stop words
from nltk.corpus import stopwords
import nltk
from nltk.stem import PorterStemmer # Great stemmer
from nltk.stem import LancasterStemmer # over-stemming easily, more aggresive stemmer
from nltk.stem.snowball import SnowballStemmer # stemmer for non-english languages
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer # lemmatizer (词形还原)
from sklearn.feature_extraction.text import TfidfVectorizer # get tf-idf matrix
from pyinflect import getAllInflections # get inflect words

import re # For regular expression
from tqdm import tqdm # Processing bar

import gensim
from gensim.utils import simple_preprocess

In [8]:
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use', 'not', 'would', 'say', 'could', '_', 'be', 'know', 'good', 'go', 'get', 'do', 'done', 'try', 'many', 'some', 'nice', 'thank', 'think', 'see', 'rather', 'easy', 'easily', 'lot', 'lack', 'make', 'want', 'seem', 'run', 'need', 'even', 'right', 'line', 'even', 'also', 'may', 'take', 'come'])
stop_words.extend( [ "",
                    "a",
                    "able",
                    "about",
                    "above",
                    "according",
                    "accordingly",
                    "across",
                    "actually",
                    "after",
                    "afterwards",
                    "again",
                    "against",
                    "all",
                    "allow",
                    "almost",
                    "alone",
                    "along",
                    "already",
                    "also",
                    "although",
                    "always",
                    "am",
                    "among",
                    "amongst",
                    "an",
                    "and",
                    "another",
                    "any",
                    "anybody",
                    "anyhow",
                    "anyone",
                    "anything",
                    "anyway",
                    "anyways",
                    "anywhere",
                    "apart",
                    "appear",
                    "appreciate",
                    "appropriate",
                    "are",
                    "around",
                    "as",
                    "aside",
                    "ask",
                    "asking",
                    "associated",
                    "at",
                    "available",
                    "away",
                    "awfully",
                    "b",
                    "be",
                    "became",
                    "because",
                    "become",
                    "becomes",
                    "becoming",
                    "been",
                    "before",
                    "beforehand",
                    "behind",
                    "being",
                    "believe",
                    "below",
                    "beside",
                    "besides",
                    "best",
                    "better",
                    "between",
                    "beyond",
                    "both",
                    "brief",
                    "but",
                    "by",
                    "c",
                    "came",
                    "can",
                    "cannot",
                    "cant",
                    "cause",
                    "causes",
                    "certain",
                    "certainly",
                    "changes",
                    "clearly",
                    "co",
                    "com",
                    "come",
                    "comes",
                    "concerning",
                    "consequently",
                    "consider",
                    "considering",
                    "contain",
                    "containing",
                    "contains",
                    "corresponding",
                    "could",
                    "course",
                    "currently",
                    "d",
                    "definitely",
                    "described",
                    "despite",
                    "did",
                    "different",
                    "do",
                    "does",
                    "doing",
                    "done",
                    "down",
                    "downwards",
                    "during",
                    "dr",
                    "e",
                    "each",
                    "edu",
                    "eg",
                    "eight",
                    "either",
                    "else",
                    "elsewhere",
                    "enough",
                    "entirely",
                    "especially",
                    "et",
                    "etc",
                    "even",
                    "ever",
                    "every",
                    "everybody",
                    "everyone",
                    "everything",
                    "everywhere",
                    "ex",
                    "exactly",
                    "example",
                    "except",
                    "f",
                    "far",
                    "few",
                    "fifth",
                    "first",
                    "five",
                    "followed",
                    "following",
                    "follows",
                    "for",
                    "former",
                    "formerly",
                    "forth",
                    "four",
                    "from",
                    "further",
                    "furthermore",
                    "g",
                    "get",
                    "gets",
                    "getting",
                    "give",
                    "given",
                    "gives",
                    "go",
                    "goes",
                    "going",
                    "gone",
                    "got",
                    "gotten",
                    "greetings",
                    "h",
                    "had",
                    "happens",
                    "hardly",
                    "has",
                    "have",
                    "having",
                    "he",
                    "hello",
                    "help",
                    "hence",
                    "her",
                    "here",
                    "hereafter",
                    "hereby",
                    "herein",
                    "hereupon",
                    "hers",
                    "herself",
                    "hi",
                    "him",
                    "himself",
                    "his",
                    "hither",
                    "hopefully",
                    "how",
                    "howbeit",
                    "however",
                    "i",
                    "ie",
                    "if",
                    "ignored",
                    "immediate",
                    "in",
                    "inasmuch",
                    "inc",
                    "indeed",
                    "indicate",
                    "indicated",
                    "indicates",
                    "inner",
                    "insofar",
                    "instead",
                    "into",
                    "inward",
                    "is",
                    "it",
                    "its",
                    "itself",
                    "j",
                    "just",
                    "k",
                    "km",
                    "keep",
                    "keeps",
                    "kept",
                    "know",
                    "knows",
                    "known",
                    "l",
                    "last",
                    "lately",
                    "later",
                    "latter",
                    "latterly",
                    "least",
                    "less",
                    "lest",
                    "let",
                    "like",
                    "liked",
                    "likely",
                    "little",
                    "look",
                    "looking",
                    "looks",
                    "ltd",
                    "m",
                    "mainly",
                    "many",
                    "may",
                    "maybe",
                    "me",
                    "ms",
                    "mean",
                    "meanwhile",
                    "merely",
                    "might",
                    "more",
                    "moreover",
                    "most",
                    "mostly",
                    "much",
                    "must",
                    "my",
                    "myself",
                    "n",
                    "name",
                    "namely",
                    "nd",
                    "near",
                    "nearly",
                    "necessary",
                    "need",
                    "needs",
                    "neither",
                    "never",
                    "nevertheless",
                    "new",
                    "next",
                    "nine",
                    "no",
                    "nobody",
                    "non",
                    "none",
                    "noone",
                    "nor",
                    "normally",
                    "not",
                    "nothing",
                    "novel",
                    "now",
                    "nowhere",
                    "o",
                    "obviously",
                    "of",
                    "off",
                    "often",
                    "oh",
                    "ok",
                    "okay",
                    "old",
                    "on",
                    "once",
                    "one",
                    "ones",
                    "only",
                    "onto",
                    "or",
                    "other",
                    "others",
                    "otherwise",
                    "ought",
                    "our",
                    "ours",
                    "ourselves",
                    "out",
                    "outside",
                    "over",
                    "overall",
                    "own",
                    "p",
                    "particular",
                    "particularly",
                    "per",
                    "perhaps",
                    "placed",
                    "please",
                    "plus",
                    "possible",
                    "presumably",
                    "probably",
                    "provides",
                    "q",
                    "que",
                    "quite",
                    "qv",
                    "r",
                    "rather",
                    "rd",
                    "re",
                    "really",
                    "reasonably",
                    "regarding",
                    "regardless",
                    "regards",
                    "relatively",
                    "respectively",
                    "right",
                    "s",
                    "said",
                    "same",
                    "saw",
                    "say",
                    "saying",
                    "says",
                    "second",
                    "secondly",
                    "see",
                    "seeing",
                    "seem",
                    "seemed",
                    "seeming",
                    "seems",
                    "seen",
                    "self",
                    "selves",
                    "sensible",
                    "sent",
                    "serious",
                    "seriously",
                    "seven",
                    "several",
                    "shall",
                    "she",
                    "should",
                    "since",
                    "six",
                    "so",
                    "some",
                    "somebody",
                    "somehow",
                    "someone",
                    "something",
                    "sometime",
                    "sometimes",
                    "somewhat",
                    "somewhere",
                    "soon",
                    "sorry",
                    "specified",
                    "specify",
                    "specifying",
                    "still",
                    "sub",
                    "such",
                    "sup",
                    "sure",
                    "t",
                    "take",
                    "taken",
                    "tell",
                    "tends",
                    "th",
                    "than",
                    "thank",
                    "thanks",
                    "thanx",
                    "that",
                    "thats",
                    "the",
                    "their",
                    "theirs",
                    "them",
                    "themselves",
                    "then",
                    "thence",
                    "there",
                    "thereafter",
                    "thereby",
                    "therefore",
                    "therein",
                    "theres",
                    "thereupon",
                    "these",
                    "they",
                    "think",
                    "third",
                    "this",
                    "thorough",
                    "thoroughly",
                    "those",
                    "though",
                    "three",
                    "through",
                    "throughout",
                    "thru",
                    "thus",
                    "to",
                    "together",
                    "too",
                    "took",
                    "toward",
                    "towards",
                    "tried",
                    "tries",
                    "truly",
                    "try",
                    "trying",
                    "twice",
                    "two",
                    "u",
                    "un",
                    "under",
                    "unfortunately",
                    "unless",
                    "unlikely",
                    "until",
                    "unto",
                    "up",
                    "upon",
                    "us",
                    "use",
                    "used",
                    "useful",
                    "uses",
                    "using",
                    "usually",
                    "uucp",
                    "v",
                    "value",
                    "various",
                    "very",
                    "via",
                    "viz",
                    "vs",
                    "w",
                    "want",
                    "wants",
                    "was",
                    "way",
                    "we",
                    "welcome",
                    "well",
                    "went",
                    "were",
                    "what",
                    "whatever",
                    "when",
                    "whence",
                    "whenever",
                    "where",
                    "whereafter",
                    "whereas",
                    "whereby",
                    "wherein",
                    "whereupon",
                    "wherever",
                    "whether",
                    "which",
                    "while",
                    "whither",
                    "who",
                    "whoever",
                    "whole",
                    "whom",
                    "whose",
                    "why",
                    "will",
                    "willing",
                    "wish",
                    "with",
                    "within",
                    "without",
                    "wonder",
                    "would",
                    "would",
                    "x",
                    "y",
                    "yes",
                    "yet",
                    "you",
                    "your",
                    "yours",
                    "yourself",
                    "yourselves",
                    "z",
                    "zero",
                    "project",
                    "overall project summary",
                    "abstract",
                    "project description",
                    "description",
                    "summary",
                    "description provided by applicant:",
                    "overall",
                    "applicant",
                    "purpose",
                    "summaryabstract",
                    "descriptionabstract",
                    "made",
                    "highly",
                    "research",
                    "important",
                    "study",
                    "examine",
                    "questions",
                    "range",
                    "funding",
                    "funded",
                    "program",
                    "large",
                    "based",
                    "areas",
                    "high",
                    "field",
                    "show",
                    "provide",
                    "successful",
                    "application",
                    "proposal",
                    "lead",
                    "approach",
                    "closely",
                    "knowledge",
                    "continued",
                    "support",
                    "receive",
                    "method",
                    "david",
                    "greatly",
                    "seminar",
                    "face",
                    "shown",
                    "needed",
                    "area",
                    "academic",
                    "worldwide",
                    "proposed",
                    "great",
                    "goal",
                    "focus",
                    "specific",
                    "remains",
                    "essential",
                    "small",
                    "big",
                    "large",
                    "recently",
                    "supports",
                    "successfully",
                    "require",
                    "students",
                    "training",
                    "support",
                    "program",
                    'https',
                    'doi',
                    'org',
                    'www',
                    "http",
                    "identify",
                    "function",
                    "call",
                    "measure",
                    "understand",
                    "china",
                    "uk",
                    "easet",
                    "north",
                    "africa",
                    "india",
                    "west",
                    "south",
                    "january",
                    "february",
                    "march",
                    "april",
                    "may",
                    "june",
                    "july",
                    "august",
                    "september",
                    "october",
                    "november",
                    "december",
                    "measure",
                    "call",
                    "local",
                    "investigate",
                    "english",
                    "chinese",
                    "apply",
                    "cover",
                    "programme",
                    "propose",
                    "award",
                    "enable",
                    "participation",
                    "decide",
                    "pursue",
                    "work",
                    "survey",
                    "underly",
                    "objective",
                    "contact",
                    "progress",
                    "due",
                    "extend",
                    "induce",
                    "test",
                    "count",
                    "detected",
                    "recent", "research", 'project', "study", "cell", "model",
                    "th", "nd", "rd",
                    "i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "lt",
                    "relate", "data", "significant", "develop", "make", "base"
                    "oxford", "programme", "japanese", "durham", "aim", "include", 
                    "british", "london", "sector", "affect", "sheet", "aspect", "main", 'increase', 'input', 'vast', 'amount'
                    "detail", 'future', 'dataset', 'unique', 'linkage', "predict", 'channel', 'experience', 
                    "bed", "poorly", 'estimate'])
stop_words = set(stop_words)

In [9]:
def stemmed_stop_words(sw = stop_words):
    port = PorterStemmer()
    return set([port.stem(token) for token in stop_words])

stemmed_stop_words = stemmed_stop_words()

def save_word_list(path, word_list):
    # Save high- and low- freqency words
    with open(path, 'w') as filehandle:
        for listitem in word_list:
            filehandle.write('%s\n' % listitem)

def sent_to_words(sentences):
    for txt in tqdm(sentences):
        txt = re.sub('&[A-Za-z]+;[A-Za-z]+;[A-Za-z]+;', ' ', txt)
        txt = re.sub('&[A-Za-z]+;[A-Za-z]+;', ' ', txt)
        txt = re.sub('&[A-Za-z]+;', ' ', txt)
        txt = re.sub('[^A-Za-z\s]', ' ', txt)
        txt = gensim.utils.simple_preprocess(str(txt), deacc=True) 
        yield(txt)  
        
# !python3 -m spacy download en  # run in terminal once
def process_words(texts, stop_words=stop_words, stemmed_stop_words = stemmed_stop_words):
    """Remove Stopwords, Form Bigrams, Trigrams and Lemmatization"""
    print("Starting to remove stopwords...")
    st = t.time()
    texts = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]
    print("Stopwords removed, it takes {} seconds to run.".format(t.time() - st))
    print("Starting to create bigrams and trigrams...")
    st = t.time()
    texts = [bigram_mod[doc] for doc in texts]
    texts = [trigram_mod[bigram_mod[doc]] for doc in texts]
    print("Bigrams and trigrams created, it takes {} seconds to run.".format(t.time() - st))
    print("Starting to stem words...")
    st = t.time()
    texts_out = []
    for sent in tqdm(texts):
#         doc = (" ".join(sent))
        port = PorterStemmer()
        texts_out.append([port.stem(token) for token in sent])
    print("Words lemmatized, it takes {} seconds to run.".format(t.time() - st))
    print("Starting to remove stop words after stemming...")
    # remove stopwords once more after stemming
    texts_out_stemmed_noStopWords = []
    for doc_words in tqdm(texts_out):
#         texts_out = [[word for word in doc_words if word not in stemmed_stop_words] for doc_words in texts_out]   
        texts_out_stemmed_noStopWords.append([word for word in doc_words if word not in stemmed_stop_words])
    print("All finished!")
    return texts_out_stemmed_noStopWords

In [10]:
def remove_short_text(data, numWords=5):
    col = data['data_ready'].apply(lambda s: True if len(s) > numWords else False).tolist()
    data = data[data.index.isin([i for i, x in enumerate(col) if x])]
    data.reset_index(drop = True, inplace = True)
    return data

def read_word_list(path = '../Data/unimportant_stemmed_words_ALL.txt'):
    with open(path, 'r') as f:
        content = f.readlines()
    # you may also want to remove whitespace characters like `\n` at the end of each line
    return [x.replace("\n", '').strip() for x in content]

In [11]:
df = pd.read_csv("../Data/Cleaned_Data.csv")
data = df.abstract.tolist()

In [12]:
# Convert to list
data_words = list(sent_to_words(data))

100%|██████████████████████████████████████████████████████████████████████████████████████████████| 145787/145787 [02:05<00:00, 1160.59it/s]


In [13]:
len(data_words)

145787

In [14]:
%%time
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

CPU times: user 4min 20s, sys: 5.27 s, total: 4min 25s
Wall time: 4min 25s


In [15]:
data_ready = process_words(data_words)  # processed Text Data!

Starting to remove stopwords...
Stopwords removed, it takes 79.84433794021606 seconds to run.
Starting to create bigrams and trigrams...
Bigrams and trigrams created, it takes 64.1935670375824 seconds to run.
Starting to stem words...


100%|███████████████████████████████████████████████████████████████████████████████████████████████| 145787/145787 [07:24<00:00, 327.98it/s]


Words lemmatized, it takes 444.49759316444397 seconds to run.
Starting to remove stop words after stemming...


100%|█████████████████████████████████████████████████████████████████████████████████████████████| 145787/145787 [00:08<00:00, 16854.85it/s]


All finished!


In [16]:
df["data_ready"] = data_ready
df = remove_short_text(df, 5)
data_ready = df.data_ready.tolist()

In [26]:
df.to_csv("../Data/Cleaned_Data2.csv", index = False)

In [27]:
doc_ready = [" ".join(wl) for wl in data_ready]

In [28]:
save_word_list(path = '../Data/data_ready/Cleaned_Documents.txt', word_list = doc_ready)

In [29]:
len(data_ready)

145712

In [21]:
len([word for wl in data_ready for word in wl])

24944683

In [22]:
all_unique_words = list(set([word for wl in data_ready for word in wl]))
len(all_unique_words)

210655