In [3]:
"""
Converts Abstract text into a list of terms
"""

import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer


class Preprocessor(object):
    """
    Use nlp techniques to process abstract text.

    The case of the text is lowered and the punctuation is removed.
    The words are lemmatized.
    Every occurrence of the keyword phrase is removed from the text and a
    single ``*keywords*`` marker token is appended to each document instead.

    Args:
        keywords(str): The keyword term from HPO phenotype; it is
            lower-cased before use
        corpus(str): The path to corpus, a CSV file whose first column is
            the index and which has an ``AB`` column holding the abstracts
    """
    def __init__(self, keywords, corpus):
        self.keywords = keywords.lower()
        self.corpus = corpus
        # (stopwords, lemmatizer, tokenizer), built on first use by _nlp_tools
        self._tools = None
        self.abstracts = self.load()

    def load(self):
        """
        Read in corpus

        The ``AB`` column is lower-cased and every literal occurrence of the
        keyword phrase is removed from it.

        Returns:
            abstracts(pandas.DataFrame): the corpus with the cleaned ``AB``
            column
        """
        abstracts = pd.read_csv(self.corpus, index_col=0)
        # Rows without an abstract are read as NaN, which the tokenizer in
        # preprocess() cannot handle; treat them as empty text instead.
        text = abstracts['AB'].fillna('').str.lower()
        # regex=False: the keyword is a literal phrase, so characters such as
        # "(", "+" or "." must not be interpreted as a regular expression.
        abstracts['AB'] = text.str.replace(self.keywords, '', regex=False)
        return abstracts

    def _nlp_tools(self):
        """
        Return the (stopwords, lemmatizer, tokenizer) triple, building it once.

        These objects do not depend on the abstract being processed, so they
        are created on first use and reused for every later call.
        """
        tools = getattr(self, '_tools', None)
        if tools is None:
            tools = (
                set(stopwords.words('english')),
                WordNetLemmatizer(),
                # A token is a word character followed by one or more word
                # characters or hyphens, so single-character tokens and
                # punctuation are dropped.
                RegexpTokenizer(r'\w[\w-]+'),
            )
            self._tools = tools
        return tools

    def preprocess(self, abstract):
        """
        These are the steps for normalizing the abstract text
        Convert an abstract to word tokens.  This is done by tokenizing the
        text, removing english stopwords and punctuation, and finally
        lemmatizing the words.  The ``*keywords*`` marker is appended as the
        last element.

        Args:
            abstract(str): Individual abstract

        Return:
            words(list): list of normalized words

        """
        stop_words, lemmatizer, tokenizer = self._nlp_tools()

        # tokenize words, remove punctuation
        tokens = tokenizer.tokenize(abstract)

        # Remove stopwords and lemmatize tokens
        words = [
            lemmatizer.lemmatize(word)
            for word in tokens
            if word not in stop_words
        ]
        words.append('*{}*'.format(self.keywords))
        return words

    def process(self):
        """
        Entry point into Class
        Generates list of terms in each document by applying ``preprocess``
        to every entry of the ``AB`` column loaded at construction time.

        Returns:
            documents(pandas.Series): Series of word lists, one per document

        """
        documents = self.abstracts.AB.apply(self.preprocess)
        return documents
    

In [4]:
# Build a preprocessor for the keyword 'diabetic' over 'initial_corpus.csv'
# (the constructor loads the corpus), then tokenize every abstract.
a = Preprocessor('diabetic', 'initial_corpus.csv')
a.process()

0      [document, represents, official, position, ame...
1      [background, randomised, controlled, trial, rc...
2      [background, ketoacidosis, dka, serious, compl...
3      [role, arsenic, trioxide, as2o3, inhibiting, i...
4      [study, aimed, analyze, scientific, literature...
5      [introduction, clear, unmet, clinical, need, p...
6      [aim, compare, continuous, subcutaneous, insul...
7      [background, objective, describe, frequency, c...
8      [introduction, dyslipidemia, hyperglycemia, me...
9      [introduction, nephropathy, leading, cause, mo...
10     [objective, offspring, pregnancy, affected, ge...
11     [background, prader-willi, syndrome, pws, ofte...
12     [context, lipodystrophy, syndrome, rare, disor...
13     [objective, decline, insulin, sensitivity, si,...
14     [purpose, anti-mullerian, hormone, amh, propos...
15     [worrisome, rise, pediatric, type, diabetes, t...
16     [background, congenital, hyperinsulinism, chi,...
17     [background, prospective