In [2]:
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string

class TextPreprocessor:
    """Turn raw text into a list of stemmed, filtered tokens.

    The pipeline removes hyperlinks and ``#`` signs, tokenizes the text with
    NLTK's ``word_tokenize``, drops English stopwords and punctuation-only
    tokens, and stems whatever remains with a Porter stemmer.
    """

    # An http/https URL, running up to the next whitespace character.
    _URL_PATTERN = re.compile(r'https?://\S+')
    # Set form of string.punctuation, used for per-character membership tests.
    _PUNCTUATION = frozenset(string.punctuation)

    def __init__(self):
        self.stemmer = PorterStemmer()
        # Left as a list (not converted to a set) so code that reads or
        # extends this attribute keeps working.
        self.stopwords_english = stopwords.words('english')

    @classmethod
    def _is_punctuation(cls, token):
        """Return True when every character of *token* is punctuation."""
        return all(char in cls._PUNCTUATION for char in token)

    def process_text(self, text: str) -> list:
        """Clean *text* and return its stemmed tokens.

        Args:
            text: a string containing the text to be processed.

        Returns:
            A list of stemmed tokens, in their original order, with
            hyperlinks, ``#`` signs, English stopwords (matched
            case-insensitively) and punctuation-only tokens removed.
        """
        # Remove hyperlinks.
        text = self._URL_PATTERN.sub('', text)
        # Remove only the hash sign, keeping the hashtag's word.
        text = text.replace('#', '')

        # Build the lookup set once per call: membership tests become O(1)
        # while still honouring any edits made to self.stopwords_english.
        stopword_set = set(self.stopwords_english)

        return [
            self.stemmer.stem(token)
            for token in word_tokenize(text)
            # The stopword list is lowercase while the tokenizer preserves
            # case, so compare on the lowercased token; otherwise "The"
            # slips through the filter.
            if token.lower() not in stopword_set
            # Check character by character so multi-character tokens such
            # as "..." or "--" are dropped as well.
            and not self._is_punctuation(token)
        ]