In [None]:
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer

import nltk
from nltk.corpus import stopwords
from stop_words import get_stop_words

# Download the NLTK data packages (a no-op message is printed if present).
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

# English stop-word lists from NLTK and from the `stop_words` package.
sw = stopwords.words("english")
en_stop = get_stop_words('en')

# Combined stop list: both library lists followed by hand-picked extra words.
stop_words = [
    *sw,
    *en_stop,
    'let', 'gon', 'dhe',
    'car', 'like', 'got',
    'get', 'one', 'well',
    'back', 'bit', 'drive',
    'look', 'see', 'good',
    'quite', 'think', 'little',
    'right', 'know', 'thing', 'want',
]


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
def preprocessing(docs:list, vocab=False) -> list:
  """
  Clean and tokenize a list of documents, keeping only noun tokens.

  Each document is lower-cased and tokenized with ``word_tokenize``. Tokens
  shorter than 3 characters, stop words, and tokens that are not purely
  alphabetic are dropped. The remaining tokens are lemmatized with WordNet
  and only those that ``nltk.pos_tag`` tags as NN, NNP or NNS are kept.

  Args:
    docs: list of document strings.
    vocab: selects which result is returned (see Returns).

  Returns:
    - if vocab == False:
      the preprocessed documents, one list of tokens per input document,
      in input order

    - if vocab == True:
      sorted list of the distinct tokens over all documents (vocabulary)

  """

  # Built once per call: set lookup instead of scanning the stop-word list
  # for every token. The extra entries are tokenizer contraction pieces
  # plus "really".
  excluded = set(stop_words) | {"'re", "n't", "'ve", "really"}
  noun_tags = ('NN', 'NNP', 'NNS')

  # One lemmatizer shared by all tokens instead of one instance per token.
  lemmatizer = WordNetLemmatizer()

  vocabulary = set()
  new_docs = []
  for doc in docs:
    tkns = word_tokenize(doc.lower())

    # keep tokens of at least 3 characters that are not stop words and
    # consist only of letters (drops digits and punctuation)
    tkns = [w for w in tkns
            if len(w) > 2 and w not in excluded and w.isalpha()]

    # lemmatizing
    # NOTE(review): stop words are removed before lemmatizing, so an
    # inflected form whose lemma is a stop word is kept - confirm intended.
    tkns = [lemmatizer.lemmatize(w) for w in tkns]

    # remove all words that are not nouns
    tkns = [w for (w, pos) in nltk.pos_tag(tkns) if pos in noun_tags]

    new_docs.append(tkns)
    vocabulary.update(tkns)

  if vocab:
    # sorted so the vocabulary order is reproducible between runs
    return sorted(vocabulary)
  return new_docs
