In [3]:
import pandas as pd
import numpy as np
import nltk
import os
import nltk.corpus # sample text for performing tokenization
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

# Fetch the 'punkt' tokenizer data; word_tokenize (used by get_freq) needs it.
# NOTE(review): newer NLTK releases reportedly load this data from 'punkt_tab'
# instead -- confirm against the installed NLTK version.
nltk.download('punkt')

In [2]:
# Stop words: very common English words (and contraction fragments such as
# 'll', 've', 't') that carry no meaning for the analysis and are therefore
# dropped from a text before its word frequencies are reported.
# Kept as a set so membership checks are cheap.
stop_words = set(
    """
    a about above after again against ain all am an
    and any are aren aren't as at be because been
    before being below between both but by can couldn couldn't
    d did didn didn't do does doesn doesn't doing don
    don't down during each few for from further had hadn
    hadn't has hasn hasn't have haven haven't having he her
    here hers herself him himself his how i if in
    into is isn isn't it it's its itself just ll
    m ma me mightn mightn't more most mustn mustn't my
    myself needn needn't no nor not now o of off
    on once only or other our ours ourselves out over
    own re s same shan shan't she she's should should've
    shouldn shouldn't so some such t than that that'll the
    their theirs them themselves then there these they this those
    through to too under until up ve very was wasn
    wasn't we were weren weren't what when where which while
    who whom why will with won won't wouldn wouldn't y
    you you'd you'll you're you've your yours yourself yourselves
    """.split()
)

In [7]:
def get_freq(text):
    """Return the relative frequency of each non-stop-word token in ``text``.

    The text is lower-cased and split with NLTK's ``word_tokenize`` (which
    needs the 'punkt' data). Tokens listed in the module-level ``stop_words``
    are left out of the result; every other token the tokenizer emits is kept.

    Each value is ``count / total_tokens``, where ``total_tokens`` is the
    number of tokens *before* stop words are removed, so the returned
    proportions do not necessarily sum to 1.

    Parameters
    ----------
    text : str
        The raw text to analyse.

    Returns
    -------
    FreqDist
        Maps each remaining token to its proportion of all tokens.
        See the example cell that follows for a sample call.
    """
    lowered_tokens = word_tokenize(text.lower())
    # The denominator counts every token, including the stop words dropped below.
    total = len(lowered_tokens)

    raw_counts = FreqDist(lowered_tokens)
    return FreqDist({
        token: count / total
        for token, count in raw_counts.items()
        if token not in stop_words
    })
    

In [5]:
# EXAMPLE
# Sample text used to demonstrate get_freq in the cells below.
# The spelling mistakes are part of the sample data and are left as-is.

text = 'Are you looking to make anywhere from 600-115,000$ a month? Are you looking to be paid to take vacations? Are you tired of working a 9-5 with no ability to grow in your field. Do you want to be your own boss and set your own schedule of when you work? If you answer yes to any of these than we have the perfect opportunity for you. Our company is based on 3 principles 1.Saving Money 2.Making Money 3.Managing Money. We offer financial soulutions to help you live financially free while making money showing others. We have over 5,800 national brand retailers where we recieve discounts.'
text

'Are you looking to make anywhere from 600-115,000$ a month? Are you looking to be paid to take vacations? Are you tired of working a 9-5 with no ability to grow in your field. Do you want to be your own boss and set your own schedule of when you work? If you answer yes to any of these than we have the perfect opportunity for you. Our company is based on 3 principles 1.Saving Money 2.Making Money 3.Managing Money. We offer financial soulutions to help you live financially free while making money showing others. We have over 5,800 national brand retailers where we recieve discounts.'

In [8]:
# Proportion of each remaining (non-stop-word) token in the sample text.
get_freq(text)

FreqDist({'.': 0.043478260869565216, 'money': 0.034782608695652174, '?': 0.02608695652173913, 'looking': 0.017391304347826087, 'make': 0.008695652173913044, 'anywhere': 0.008695652173913044, '600-115,000': 0.008695652173913044, '$': 0.008695652173913044, 'month': 0.008695652173913044, 'paid': 0.008695652173913044, ...})

In [11]:
# Sanity check: the sample text splits into 115 tokens. This is the denominator
# get_freq divides by (counted before stop words are removed), e.g. a token
# that occurs once has frequency 1/115.

tokens = word_tokenize(text.lower())
num_tokens = len(tokens)
num_tokens

115