# This is a notebook that contains notes and example code on NLP 

Project ideas:
Sentiment analysis of historical wikipeda articles
* Explain the limitations and bias within the model
* Use this as a way of showing the importance of how a model is trained

# PREPROCESSING FUNCTION BLOCK

In [None]:
import nltk, re
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from collections import Counter

%load_ext google.colab.data_table
nltk.download('stopwords')
stop_words = stopwords.words('english')
normalizer = WordNetLemmatizer()

def get_part_of_speech(word):
  probable_part_of_speech = wordnet.synsets(word)
  pos_counts = Counter()
  pos_counts["n"] = len(  [ item for item in probable_part_of_speech if item.pos()=="n"]  )
  pos_counts["v"] = len(  [ item for item in probable_part_of_speech if item.pos()=="v"]  )
  pos_counts["a"] = len(  [ item for item in probable_part_of_speech if item.pos()=="a"]  )
  pos_counts["r"] = len(  [ item for item in probable_part_of_speech if item.pos()=="r"]  )
  most_likely_part_of_speech = pos_counts.most_common(1)[0][0]
  return most_likely_part_of_speech

def preprocess_text(text, stop_words_remove = False, join = False):
  cleaned = re.sub(r'\W+', ' ', text).lower()
  tokenized = word_tokenize(cleaned)
  if stop_words_remove is True:
    st = [words for words in tokenized if words not in stop_words]
    tokenized = st
  if join is True:
      normalized = " ".join([normalizer.lemmatize(token, get_part_of_speech(token)) for token in tokenized])
  else:
     normalized = [normalizer.lemmatize(token, get_part_of_speech(token)) for token in tokenized]
  return normalized

The google.colab.data_table extension is already loaded. To reload it, use:
  %reload_ext google.colab.data_table
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Stemming
Stemming is the process of reducing a word to its base form by removing prefixs and suffixes. Example: stemming the word "***going***" would give you "***go***"

NLTK  package uses PorterStemmer,  the mode most often used is NLTK extensions

## USEFUL WHEN:
* When there is a bunch of different forms of the word but you are interested in just the how many times the base is used

In [None]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer
nltk.download('punkt')
# making a stemmer object
stemmer = PorterStemmer()
# text to stem
populated_island = 'Java is an Indonesian island in the Pacific Ocean. It is the most populated island in the world, with over 140 million people.'
# tokenizing the word
island_tokenized = word_tokenize(populated_island)
print(f"tokenized:{ island_tokenized}")
# stemmed version
stemmed = [stemmer.stem(token) for token in island_tokenized]
print(f"stemmed: {stemmed}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
tokenized:['Java', 'is', 'an', 'Indonesian', 'island', 'in', 'the', 'Pacific', 'Ocean', '.', 'It', 'is', 'the', 'most', 'populated', 'island', 'in', 'the', 'world', ',', 'with', 'over', '140', 'million', 'people', '.']
stemmed: ['java', 'is', 'an', 'indonesian', 'island', 'in', 'the', 'pacif', 'ocean', '.', 'It', 'is', 'the', 'most', 'popul', 'island', 'in', 'the', 'world', ',', 'with', 'over', '140', 'million', 'peopl', '.']


# Getting part of speech

--- Load in before running the lemmatization example --

A non-optimized way of obtaining the part of speech by looking at synomys



In [None]:
# defining a method to get the part of speech for each word
def get_part_of_speech(word):
  # generates the a list of synonyms to establish context for part of speech
  probable_part_of_speech = wordnet.synsets(word)
  # making  a counter
  pos_counts = Counter()
  # Counting the parts of speech of each synonym in the list
  pos_counts["n"] = len(  [ item for item in probable_part_of_speech if item.pos()=="n"]  )
  pos_counts["v"] = len(  [ item for item in probable_part_of_speech if item.pos()=="v"]  )
  pos_counts["a"] = len(  [ item for item in probable_part_of_speech if item.pos()=="a"]  )
  pos_counts["r"] = len(  [ item for item in probable_part_of_speech if item.pos()=="r"]  )
  # returning the most comon part of speech in the data set
  most_likely_part_of_speech = pos_counts.most_common(1)[0][0]
  return most_likely_part_of_speech

## NLTK  method of taggin

In [None]:
import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize
nltk.download('averaged_perceptron_tagger')
# text to analyze
text = "Wow! Ramona and her class are happily studying the new textbook she has on NLP."
# tokenize text
token_text = word_tokenize(text)
# pass the tokenized text into pos_tag function (takes in full sentences of tokenized words)
pos_text = pos_tag(token_text)
# this returns a list of each token and its part of speech
print(pos_text)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[('Wow', 'NN'), ('!', '.'), ('Ramona', 'NNP'), ('and', 'CC'), ('her', 'PRP$'), ('class', 'NN'), ('are', 'VBP'), ('happily', 'RB'), ('studying', 'VBG'), ('the', 'DT'), ('new', 'JJ'), ('textbook', 'NN'), ('she', 'PRP'), ('has', 'VBZ'), ('on', 'IN'), ('NLP', 'NNP'), ('.', '.')]


#Lemmatization
Lemmatization is the process of casting the word into its root form. It is more involved then stemming because the model needs to also understand the part of speech, but is a little bit more accurate then stemming

## USEFUL WHEN:
You want to analyze the part of speech of a word

## Limitations:
You have to know the part of speech before hand

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from collections import Counter
nltk.download('wordnet')


# -- LEMMENTING -- #
# lemmatizer object
lemmatizer = WordNetLemmatizer()
populated_island = 'Indonesia was founded in 1945. It contains the most populated island in the world, Java, with over 140 million people.'
# tokenizing the word
populated_tokenized = word_tokenize(populated_island)
print(f"tokenized:{ island_tokenized}")
#  printing before part of speech optimization
san_lemmatized = [lemmatizer.lemmatize(token) for token in populated_tokenized]
print(f"No part of speech lemmatized: {san_lemmatized}")
# lemmatized version -- lemmatize is taking in the word and part of speech of the word
lemmatized = [lemmatizer.lemmatize(token, get_part_of_speech(token)) for token in populated_tokenized]
print(f"lemmatized: {lemmatized}")


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
tokenized:['Java', 'is', 'an', 'Indonesian', 'island', 'in', 'the', 'Pacific', 'Ocean', '.', 'It', 'is', 'the', 'most', 'populated', 'island', 'in', 'the', 'world', ',', 'with', 'over', '140', 'million', 'people', '.']
No part of speech lemmatized: ['Indonesia', 'wa', 'founded', 'in', '1945', '.', 'It', 'contains', 'the', 'most', 'populated', 'island', 'in', 'the', 'world', ',', 'Java', ',', 'with', 'over', '140', 'million', 'people', '.']
lemmatized: ['Indonesia', 'be', 'found', 'in', '1945', '.', 'It', 'contain', 'the', 'most', 'populate', 'island', 'in', 'the', 'world', ',', 'Java', ',', 'with', 'over', '140', 'million', 'people', '.']


# Full processing example


In [None]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
import re
# loading model
lemmatizer = WordNetLemmatizer()
# text to break down
oprah_wiki = '<p>Working in local media, she was both the youngest news anchor and the first black female news anchor at Nashville\'s WLAC-TV. </p>'
# cleaning the text
clean = re.sub(r'[\<p>\.\/\-]', "", oprah_wiki)
print(clean)
# Making the words lower cased
cl_lw = clean.lower()
print(cl_lw)
# tokenizing
token_cl = word_tokenize(cl_lw)
print(token_cl)
# lemmenting
lem_cl = [lemmatizer.lemmatize(token, get_part_of_speech(token)) for token in token_cl]
print(lem_cl)

Working in local media, she was both the youngest news anchor and the first black female news anchor at Nashville's WLACTV 
working in local media, she was both the youngest news anchor and the first black female news anchor at nashville's wlactv 
['working', 'in', 'local', 'media', ',', 'she', 'was', 'both', 'the', 'youngest', 'news', 'anchor', 'and', 'the', 'first', 'black', 'female', 'news', 'anchor', 'at', 'nashville', "'s", 'wlactv']
['work', 'in', 'local', 'medium', ',', 'she', 'be', 'both', 'the', 'young', 'news', 'anchor', 'and', 'the', 'first', 'black', 'female', 'news', 'anchor', 'at', 'nashville', "'s", 'wlactv']


# Parsing with Regular Expressions
## Regex methods
* **re.compile** --> defines a regex object that can be used to match patterns within text
* **re.match** --> if the regex compile object exist, it matches whatever pattern you defined before hand, else, pass it the expression and then the text and it will work in the same fashion
* **re.group** --> used to get the results of a match from the re object

In [None]:
import re

# strings are defined
character_1 = "Dorothy was a cool person"
character_2 = "Henry was not very nice"

# compile your regular expression here
regular_expression = re.compile("\w{7}")

# check for a match to character_1 here
result_1 = regular_expression.match(character_1)

# .group() holds the list of all the mathc expression
match_1 = result_1.group(0)

# printing the results
print(match_1)

# not defining an object, just using .match()
result_2 = re.match("\w{7}", character_2)
print(result_2)



Dorothy
None


* **re.search** --> lookas at all the text and matches the first cases and not just the first word
* **re.findall** --> looks at all the text and holds everything that matches the regex expression you pass it

In [None]:
import re

# defining the string to search
text = "Everything is green here, while in the country of the Munchkins blue was the favorite color. But the people do not seem to be as friendly as the Munchkins, and I'm afraid we shall be unable to find a place to pass the night."

# using search
first_match  = re.search(".ee.", text)
print(first_match)

# using findall
all_match = re.findall('.ee', text)
print(f"All the matchs of 'ee': {all_match}")

<re.Match object; span=(15, 19), match='reen'>
All the matchs of 'ee': ['ree', 'see']


## Chunking
A process of grouping words by their respective part of speech 

-- **USES** --
--
This process lets you look at the structure of the sentence based on how the parts of speech are being used

**NOTE** [Consult ](https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html)this list on the tags for the different parts of speech


In [None]:
import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk import RegexpParser, Tree
import re
nltk.download('averaged_perceptron_tagger')

# -- GETTING A POS TAGGED SENTENCE -- #

# text to analyze -- credit to tommy orange
text = "It’s important that he dress like an Indian, dance like an Indian, even if it is an act, even if he feels like a fraud the whole time, because the only way to be Indian in this world is to look and act like an Indian."
# tokenize text
token_text = word_tokenize(text)
# pass the tokenized text into pos_tag function (takes in full sentences of tokenized words)
pos_text = pos_tag(token_text)

# -- CHUNKER -- #

# defining our chuncker -- looking for places where an adjective and noun are grouped together -> AN
chunk_grammar = "AN: {<JJ><NN>}"
# RegexpParser object -- part of NLTK 
chunk_parser = RegexpParser(chunk_grammar)
# chunking the text
chunked_sentence = chunk_parser.parse(pos_text)
# using NLTK tree function to view the structure
Tree.fromstring(str(chunked_sentence)).pretty_print()


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[('It', 'PRP'), ('’', 'VBZ'), ('s', 'JJ'), ('important', 'JJ'), ('that', 'IN'), ('he', 'PRP'), ('dress', 'VBZ'), ('like', 'IN'), ('an', 'DT'), ('Indian', 'JJ'), (',', ','), ('dance', 'NN'), ('like', 'IN'), ('an', 'DT'), ('Indian', 'JJ'), (',', ','), ('even', 'RB'), ('if', 'IN'), ('it', 'PRP'), ('is', 'VBZ'), ('an', 'DT'), ('act', 'NN'), (',', ','), ('even', 'RB'), ('if', 'IN'), ('he', 'PRP'), ('feels', 'VBZ'), ('like', 'IN'), ('a', 'DT'), ('fraud', 'NN'), ('the', 'DT'), ('whole', 'JJ'), ('time', 'NN'), (',', ','), ('because', 'IN'), ('the', 'DT'), ('only', 'JJ'), ('way', 'NN'), ('to', 'TO'), ('be', 'VB'), ('Indian', 'JJ'), ('in', 'IN'), ('this', 'DT'), ('world', 'NN'), ('is', 'VBZ'), ('to', 'TO'), ('look', 'VB'), ('and', 'CC'), ('act', 'VB'), ('like', 'IN'), ('an', 'DT'), ('Indian', 'JJ'), ('.', '.')]


## Noun phrases
These function as list of words that serve the purpose of being a now, formally defined as a group of words headed by a noun that includes modifers

**Examples (bold is the noun)**
* A **solider** in the window.
* A fast, dazziling red, stupidly expensive **car**.

The noun is pretty much modified by all of the words around it but essitally in just functing a noun for the purpose of meaning.

Common forms in writing would be phrases that start with a DT(determiner, ie: a, the) followed by any number of adjectives with a noun at the end

USES 
--
When you are curious about:
* **how often noun phrases occur**, 
* ***adjectives used to define a particular noun***, 
* **the length of noun phrases in particular styles**,
* **decerning topics of sentences**, so much more! 

In [None]:
import re
tokenized_sentences =['Twitter declined to address the backlash it has faced for allowing members of the Taliban to have accounts on the service, but said it would continue its work monitoring content for any policy violations.', "Zabihullah Mujahid, a spokesperson for the organization, shared live updates of the group's takeover of Afghanistan on its Twitter feed.", 'Sharing claims that the public is "happy" with the group\'s arrival, Twitter came under criticism for allowing the Taliban to spread its messages, especially after former President Donald Trump was banned.', 'When asked for comment on the backlash the social media platform was receiving, a Twitter spokesperson told Newsweek the company will "continue to proactively enforce our rules and review content that may violate Twitter rules, specifically policies against glorification of violence, platform manipulation and spam."', 'On Tuesday, Representative Doug Lamborn sent a letter to Twitter CEO Jack Dorsey expressing "concerns" about members of the Taliban being allowed on the platform but not Trump.', 'He called it "clear" that the Taliban falls under the violent organization category and noted that spokespeople have been promoting messages of a peaceful takeover that runs contrary to media reports of violence against civilians.', '"In my review of these accounts, I did not find a single fact check on any of their tweets, nor any warnings for false or misleading content," Lamborn wrote.', '"It is impossible to see how the accounts of Zabihullah Mujahid and Yousef Ahmandi do not violate your policies."', 'An October 2020 update from Twitter noted that there is "no place" for violent organizations.', 'Assessments as to what constitutes a "violent organization" under the policy are "informed by national and international terrorism designations" and include organizations that identify as an extremist group, have engaged in or currently engage in violence to further their cause and target civilians.', "The Office of the Director of National Intelligence's counterterrorism guide lists the Afghan Taliban as a terrorist group and the group was placed on a Treasury Department list of specially designated global terrorists.", "The Taliban's Pakistan-based branch was also designated a foreign terrorist organization in 2010, although the Taliban itself has never received the designation from the State Department.", 'This may be why Twitter is allowing the accounts to remain active.', "Affiliating with or promoting a terrorist organizations' illicit activities violates the policy, according to Twitter.", "Newsweek reached out to Twitter for clarification as to why the Taliban accounts didn't violate the violent organization policy but did not receive a response in time for publication.", 'While Twitter has largely avoided the issue of Taliban accounts, Facebook announced on Tuesday it would continue its ban on Taliban content and that it dedicated a team of Afghan experts to monitor and remove content.', '"The Taliban is sanctioned as a terrorist organization under U.S. law and we have banned them from our services under our Dangerous Organization policies.', 'This means we remove accounts maintained by or on behalf of the Taliban and prohibit praise, support, and representation of them," a Facebook spokesperson told the BBC.', "Much of the criticism aimed at Twitter came from Trump's supporters, including Representative Marjorie Taylor Greene, who recently had her suspension lifted, and Turning Point USA founder Charlie Kirk.", 'Trump was banned from Twitter on January 8 in the wake of the Capitol riot.', 'The company decided to permanently suspend the former president due to the risk of "further incitement of violence."', 'Regardless of a person\'s opinion on de-platforming, Donie O\'Sullivan, a correspondent for CNN, noted that Trump being banned while the Taliban is not, exposes "some big holes in the company\'s policy."'] 
tokenized_sentences = [re.sub("[^\w*\'?\s]", '', sentences) for sentences in tokenized_sentences]

In [None]:
# -- HANDY WAY OF COUNTING CHUCKS -- #
from collections import Counter

# function that pulls chunks out of chunked sentence and finds the most common chunks
def np_chunk_counter(chunked_sentences, phrase):

    # create a list to hold chunks
    chunks = list()

    # for-loop through each chunked sentence to extract noun phrase chunks
    if phrase is "NP":
        for chunked_sentence in chunked_sentences:
            for subtree in chunked_sentence.subtrees(filter=lambda t: t.label() == 'NP'):
                chunks.append(tuple(subtree))
    if phrase is "VP":
        for chunked_sentence in chunked_sentences:
            for subtree in chunked_sentence.subtrees(filter=lambda t: t.label() == 'VP'):
                chunks.append(tuple(subtree))


    # create a Counter object
    chunk_counter = Counter()

    # for-loop through the list of chunks
    for chunk in chunks:
        # increase counter of specific chunk by 1
        chunk_counter[chunk] += 1

    # return 30 most frequent chunks
    return chunk_counter.most_common(10)

In [None]:
import nltk
from nltk import RegexpParser
from nltk import pos_tag
from nltk.tokenize import word_tokenize
nltk.download("punkt")
nltk.download('averaged_perceptron_tagger')


# -- Chunk Calculations -- #

# defining the chunk to look for 
chunk_grammar = "NP: {<DT>?<JJ>*<NN>}"

# creating the parser
chunk_parser = RegexpParser(chunk_grammar)

# tagging the part of speeches of each word
words_tokened = [nltk.word_tokenize(sentences) for sentences in tokenized_sentences]
pos_words = [pos_tag(sentences) for sentences in words_tokened]

# defining a list to hold the results of the parser and parser each sentences
parsed_np_text = [chunk_parser.parse(sentences) for sentences in pos_words]

# looking at the most common NPs
print(np_chunk_counter(parsed_np_text, "NP"))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[((('content', 'NN'),), 4), ((('violence', 'NN'),), 4), ((('the', 'DT'), ('group', 'NN')), 3), ((('the', 'DT'), ('backlash', 'NN')), 2), ((('platform', 'NN'),), 2), ((('spokesperson', 'NN'),), 2), ((('the', 'DT'), ('company', 'NN')), 2), ((('the', 'DT'), ('violent', 'JJ'), ('organization', 'NN')), 2), ((('the', 'DT'), ('policy', 'NN')), 2), ((('policy', 'NN'),), 2)]


# Verb Phrasing

These are very similar in nature to the noun phrases but instead give you a lot of insight in to how the action of a sentence is being described

Usually two basic structure:
1. Verb -- NP -- and optional adverb(RB)
2. NP -- VB -- and optional adverb(RB)

These are useful in the same information task as np and the structure for tagging them is very much the same


Understanding the regex code: 
--
`<DT>?<JJ>*<NN>` -- Looking for the structure of a noun phrase -- optional DT plus any number of adjectives followd by a now

` <VB.*><RB.?>?` -- a verb followed by an optional form of an adverb 



In [None]:
import nltk
from nltk import RegexpParser
from nltk import pos_tag
from nltk.tokenize import word_tokenize
nltk.download("punkt")
nltk.download('averaged_perceptron_tagger')


# -- Chunk Calculations -- #

# defining the chunk to look for 
chunk_grammar = "VP: {<VB.*><DT>?<JJ>*<NN><RB.?>?}"


# creating the parser
chunk_parser = RegexpParser(chunk_grammar)

# tagging the part of speeches of each word
words_tokened = [nltk.word_tokenize(sentences) for sentences in tokenized_sentences]
pos_words = [pos_tag(sentences) for sentences in words_tokened]

# defining a list to hold the results of the parser and parser each sentences
parsed_vp_text = [chunk_parser.parse(sentences) for sentences in pos_words]

# looking at the most common NPs
print(np_chunk_counter(parsed_vp_text, "VB"))
# If there was a verb phrase then this would print it

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[]


# Chunk Filtering
The process of defining the chunks that you want by first selecting the sentence then removing the parts of speech you don't want

USEFUL FOR:
--
When you know what specific parts of a sentence you want to look at, and the implementation is fairly similar to other methods of chunking. 

In [None]:
import nltk
from nltk import RegexpParser
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk import RegexpParser, Tree
nltk.download("punkt")
nltk.download('averaged_perceptron_tagger')


# -- Chunk Calculations -- #

# defining the chunk to be a sentence 
grammar = "Chunk: {<.*>+}"


# creating the parser
parser = RegexpParser(grammar)

# tagging the part of speeches of each word
words_tokened = [nltk.word_tokenize(sentences) for sentences in tokenized_sentences]
pos_words = [pos_tag(sentences) for sentences in words_tokened]

# chunking the 3rd sentence in the text above
parsed_text = [parser.parse(pos_words[2])]

# looking at the most common 
print(parsed_text)

# Making a chunk that defines np and removes any verbs or preps
chunk_grammar = """NP: {<.*>+}
                       }<VB.?|IN>+{"""
# defining the parser again
chunk_parser = RegexpParser(chunk_grammar)

# Applying the filter
filter_sentence2 = chunk_parser.parse(pos_words[2])

# printing the tree
Tree.fromstring(str(filter_sentence2)).pretty_print()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[Tree('S', [Tree('Chunk', [('Sharing', 'VBG'), ('claims', 'NNS'), ('that', 'IN'), ('the', 'DT'), ('public', 'NN'), ('is', 'VBZ'), ('happy', 'JJ'), ('with', 'IN'), ('the', 'DT'), ('group', 'NN'), ("'s", 'POS'), ('arrival', 'JJ'), ('Twitter', 'NNP'), ('came', 'VBD'), ('under', 'IN'), ('criticism', 'NN'), ('for', 'IN'), ('allowing', 'VBG'), ('the', 'DT'), ('Taliban', 'NNP'), ('to', 'TO'), ('spread', 'VB'), ('its', 'PRP$'), ('messages', 'NNS'), ('especially', 'RB'), ('after', 'IN'), ('former', 'JJ'), ('President', 'NNP'), ('Donald', 'NNP'), ('Trump', 'NNP'), ('was', 'VBD'), ('banned', 'VBN')])])]
                                                                                                     

# Bag of Words

It can be used for all sorts of applications such as:
* topic analysis of songs
* filtering out text
* sentiment analysis
* word clouds

Basics: BoW models are most interested in word count and don't care about the order of the words. Also know as unigram model.

# Basic function for making BOW dictionary

This will produce the frequency count for the words in the sentence as well as reduce them to the lowest root possible some words like **games** would be counted for **game**

In [None]:
def text_to_bow(some_text):
  # create a blank dictionary
  bow_dictionary = {}
  # preprocess the text
  tokens = preprocess_text(some_text)
  # add to the count if its in the dictonary else add it to the dictionary
  for token in tokens:
    if token in bow_dictionary:
      bow_dictionary[token] += 1
    else:
      bow_dictionary[token] = 1
  return bow_dictionary

# Example 
sentence ='Twitter declined to address the backlash it has faced for allowing members of the Taliban to have accounts on the service, but said it would continue its work monitoring content for any policy violations.'
# Calling the funciton
print(text_to_bow(sentence))

{'twitter': 1, 'decline': 1, 'address': 1, 'backlash': 1, 'face': 1, 'allow': 1, 'member': 1, 'taliban': 1, 'account': 1, 'service': 1, 'say': 1, 'would': 1, 'continue': 1, 'work': 1, 'monitor': 1, 'content': 1, 'policy': 1, 'violation': 1}


# BOW: Feature Extraction & Feature Dictionary

Using the news article as an example, a feature dictionary will hold the index position of where the token appeared within all of the text

Below is a pure python implementation of a feature dictionary



In [None]:
# Define create_features_dictionary() below:
def create_features_dictionary(documents):
  # creating a dictonary to hold the index location of each words
  features_dictionary = {}
  # merging all of the sentences into a single string
  merged = " ".join(documents)
  # processing the text
  tokens = preprocess_text(merged)
  # keeping track of the index position
  index  = 0
  # adding the index position as the key value if it doesnt have one
  for token in tokens:
    if token not in features_dictionary:
      features_dictionary[token] = index
      index += 1
  # returning the arguments
  return features_dictionary, tokens

# Text to make into a feature dictionary
tokenized_sentences =['Twitter declined to address the backlash it has faced for allowing members of the Taliban to have accounts on the service, but said it would continue its work monitoring content for any policy violations.', "Zabihullah Mujahid, a spokesperson for the organization, shared live updates of the group's takeover of Afghanistan on its Twitter feed.", 'Sharing claims that the public is "happy" with the group\'s arrival, Twitter came under criticism for allowing the Taliban to spread its messages, especially after former President Donald Trump was banned.', 'When asked for comment on the backlash the social media platform was receiving, a Twitter spokesperson told Newsweek the company will "continue to proactively enforce our rules and review content that may violate Twitter rules, specifically policies against glorification of violence, platform manipulation and spam."', 'On Tuesday, Representative Doug Lamborn sent a letter to Twitter CEO Jack Dorsey expressing "concerns" about members of the Taliban being allowed on the platform but not Trump.', 'He called it "clear" that the Taliban falls under the violent organization category and noted that spokespeople have been promoting messages of a peaceful takeover that runs contrary to media reports of violence against civilians.', '"In my review of these accounts, I did not find a single fact check on any of their tweets, nor any warnings for false or misleading content," Lamborn wrote.', '"It is impossible to see how the accounts of Zabihullah Mujahid and Yousef Ahmandi do not violate your policies."', 'An October 2020 update from Twitter noted that there is "no place" for violent organizations.', 'Assessments as to what constitutes a "violent organization" under the policy are "informed by national and international terrorism designations" and include organizations that identify as an extremist group, have engaged in or currently engage in violence to further their cause and target civilians.', "The Office of the Director of National Intelligence's counterterrorism guide lists the Afghan Taliban as a terrorist group and the group was placed on a Treasury Department list of specially designated global terrorists.", "The Taliban's Pakistan-based branch was also designated a foreign terrorist organization in 2010, although the Taliban itself has never received the designation from the State Department.", 'This may be why Twitter is allowing the accounts to remain active.', "Affiliating with or promoting a terrorist organizations' illicit activities violates the policy, according to Twitter.", "Newsweek reached out to Twitter for clarification as to why the Taliban accounts didn't violate the violent organization policy but did not receive a response in time for publication.", 'While Twitter has largely avoided the issue of Taliban accounts, Facebook announced on Tuesday it would continue its ban on Taliban content and that it dedicated a team of Afghan experts to monitor and remove content.', '"The Taliban is sanctioned as a terrorist organization under U.S. law and we have banned them from our services under our Dangerous Organization policies.', 'This means we remove accounts maintained by or on behalf of the Taliban and prohibit praise, support, and representation of them," a Facebook spokesperson told the BBC.', "Much of the criticism aimed at Twitter came from Trump's supporters, including Representative Marjorie Taylor Greene, who recently had her suspension lifted, and Turning Point USA founder Charlie Kirk.", 'Trump was banned from Twitter on January 8 in the wake of the Capitol riot.', 'The company decided to permanently suspend the former president due to the risk of "further incitement of violence."', 'Regardless of a person\'s opinion on de-platforming, Donie O\'Sullivan, a correspondent for CNN, noted that Trump being banned while the Taliban is not, exposes "some big holes in the company\'s policy."'] 

# running the function
print(create_features_dictionary(tokenized_sentences))

({'twitter': 0, 'decline': 1, 'address': 2, 'backlash': 3, 'face': 4, 'allow': 5, 'member': 6, 'taliban': 7, 'account': 8, 'service': 9, 'say': 10, 'would': 11, 'continue': 12, 'work': 13, 'monitor': 14, 'content': 15, 'policy': 16, 'violation': 17, 'zabihullah': 18, 'mujahid': 19, 'spokesperson': 20, 'organization': 21, 'share': 22, 'live': 23, 'update': 24, 'group': 25, 'takeover': 26, 'afghanistan': 27, 'fee': 28, 'claim': 29, 'public': 30, 'happy': 31, 'arrival': 32, 'come': 33, 'criticism': 34, 'spread': 35, 'message': 36, 'especially': 37, 'former': 38, 'president': 39, 'donald': 40, 'trump': 41, 'ban': 42, 'ask': 43, 'comment': 44, 'social': 45, 'medium': 46, 'platform': 47, 'receive': 48, 'tell': 49, 'newsweek': 50, 'company': 51, 'proactively': 52, 'enforce': 53, 'rule': 54, 'review': 55, 'may': 56, 'violate': 57, 'specifically': 58, 'glorification': 59, 'violence': 60, 'manipulation': 61, 'spam': 62, 'tuesday': 63, 'representative': 64, 'doug': 65, 'lamborn': 66, 'send': 67, 

# Build the BOW Vector

The next step in using the BOW language model, is building a vectored list. This will help you understand the relationship between these words


In [None]:
# Single sentence bow vectorization
def text_to_bow_vector(some_text, features_dictionary):
  # defining the the vector to be returned giving each value +1 for the times it appears
  # in the features dictionary 
  bow_vector = [0 for keys in features_dictionary]
  # preprocessing the text
  tokens = preprocess_text(some_text)
  # adding a +1 if the feature is in the dictioary
  for token in tokens:
      if token in features_dictionary:
        feature_index = features_dictionary[token]
        bow_vector[feature_index] += 1
  # returning both the bag of words and tokens
  return bow_vector, tokens

# -- showing the function in action -- #

# Text to make into a feature dictionary
tokenized_sentences =['Twitter declined to address the backlash it has faced for allowing members of the Taliban to have accounts on the service, but said it would continue its work monitoring content for any policy violations.', "Zabihullah Mujahid, a spokesperson for the organization, shared live updates of the group's takeover of Afghanistan on its Twitter feed.", 'Sharing claims that the public is "happy" with the group\'s arrival, Twitter came under criticism for allowing the Taliban to spread its messages, especially after former President Donald Trump was banned.', 'When asked for comment on the backlash the social media platform was receiving, a Twitter spokesperson told Newsweek the company will "continue to proactively enforce our rules and review content that may violate Twitter rules, specifically policies against glorification of violence, platform manipulation and spam."', 'On Tuesday, Representative Doug Lamborn sent a letter to Twitter CEO Jack Dorsey expressing "concerns" about members of the Taliban being allowed on the platform but not Trump.', 'He called it "clear" that the Taliban falls under the violent organization category and noted that spokespeople have been promoting messages of a peaceful takeover that runs contrary to media reports of violence against civilians.', '"In my review of these accounts, I did not find a single fact check on any of their tweets, nor any warnings for false or misleading content," Lamborn wrote.', '"It is impossible to see how the accounts of Zabihullah Mujahid and Yousef Ahmandi do not violate your policies."', 'An October 2020 update from Twitter noted that there is "no place" for violent organizations.', 'Assessments as to what constitutes a "violent organization" under the policy are "informed by national and international terrorism designations" and include organizations that identify as an extremist group, have engaged in or currently engage in violence to further their cause and target civilians.', "The Office of the Director of National Intelligence's counterterrorism guide lists the Afghan Taliban as a terrorist group and the group was placed on a Treasury Department list of specially designated global terrorists.", "The Taliban's Pakistan-based branch was also designated a foreign terrorist organization in 2010, although the Taliban itself has never received the designation from the State Department.", 'This may be why Twitter is allowing the accounts to remain active.', "Affiliating with or promoting a terrorist organizations' illicit activities violates the policy, according to Twitter.", "Newsweek reached out to Twitter for clarification as to why the Taliban accounts didn't violate the violent organization policy but did not receive a response in time for publication.", 'While Twitter has largely avoided the issue of Taliban accounts, Facebook announced on Tuesday it would continue its ban on Taliban content and that it dedicated a team of Afghan experts to monitor and remove content.', '"The Taliban is sanctioned as a terrorist organization under U.S. law and we have banned them from our services under our Dangerous Organization policies.', 'This means we remove accounts maintained by or on behalf of the Taliban and prohibit praise, support, and representation of them," a Facebook spokesperson told the BBC.', "Much of the criticism aimed at Twitter came from Trump's supporters, including Representative Marjorie Taylor Greene, who recently had her suspension lifted, and Turning Point USA founder Charlie Kirk.", 'Trump was banned from Twitter on January 8 in the wake of the Capitol riot.', 'The company decided to permanently suspend the former president due to the risk of "further incitement of violence."', 'Regardless of a person\'s opinion on de-platforming, Donie O\'Sullivan, a correspondent for CNN, noted that Trump being banned while the Taliban is not, exposes "some big holes in the company\'s policy."'] 

# running the feature dictionary function
feat_dict, tokens = create_features_dictionary(tokenized_sentences[5:13:1])
print(tokenized_sentences[5])
# making a bow vector of the first sentence
bow_vector, tokens = text_to_bow_vector(tokenized_sentences[0], feat_dict)

# printing the bow_vector
print(bow_vector)

He called it "clear" that the Taliban falls under the violent organization category and noted that spokespeople have been promoting messages of a peaceful takeover that runs contrary to media reports of violence against civilians.
Twitter declined to address the backlash it has faced for allowing members of the Taliban to have accounts on the service, but said it would continue its work monitoring content for any policy violations.
[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]


Understanding the output:
The vector returned will indicate if the text given appears in the feature dictonary that you passed in. If it does, the count is stored at the index position of the dictionary that the token from the sentence appears at. So, for example, the word "it" appears in the feature dictonary at index 2, since "it" appears in the sentence we feed it, the count for it is increased by one!

# Using the Scikit for word vectors
Scikit offers a very compact way of generating word vectors in just a few lines of code.

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
# Import CountVectorizer from sklearn:
from collections import Counter

# This count vectorizer will give us a vector count object
bow_vectorizer_count = CountVectorizer()

# The text we are going to train the vector on
tokenized_sentences =['Twitter declined to address the backlash it has faced for allowing members of the Taliban to have accounts on the service, but said it would continue its work monitoring content for any policy violations.', "Zabihullah Mujahid, a spokesperson for the organization, shared live updates of the group's takeover of Afghanistan on its Twitter feed.", 'Sharing claims that the public is "happy" with the group\'s arrival, Twitter came under criticism for allowing the Taliban to spread its messages, especially after former President Donald Trump was banned.', 'When asked for comment on the backlash the social media platform was receiving, a Twitter spokesperson told Newsweek the company will "continue to proactively enforce our rules and review content that may violate Twitter rules, specifically policies against glorification of violence, platform manipulation and spam."', 'On Tuesday, Representative Doug Lamborn sent a letter to Twitter CEO Jack Dorsey expressing "concerns" about members of the Taliban being allowed on the platform but not Trump.', 'He called it "clear" that the Taliban falls under the violent organization category and noted that spokespeople have been promoting messages of a peaceful takeover that runs contrary to media reports of violence against civilians.', '"In my review of these accounts, I did not find a single fact check on any of their tweets, nor any warnings for false or misleading content," Lamborn wrote.', '"It is impossible to see how the accounts of Zabihullah Mujahid and Yousef Ahmandi do not violate your policies."', 'An October 2020 update from Twitter noted that there is "no place" for violent organizations.', 'Assessments as to what constitutes a "violent organization" under the policy are "informed by national and international terrorism designations" and include organizations that identify as an extremist group, have engaged in or currently engage in violence to further their cause and target civilians.', "The Office of the Director of National Intelligence's counterterrorism guide lists the Afghan Taliban as a terrorist group and the group was placed on a Treasury Department list of specially designated global terrorists.", "The Taliban's Pakistan-based branch was also designated a foreign terrorist organization in 2010, although the Taliban itself has never received the designation from the State Department.", 'This may be why Twitter is allowing the accounts to remain active.', "Affiliating with or promoting a terrorist organizations' illicit activities violates the policy, according to Twitter.", "Newsweek reached out to Twitter for clarification as to why the Taliban accounts didn't violate the violent organization policy but did not receive a response in time for publication.", 'While Twitter has largely avoided the issue of Taliban accounts, Facebook announced on Tuesday it would continue its ban on Taliban content and that it dedicated a team of Afghan experts to monitor and remove content.', '"The Taliban is sanctioned as a terrorist organization under U.S. law and we have banned them from our services under our Dangerous Organization policies.', 'This means we remove accounts maintained by or on behalf of the Taliban and prohibit praise, support, and representation of them," a Facebook spokesperson told the BBC.', "Much of the criticism aimed at Twitter came from Trump's supporters, including Representative Marjorie Taylor Greene, who recently had her suspension lifted, and Turning Point USA founder Charlie Kirk.", 'Trump was banned from Twitter on January 8 in the wake of the Capitol riot.', 'The company decided to permanently suspend the former president due to the risk of "further incitement of violence."', 'Regardless of a person\'s opinion on de-platforming, Donie O\'Sullivan, a correspondent for CNN, noted that Trump being banned while the Taliban is not, exposes "some big holes in the company\'s policy."'] 

# Lets split this up into a training set and a testing set
# train set is everything after the 3rd sentence in the article
train_set = tokenized_sentences[3::]
# test set will be the first 3 sentences
test_set = tokenized_sentences[0:3:1]
# Define training_vectors:
bow_vectorizer = bow_vectorizer_count.fit(train_set)
# Define test_vectors:
bow_vectors = bow_vectorizer.transform(test_set)
# printing them as arrays
print(bow_vectors.toarray()[0])
# The ouput is how many times the features of the all sentences afte the third appear
# in the first third of the sentences

[0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
 0 0 3 0 0 0 0 0 0 2 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
 0 0 0]


# Looking at a Scikits implementations

Check out this [link](https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html#extracting-features-from-text-files) for some extremely nice examples of how to implement Scikits language pipeline


# Term Frequency-Inverse Document Frequency

Basic Idea: Getting an idea of which words are actually important in understanding the meaning of the sentence. It is also know as tf-idf

How it works: you apply it over a large body of text containing multiple documents, the tf-idf gives you a score of each words importance relative to how many times it appears within each text document. This would make more common words have less of score if the appear across each document, but unique words would have a high score.

Its a bag of words model that gives a better insight to what words are important relative to the document they are in.

In [None]:
# Example 
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
nltk.download("punkt")
nltk.download("wordnet")
nltk.download('averaged_perceptron_tagger')


# -- Preprocessing function needs to join to be set to true! -- #

# defining the text we want to look at
document_1 = "They ran across the small river"
document_2 = "I ran across the small river"
document_3 = "He ran across the small river"

# corpus of documents
corpus = [document_1, document_2, document_3]

# preprocess documents
processed_corpus = [preprocess_text(doc, join = True) for doc in corpus]

# initialize and fit TfidfVectorizer
vectorizer = TfidfVectorizer(norm=None)
tf_idf_scores = vectorizer.fit_transform(processed_corpus)

# get vocabulary of terms
feature_names = vectorizer.get_feature_names()
corpus_index = [n for n in processed_corpus]

# create pandas DataFrame with tf-idf scores
df_tf_idf = pd.DataFrame(tf_idf_scores.T.todense(), index=feature_names, columns=corpus_index)
print(df_tf_idf)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
        they run across the small river  ...  he run across the small river
across                         1.000000  ...                       1.000000
he                             0.000000  ...                       1.693147
river                          1.000000  ...                       1.000000
run                            1.000000  ...                       1.000000
small                          1.000000  ...                       1.000000
the                            1.000000  ...                       1.000000
they                           1.693147  ...          

For these next two exercises poems by emily dickinson are used  to look at the application of tf-idf.

### First exercise
Looking at the first part of the TF - term frequency. We are getting an idea of how many times each word appears in the poem, the logic being that the more it appears the more relevence it should have

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

poem ='''
A Bird, came down the Walk -
He did not know I saw -
He bit an Angle Worm in halves
And ate the fellow, raw,

And then, he drank a Dew
From a convenient Grass -
And then hopped sidewise to the Wall
To let a Beetle pass -

He glanced with rapid eyes,
That hurried all abroad -
They looked like frightened Beads, I thought,
He stirred his Velvet Head. -

Like one in danger, Cautious,
I offered him a Crumb,
And he unrolled his feathers,
And rowed him softer Home -

Than Oars divide the Ocean,
Too silver for a seam,
Or Butterflies, off Banks of Noon,
Leap, plashless as they swim.
'''

# preprocess text
processed_poem = preprocess_text(poem, join=True)

# initialize and fit CountVectorizer
vectorizer = CountVectorizer()
term_frequencies = vectorizer.fit_transform([processed_poem])

# get vocabulary of terms
feature_names = vectorizer.get_feature_names()

# create pandas DataFrame with term frequencies
df_term_frequencies = pd.DataFrame(term_frequencies.T.todense(), index=feature_names, columns=['Term Frequency'])
print(df_term_frequencies)


        Term Frequency
abroad               1
all                  1
an                   1
and                  5
angle                1
...                ...
velvet               1
walk                 1
wall                 1
with                 1
worm                 1

[79 rows x 1 columns]


### Second exercise
We can build on the features and information that we extracted in the last step

This is the IDF part. This is a method of penalizing the words that appear the most often across **all** documents. This way super common words that don't provide insight across documentation should have a lower score.

SciKit provides a super streamlined version of this. Link to the documentation [here](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html)


In [None]:
# POEMS 
poem_1 = '''
Success is counted sweetest
By those who ne'er succeed.
To comprehend a nectar
Requires sorest need.

Not one of all the purple host
Who took the flag to-day
Can tell the definition,
So clear, of victory,

As he, defeated, dying,
On whose forbidden ear
The distant strains of triumph
Break, agonized and clear!'''

poem_2 = '''
Wild nights! Wild nights!
Were I with thee,
Wild nights should be
Our luxury!

Futile the winds
To a heart in port, —
Done with the compass,
Done with the chart.

Rowing in Eden!
Ah! the sea!
Might I but moor
To-night in thee!'''

poem_3 = '''
I'm nobody! Who are you?
Are you nobody, too?
Then there 's a pair of us — don't tell!
They 'd banish us, you know.

How dreary to be somebody!
How public, like a frog
To tell your name the livelong day
To an admiring bog!'''

poem_4 = '''
I felt a funeral in my brain,
   And mourners, to and fro,
Kept treading, treading, till it seemed
   That sense was breaking through.

And when they all were seated,
   A service like a drum
Kept beating, beating, till I thought
   My mind was going numb.

And then I heard them lift a box,
   And creak across my soul
With those same boots of lead, again.
   Then space began to toll

As all the heavens were a bell,
   And Being but an ear,
And I and silence some strange race,
   Wrecked, solitary, here.'''

poem_5 = '''
Hope is the thing with feathers
That perches in the soul,
And sings the tune without the words,
And never stops at all,

And sweetest in the gale is heard;
And sore must be the storm
That could abash the little bird
That kept so many warm.

I 've heard it in the chillest land,
And on the strangest sea;
Yet, never, in extremity,
It asked a crumb of me.'''

poem_6 = '''
The pedigree of honey
Does not concern the bee;
A clover, any time, to him
Is aristocracy.'''

poems = [poem_1, poem_2, poem_3, poem_4, poem_5, poem_6]

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
# -- Setting up the Term Frequencies -- #

# process each poem
poems_proce = [preprocess_text(poem, join=True) for poem in poems]

# counting each word across the poems
vectorizer = CountVectorizer()
# Getting the total count
term_frequencies = vectorizer.fit_transform(poems_proce)

# Storing the words
feature_names = vectorizer.get_feature_names()

# createing the index for each poem
corpus_index = [f"Poem {i+1}" for i in range(len(poems))]

# making a DataFrame for the term frequencies
df_term_frequencies = pd.DataFrame(term_frequencies.T.todense(), index=feature_names, columns=corpus_index)

# Printing just the term frequencies
print(df_term_frequencies)

# making the model and fitting it to the term frequencies
transformer = TfidfTransformer()
transformer.fit(term_frequencies)
# grannomg the frequencies
idf_values = transformer.idf_

# making the df to store all of the inverse frequencies
df_idf = pd.DataFrame(idf_values, index = feature_names, columns=['Inverse Document Frequency'])
# printing the inverse
print(df_idf)

## Putting it together

Overview of how the TF-IDF is calculated

Tfidf (t,d) = tf(t,d) * idf(t,corpus)

* *t* = term
* *d* = document
* *tf* = term frequency
* *idf* = Inverse term frequency
* *corpus* = all documentation

Knowing how its calculated, we can understand the output form the scikit-learns TfidfVectorizer function. Note that this function auto normalizes the values of each word

This function lets us pass in a string and it produces a vectorized version of the scores that we can then view with pandas

In [None]:
# -- Run the Poems block and preprocessing block -- #

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# preprocess documents
processed_poems = [preprocess_text(poem, False,True) for poem in poems]

# initialize the model
vectorizer = TfidfVectorizer()
# fit the model
scores = vectorizer.fit_transform(processed_poems)

# Storing the words
feature_names = vectorizer.get_feature_names()

# createing the index for each poem
poem_index = [f"Poem {i+1}" for i in range(len(poems))]

# printing the TFIDF df
df_tf_idf = pd.DataFrame(scores.T.todense(), index=feature_names, columns=poem_index)
print(df_tf_idf)

# Producing an TFIDF from a BOW model

Lets say you alread have a BOW model and you want to convert that into TFIDF, this is pretty easily done with TfidfTransformer

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

# --  Making the BOW Model -- #

# preprocess text
processed_poems = [preprocess_text(poem, False,True) for poem in poems]

# initialize and fit CountVectorizer
vectorizer = CountVectorizer()
BOW = vectorizer.fit_transform(processed_poems)

# get vocabulary of terms
feature_names = vectorizer.get_feature_names()

# Naming each poem column in the DF
cindex = [f"Poem {i+1}" for i in range(len(poems))]

# create pandas DataFrame with term frequencies
df_term_frequencies = pd.DataFrame(BOW.T.todense(), index=feature_names, columns=cindex)

# -- Transforming the BOW model -- #

# defining the model
transformer = TfidfTransformer(norm=None)
# fitting the model to the BOW model
scores = transformer.fit_transform(BOW)

# making a DF for the values
df_tf_idf = pd.DataFrame(scores.T.todense(), index=feature_names, columns=cindex)
# printing the full df and the max value per poem
print(df_tf_idf)
print(df_tf_idf.max())

           Poem 1  Poem 2    Poem 3    Poem 4    Poem 5  Poem 6
abash    0.000000     0.0  0.000000  0.000000  2.252763     0.0
across   0.000000     0.0  0.000000  2.252763  0.000000     0.0
admire   0.000000     0.0  2.252763  0.000000  0.000000     0.0
again    0.000000     0.0  0.000000  2.252763  0.000000     0.0
agonize  2.252763     0.0  0.000000  0.000000  0.000000     0.0
...           ...     ...       ...       ...       ...     ...
word     0.000000     0.0  0.000000  0.000000  2.252763     0.0
wreck    0.000000     0.0  0.000000  2.252763  0.000000     0.0
yet      0.000000     0.0  0.000000  0.000000  2.252763     0.0
you      0.000000     0.0  6.758289  0.000000  0.000000     0.0
your     0.000000     0.0  2.252763  0.000000  0.000000     0.0

[173 rows x 6 columns]
Poem 1     4.505526
Poem 2     9.011052
Poem 3     6.758289
Poem 4    12.476926
Poem 5     9.000000
Poem 6     2.252763
dtype: float64


# Word Emmbedings 
--- 
## Concept
The concept stems from the idea that the words in a sentence gain there context from the other words around them. So basically the relationship between words is how we understand the words themselves

## Implementation
Words have been catagoriezed based on there relationship to each other into vectors, these vectors can hold an infinate amount of data points as it relates to some relationship the word has with other words. A popular package for doing this embedding is spacy package.

# Looking at word vectors and their length
Spacy provides vectors on different languages so that you don't have to generate them yourself

In [None]:
# importing spacy
import spacy

# Loading the model
nlp = spacy.load('en')

# Looking at different words
sad_vector = nlp('sad').vector
happy_vector = nlp('happy').vector
angry_vector =nlp('angry').vector

print(sad_vector[0:10], happy_vector[0:10], angry_vect0r[0:10])

# Examining the length
print(len(sad_vector))


## How do we use this information?

Distance, distance, distance. For the vectors to mean anything we need some way to quantify what they mean, and that can be done through a bunch of different ways, but the three most common are

**Manhattan**  -- The simpliest where you compare the absolute diffrence between indexes and add them all up. Consider the vectors [1,2,3] and [2,4,6] then the distance would be: 

distance = ∣1−2∣+∣2−4∣+∣3−6∣=1+2+3=6

**Euclidean** -- This is super close to the distance formula learned in math, you take the diffrence of each index squared then take the sum of all them and square root it. Using the same two vectors

eu_distance = sqrt( diffrence 1^2 + diffrence 2^2 ... )

**Cosine** -- This looks at the difference in terms of angle between the two vectors. Basically looking at which way the vectors point and caculating the difference between them. See [here](https://en.wikipedia.org/wiki/Cosine_similarity#Definition) for more info on the math

## BUT WHY?

The reason that the distance is so import is that in theroy words generate there meaning from the context they are in. Since the vector is generated from its context to other words, then similar words should have similar vectors. This gives a great insight into the context of a word


In [None]:
import numpy as np
from scipy.spatial.distance import cityblock, euclidean, cosine
import spacy

# load word embedding model
nlp = spacy.load('en')

# define word embedding vectors
happy_vec = nlp('happy').vector
sad_vec = nlp('sad').vector
angry_vec = nlp('angry').vector

# calculate Manhattan distance
man_happy_sad = cityblock(happy_vec, sad_vec)
man_sad_angry = cityblock(sad_vec, angry_vec)
print("manhattan distances: ",man_happy_sad, man_sad_angry)



# calculate Euclidean distance
euc_happy_sad = euclidean(happy_vec, sad_vec)
euc_sad_angry = euclidean(sad_vec, angry_vec)
print(" Euclidean distances:", euc_happy_sad, euc_sad_angry)

# calculate cosine distance
cos_happy_sad = cosine(happy_vec, sad_vec)
cos_sad_angry = cosine(sad_vec, angry_vec)
print("cosine distances: ", cos_happy_sad, cos_sad_angry)


manhattan distances:  113.86418 118.5128
 Euclidean distances: 14.85196304321289 14.087335586547852
cosine distances:  0.2744985818862915 0.23983347415924072


##  How to generate the vectors

### 1st Approach: CBOW
To use CBOW or  continuous bag of words, this goes through each word in a training corupus and tries to predict the word that goes after it by applying the a bow model to the words around it

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# function for generating CBOWS based of a predeterimined context length
# generating the example sentence and processing it
sentence = "To use CBOW or continuous bag of words, this goes through each word in a training corupus and tries to predict the word that goes after it by applying the a bow model to the words around it"
print(sentence)

# preprocessing
sentence_lst = [word.lower().strip(".") for word in sentence.split()]

# context length
context = 3

def get_cbows(sentence_lst, context_length):
  # defining the cbows array
  cbows = []
  # looping through the sentence list
  for i, val in enumerate(sentence_lst):
    # pass if the word has zero context around it
    if i < context_length:
      pass
    # captures the context surrounding the word
    elif i < len(sentence_lst) - context_length:
      # generating a supset of the contextual words around the word of interest
      context = sentence_lst[i-context_length:i] + sentence_lst[i+1:i+context_length+1]
      # generates a vector for each word
      vectorizer = CountVectorizer()
      vectorizer.fit_transform(context)
      # getting the features of the vector
      context_no_order = vectorizer.get_feature_names()
      # appending the features and value of the vectors to the CBOWS list
      cbows.append((val,context_no_order))
  return cbows

# running the function
cbows = get_cbows(sentence_lst, context)
# printing the results
for cbow in cbows:
    print(cbow)


To use CBOW or continuous bag of words, this goes through each word in a training corupus and tries to predict the word that goes after it by applying the a bow model to the words around it
('or', ['bag', 'cbow', 'continuous', 'of', 'to', 'use'])
('continuous', ['bag', 'cbow', 'of', 'or', 'use', 'words'])
('bag', ['cbow', 'continuous', 'of', 'or', 'this', 'words'])
('of', ['bag', 'continuous', 'goes', 'or', 'this', 'words'])
('words,', ['bag', 'continuous', 'goes', 'of', 'this', 'through'])
('this', ['bag', 'each', 'goes', 'of', 'through', 'words'])
('goes', ['each', 'of', 'this', 'through', 'word', 'words'])
('through', ['each', 'goes', 'in', 'this', 'word', 'words'])
('each', ['goes', 'in', 'this', 'through', 'word'])
('word', ['each', 'goes', 'in', 'through', 'training'])
('in', ['corupus', 'each', 'through', 'training', 'word'])
('a', ['and', 'corupus', 'each', 'in', 'training', 'word'])
('training', ['and', 'corupus', 'in', 'tries', 'word'])
('corupus', ['and', 'in', 'to', 'traini

### Approach 2.

Using the skip grams across the text in order to gather context form how the words appear together. Think of it as interration and then using the patterns from the iteration to understand the positioning and how the words are used

In [None]:
# function for generating CBOWS based of a predeterimined context length
# generating the example sentence and processing it
sentence = "To use CBOW or continuous bag of words, this goes through each word in a training corupus and tries to predict the word that goes after it by applying the a bow model to the words around it"
print(sentence)

# preprocessing
sentence_lst = [word.lower().strip(".") for word in sentence.split()]

# context length
context = 3

# skips across the text and takes every third word and pairs it
def get_skip_grams(sentence_lst, context_length):
  # list to hold skip grams
  skip_grams = []
  
# looping through the sentence list
  for i, val in enumerate(sentence_lst):
    # pass if the word has zero context around it
    if i < context_length:
      pass
    # if there is enough context it starts to capture the words that appear every 3 position after the word
    elif i < len(sentence_lst) - context_length:
      context = sentence_lst[i-context_length:i] + sentence_lst[i+1:i+context_length+1]
      skip_grams.append((val, context))
    # returning the skip_grams
  return skip_grams
# calling the function 
skip_grams = get_skip_grams(sentence_lst, context)
# printing out the results
for skip_gram in skip_grams:
    print(skip_gram)

To use CBOW or continuous bag of words, this goes through each word in a training corupus and tries to predict the word that goes after it by applying the a bow model to the words around it
('or', ['to', 'use', 'cbow', 'continuous', 'bag', 'of'])
('continuous', ['use', 'cbow', 'or', 'bag', 'of', 'words,'])
('bag', ['cbow', 'or', 'continuous', 'of', 'words,', 'this'])
('of', ['or', 'continuous', 'bag', 'words,', 'this', 'goes'])
('words,', ['continuous', 'bag', 'of', 'this', 'goes', 'through'])
('this', ['bag', 'of', 'words,', 'goes', 'through', 'each'])
('goes', ['of', 'words,', 'this', 'through', 'each', 'word'])
('through', ['words,', 'this', 'goes', 'each', 'word', 'in'])
('each', ['this', 'goes', 'through', 'word', 'in', 'a'])
('word', ['goes', 'through', 'each', 'in', 'a', 'training'])
('in', ['through', 'each', 'word', 'a', 'training', 'corupus'])
('a', ['each', 'word', 'in', 'training', 'corupus', 'and'])
('training', ['word', 'in', 'a', 'corupus', 'and', 'tries'])
('corupus', [

## Generating Word2vecs

This can be done using the gensim package that allows the user to make a unique model

### Notes on the arugments for model [generation](https://radimrehurek.com/gensim/models/word2vec.html)

* size = is the number of embeddings/vector positions you want the  model to have for each word
* window = the context window, max distance between one word to another
* min_count = The amount of times a word has to appear to be apart of the corpus
* workers = Thread count
* sample = range(0,.00001) used for downsampleing
* alpha = learning rate
* min_alpha = what the learning rate will slow down to
* sg = 1 for skipgrams, 0 for CBOW



In [None]:
# Using the poems by E 
nltk.download('punkt')
nltk.download('wordnet')
#  loading in the poems
example_poems = poems
# processing the text
procecess_poems =  [preprocess_text(poem, True) for poem in example_poems]

# -- LETS BUILD THIS SHIT! -- #
import gensim

# make the model
model_sg = gensim.models.Word2Vec(procecess_poems, window=5, min_count=1, workers=2, sg=1)
model_CBOW = gensim.models.Word2Vec(procecess_poems, window=5, min_count=1, workers=2, sg=0)

# saving the vocab of the model
vocab_sg =  list(model_sg.wv.vocab)
vocab_cbow = list(model_CBOW.wv.vocab)
# view vocab from models (should be the same)
print(vocab_sg)
print(vocab_cbow)
# Looking at the most similar to frog from both versions
similar_to_frog_sg = model_sg.wv.most_similar("frog", topn = 10)
similar_to_frog_cbow = model_CBOW.wv.most_similar("frog", topn = 10)
# looking at the words
print(similar_to_frog_sg, "\n", similar_to_frog_cbow)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
['success', 'count', 'sweet', 'ne', 'er', 'succeed', 'comprehend', 'nectar', 'require', 'sorest', 'need', 'one', 'purple', 'host', 'take', 'flag', 'day', 'tell', 'definition', 'clear', 'victory', 'defeat', 'die', 'whose', 'forbid', 'ear', 'distant', 'strain', 'triumph', 'break', 'agonize', 'wild', 'night', 'thee', 'luxury', 'futile', 'wind', 'heart', 'port', 'do', 'compass', 'chart', 'rowing', 'eden', 'ah', 'sea', 'might', 'moor', 'nobody', 'pair', 'u', 'banish', 'know', 'dreary', 'somebody', 'public', 'like', 'frog', 'name', 'livelong', 'admire', 'bog', 'felt', 'funeral', 'brain', 'mourner', 'fro', 'keep', 'tread', 'till', 'seem', 'sense', 'seat', 'service', 'drum', 'beat', 'think', 'mind', 'go', 'numb', 'hear', 'lift', 'box', 'creak', 'across', 'soul', 'boot', '