# Session 7 : Language Model (N Grams)

In [1]:
import pandas as pd 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

class NGramLanguageModel : 
    def __init__(self,n):
        self.n = n
        self.vectorizer = CountVectorizer(analyzer='word',ngram_range=(n,n))
        
    def fit_transform(self,corpus):
        return self.vectorizer.fit_transform(corpus)
    
    def transform(self,corpus):
        return self.vectorizer.transform(corpus)

def calculate_cosine_similarity_matrix(matrix, query_v):
    similarities = cosine_similarity(matrix, query_v)
    return similarities

corpus = [
    "This is a sample document.",
    "Here is another document.",
    "And this is a third document."
]

query = "This is the query text"

n = 1
document_index = 0

ngram_model = NGramLanguageModel(n)

matrix = ngram_model.fit_transform(corpus)
query_v = ngram_model.transform([query])

print(f"{n}-Gram Model:")
data = matrix.A
 
print(pd.DataFrame(matrix.A, columns=ngram_model.vectorizer.get_feature_names_out()))
print(query_v.A)

1-Gram Model:
   and  another  document  here  is  sample  third  this
0    0        0         1     0   1       1      0     1
1    0        1         1     1   1       0      0     0
2    1        0         1     0   1       0      1     1
[[0 0 0 0 1 0 0 1]]


In [2]:
similarities = calculate_cosine_similarity_matrix(matrix, query_v)

data = {'Document': corpus, 'Similarity': similarities.flatten()}
df = pd.DataFrame(data)

print(query)
df

This is the query text


Unnamed: 0,Document,Similarity
0,This is a sample document.,0.707107
1,Here is another document.,0.353553
2,And this is a third document.,0.632456


# Session 8 : Word Embeddings

In [3]:
import pandas as pd
import sklearn as sk
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jonathanmaverick/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jonathanmaverick/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
first_sentence = "It is going to rain today"
second_sentence = "Today I am not going outside"

first_sentence = first_sentence.split(" ")
second_sentence = second_sentence.split(" ")
total = set(first_sentence).union(set(second_sentence))

print(total)

{'is', 'today', 'to', 'I', 'am', 'It', 'going', 'outside', 'Today', 'not', 'rain'}


In [5]:
wordDictA = dict.fromkeys(total, 0)
wordDictB = dict.fromkeys(total, 0)

for word in first_sentence:
    wordDictA[word]+=1

for word in second_sentence:
    wordDictB[word]+=1

In [6]:
pd.DataFrame([wordDictA, wordDictB])

Unnamed: 0,is,today,to,I,am,It,going,outside,Today,not,rain
0,1,1,1,0,0,1,1,0,0,0,1
1,0,0,0,1,1,0,1,1,1,1,0


In [7]:
def computeTF(wordDict, doc):
    tfDict = {}
    corpusCount = len(doc)
    for word, count in wordDict.items():
        tfDict[word] = count/float(corpusCount)
    return (tfDict)

tfFirst = computeTF(wordDictA, first_sentence)
tfSecond = computeTF(wordDictB, second_sentence)

tf = pd.DataFrame([tfFirst, tfSecond])
tf

Unnamed: 0,is,today,to,I,am,It,going,outside,Today,not,rain
0,0.166667,0.166667,0.166667,0.0,0.0,0.166667,0.166667,0.0,0.0,0.0,0.166667
1,0.0,0.0,0.0,0.166667,0.166667,0.0,0.166667,0.166667,0.166667,0.166667,0.0


In [8]:
filtered_sentence = []
for word in wordDictA:
    if str(word) not in set (stopwords.words('english')):
        filtered_sentence.append(word)
        
filtered_sentence

['today', 'I', 'It', 'going', 'outside', 'Today', 'rain']

In [9]:
def computeIDF(docList):
    import math
    idfDict = {}
    N = len(docList)
    
    idfDict = dict.fromkeys(docList[0].keys(), 0)
    for word, val in idfDict.items():
        idfDict[word] = math.log10(N / (float(val) + 1))
    
    idf_df = pd.DataFrame(list(idfDict.items()), columns=['Word', 'IDF'])
    
    return idf_df

idfs = computeIDF([wordDictA, wordDictB])
idfs

Unnamed: 0,Word,IDF
0,is,0.30103
1,today,0.30103
2,to,0.30103
3,I,0.30103
4,am,0.30103
5,It,0.30103
6,going,0.30103
7,outside,0.30103
8,Today,0.30103
9,not,0.30103


In [10]:
def computeTFIDF(tfbow, idfs):
    tfidf = {}
    for word, val in tfbow.items():
        tfidf[word] = val*idfs.loc[idfs['Word'] == word, 'IDF'].values[0]
    return tfidf

idfFirst = computeTFIDF(tfFirst, idfs)
idfSecond = computeTFIDF(tfSecond, idfs)

idf = pd.DataFrame([idfFirst, idfSecond])
idf

Unnamed: 0,is,today,to,I,am,It,going,outside,Today,not,rain
0,0.050172,0.050172,0.050172,0.0,0.0,0.050172,0.050172,0.0,0.0,0.0,0.050172
1,0.0,0.0,0.0,0.050172,0.050172,0.0,0.050172,0.050172,0.050172,0.050172,0.0


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np 
from numpy.linalg import norm

In [12]:
Document1 = "It is going to rain today"
Document2 = "Today I am not going outside"

Doc = [Document1, Document2]
print(Doc)

['It is going to rain today', 'Today I am not going outside']


# Session 9 : Grammar Parsing with NLTK

In [10]:
import nltk
from nltk import CFG
from nltk.parse import ChartParser

def demonstrate_nlp_parsing(sentence, grammar):
    words = nltk.word_tokenize(sentence)
    parser = ChartParser(grammar)
    
    try:
        parses = list(parser.parse(words))
        if parses:
            for tree in parser.parse(sentence.split()):
                print(tree, "\n")
                tree.pretty_print()
        else :
            print("No parse tree found")
    except nltk.parse.api.ParserError as e:
        print(f"Error during parsing {e}")
        
nlp_grammar = CFG.fromstring("""
    S -> NP VP                   
    NP -> Det N | Det N PP | 'i'
    VP -> V NP | VP PP
    PP -> P NP
    Det -> 'the' | 'a'
    N -> 'cat' | 'dog' | 'park' | 'cookie'
    V -> 'chased' | 'saw' | 'ate' 
    P -> 'in' | 'on' | 'with'
""")

nlp_sentences = [
    "the cat chased the dog",
    "i saw a cookie",
    "the dog ate a cookie in the park",
    "i chased the cat with the cookie"
]

for i, sentence in enumerate(nlp_sentences):
    print(f"Sentence {i+1}: {nlp_sentences[i]}")
    demonstrate_nlp_parsing(sentence, nlp_grammar)
    print("\n")

Sentence 1: the cat chased the dog
(S (NP (Det the) (N cat)) (VP (V chased) (NP (Det the) (N dog)))) 

              S               
      ________|_____           
     |              VP        
     |         _____|___       
     NP       |         NP    
  ___|___     |      ___|___   
Det      N    V    Det      N 
 |       |    |     |       |  
the     cat chased the     dog



Sentence 2: i saw a cookie
(S (NP i) (VP (V saw) (NP (Det a) (N cookie)))) 

         S                
  _______|___              
 |           VP           
 |    _______|___          
 |   |           NP       
 |   |        ___|____     
 NP  V      Det       N   
 |   |       |        |    
 i  saw      a      cookie



Sentence 3: the dog ate a cookie in the park
(S
  (NP (Det the) (N dog))
  (VP
    (VP (V ate) (NP (Det a) (N cookie)))
    (PP (P in) (NP (Det the) (N park))))) 

                 S                             
      ___________|________                      
     |                 

In [6]:
import nltk
from nltk import CFG
from nltk.parse import ChartParser

def extract_information(parse_tree):
    for subtree in parse_tree.subtrees():
        if subtree.label() == 'NP':
            print(f"Found a noun parse: {' '.join(subtree.leaves())}")
        elif subtree.label() == 'VP':
            print(f"Found a verb parse: {' '.join(subtree.leaves())}")
            
nlp_grammar = CFG.fromstring("""
    S -> NP VP
    NP -> Det N | Det N PP | 'i' | 'I'
    VP -> V NP | V NP PP
    Det -> 'the' | 'a'
    N -> 'cat' | 'dog' | 'park' | 'cookie'
    V -> 'chased' | 'saw' | 'ate'
    PP -> P NP
    P -> 'in' | 'on' | 'with'
""")

nlp_sentence = "the cat chased the dog"

words = nltk.word_tokenize(nlp_sentence)
parser = ChartParser(nlp_grammar)

try: 
    parses = list(parser.parse(words))
    if parses:
        parse_tree = parses[0]
        print("Parse Tree: ")
        print(parse_tree)
        for tree in parser.parse(sentence.split()):
            print(tree, "\n")
            tree.pretty_print()
            
        print("\nExtracted Information:")
        extract_information(parse_tree)
    else:
        print("No parse tree found")
        
except nltk.parse.api.ParserError as e:
    print(f"Error during parsing {e}")

Parse Tree: 
(S (NP (Det the) (N cat)) (VP (V chased) (NP (Det the) (N dog))))
(S
  (NP i)
  (VP
    (V chased)
    (NP (Det the) (N cat))
    (PP (P with) (NP (Det the) (N cookie))))) 

      S                                   
  ____|_____________                       
 |                  VP                    
 |     _____________|________              
 |    |         |            PP           
 |    |         |        ____|___          
 |    |         NP      |        NP       
 |    |      ___|___    |     ___|____     
 NP   V    Det      N   P   Det       N   
 |    |     |       |   |    |        |    
 i  chased the     cat with the     cookie

(S
  (NP i)
  (VP
    (V chased)
    (NP (Det the) (N cat) (PP (P with) (NP (Det the) (N cookie)))))) 

      S                               
  ____|_________                       
 |              VP                    
 |     _________|___                   
 |    |             NP                
 |    |      _______|____        

# Session 10 : Dependency Parsing with Spacy

In [11]:
import spacy 

def extract_named_entities_and_parse_tree(sentence):
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(sentence)
    
    print("Formatted Dependency Parse Tree:")
    for token in doc:
        print(f"{token.text} --{token.dep_}-- {token.head.text} ({token.pos_})")
        
def main():
    sentence = "Elon Musk founded SpaceX, and the headquarters are in Palo Alto."
    named_entities = extract_named_entities_and_parse_tree(sentence)

if __name__ == "__main__":
    main()

Formatted Dependency Parse Tree:
Elon --compound-- Musk (PROPN)
Musk --nsubj-- founded (PROPN)
founded --ROOT-- founded (VERB)
SpaceX --dobj-- founded (PROPN)
, --punct-- founded (PUNCT)
and --cc-- founded (CCONJ)
the --det-- headquarters (DET)
headquarters --nsubj-- are (NOUN)
are --conj-- founded (AUX)
in --prep-- are (ADP)
Palo --compound-- Alto (PROPN)
Alto --pobj-- in (PROPN)
. --punct-- are (PUNCT)


In [12]:
import spacy

def dependency_sparint(text):
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)
    
    for token in doc:
        print(f"{token.text} --{token.dep_}-- {token.head.text} ({token.head.pos_})")
        
if __name__ == "__main__":
    sentence = "Elon Musk founded SpaceX, and the headquarters are in Palo Alto."
    dependency_sparint(sentence)

Elon --compound-- Musk (PROPN)
Musk --nsubj-- founded (VERB)
founded --ROOT-- founded (VERB)
SpaceX --dobj-- founded (VERB)
, --punct-- founded (VERB)
and --cc-- founded (VERB)
the --det-- headquarters (NOUN)
headquarters --nsubj-- are (AUX)
are --conj-- founded (VERB)
in --prep-- are (AUX)
Palo --compound-- Alto (PROPN)
Alto --pobj-- in (ADP)
. --punct-- are (AUX)


# Session 11 : Name Entitiy Recognition (NER) with SpaCy

In [16]:
import spacy 

def extract_named_entities_and_parse_tree(sentence):
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(sentence)

    named_entities = {
        "persons": [ent.text for ent in doc.ents if ent.label_ == "PERSON"],
        "locations": [ent.text for ent in doc.ents if ent.label_ == "GPE"],
        "organizations": [ent.text for ent in doc.ents if ent.label_ == "ORG"]
    }
    
    print("Formatted Dependency Parse Tree:")
    for token in doc:
        print(f"{token.text} --{token.dep_}-- {token.head.text} ({token.pos_})")
        
    return named_entities

def main():
    sentence = "Elon Musk founded SpaceX, and the headquarters are in Palo Alto."
    named_entities = extract_named_entities_and_parse_tree(sentence)
    
    print("\nExtracted Named Entities:")
    print("Presons: ", named_entities["persons"])
    print("Organizations: ", named_entities["organizations"])
    print("Locations: ", named_entities["locations"])
    
if __name__ == "__main__":
    main()

Formatted Dependency Parse Tree:
Elon --compound-- Musk (PROPN)
Musk --nsubj-- founded (PROPN)
founded --ROOT-- founded (VERB)
SpaceX --dobj-- founded (PROPN)
, --punct-- founded (PUNCT)
and --cc-- founded (CCONJ)
the --det-- headquarters (DET)
headquarters --nsubj-- are (NOUN)
are --conj-- founded (AUX)
in --prep-- are (ADP)
Palo --compound-- Alto (PROPN)
Alto --pobj-- in (PROPN)
. --punct-- are (PUNCT)

Extracted Named Entities:
Presons:  ['Elon Musk']
Organizations:  []
Locations:  ['Palo Alto']


In [18]:
import spacy

nlp = spacy.load("en_core_web_sm")

text = """
    Barack Obama was born in Hawaii. He was elected president in 2008.
    During his presidency, he implemented various policies, including the Affordable Care Act.
    
    In 1969, Neil Armstrong was the first person to walk on the moon as part of the Apollo 11 mission.
    He famously said, "That's one small step for [a] man, one giant leap for mankind."
    
    The Mona Lisa, painted by Leonardo da Vinci, is house in the Louvre Museum in Paris.
    It is one of the most famous and valuable pieces of art in the world.
"""

doc = nlp(text)

categories = {}
for ent in doc.ents:
    if ent.label_ not in categories:
        categories[ent.label_] = []
    categories[ent.label_].append(ent.text)
    
print("Categorized Named Entities:")
for label, entities in categories.items():
    print(f"{label}: {', '.join(entities)}")

Categorized Named Entities:
PERSON: Barack Obama, Neil Armstrong, Leonardo da Vinci
GPE: Hawaii, Paris
DATE: 2008, 1969
LAW: the Affordable Care Act, Apollo 11
ORDINAL: first
CARDINAL: one, one
WORK_OF_ART: The Mona Lisa
ORG: the Louvre Museum


In [2]:
import spacy

nlp = spacy.load("en_core_web_sm")

nlp_sentences = [
    "I love to eat pizza",
    "the cat sleeps on the sofa",
    "she dances well at parties",
    "he reads books in the library"
]

for sentence in nlp_sentences:
    doc = nlp(sentence)
    print("Sentence:", sentence)
    
    for token in doc:
        print(f"Token: {token.text}, POS: {token.pos_}, Dependency: {token.dep_}")
    
    # Visualization of the dependency parse tree
    spacy.displacy.render(doc, style="dep", options={'compact': True, 'distance': 100})


Sentence: I love to eat pizza
Token: I, POS: PRON, Dependency: nsubj
Token: love, POS: VERB, Dependency: ROOT
Token: to, POS: PART, Dependency: aux
Token: eat, POS: VERB, Dependency: xcomp
Token: pizza, POS: NOUN, Dependency: dobj


Sentence: the cat sleeps on the sofa
Token: the, POS: DET, Dependency: det
Token: cat, POS: NOUN, Dependency: nsubj
Token: sleeps, POS: VERB, Dependency: ROOT
Token: on, POS: ADP, Dependency: prep
Token: the, POS: DET, Dependency: det
Token: sofa, POS: NOUN, Dependency: pobj


Sentence: she dances well at parties
Token: she, POS: PRON, Dependency: nsubj
Token: dances, POS: VERB, Dependency: ROOT
Token: well, POS: ADV, Dependency: advmod
Token: at, POS: ADP, Dependency: prep
Token: parties, POS: NOUN, Dependency: pobj


Sentence: he reads books in the library
Token: he, POS: PRON, Dependency: nsubj
Token: reads, POS: VERB, Dependency: ROOT
Token: books, POS: NOUN, Dependency: dobj
Token: in, POS: ADP, Dependency: prep
Token: the, POS: DET, Dependency: det
Token: library, POS: NOUN, Dependency: pobj
