In [2]:
import nltk

In [266]:
from typing import List, Optional, Tuple

In [267]:
class NgramModel:
    def __init__(self, measures, finder, corpus, n_gram) -> None:
        self.model = finder.from_words(corpus).score_ngrams(measures.student_t)
        self.n_gram = n_gram
        
    def compare_ngram(self, this: Optional[Tuple[str]], that: Tuple[str]) -> bool:
        """ a function to compare the n-1 words in the nltk collocation model
        Args:
            this (str or list of strings): the user input
            that (list): the existing tuples up to range N-1
        """
        if not this:
            return False

        return list(that[0][:len(this)]) == this
    
    def predict(self, tokens: List[str], n_words: int=4, return_ngram: bool = False) -> List[str]:
        n_tokens = tokens[-(self.n_gram - 1):]
        model_name = "bigram" if self.n_gram == 2 else "trigram"
        print(f"{model_name} tokens: {n_tokens}")
        probas = [w for w in self.model if self.compare_ngram(n_tokens, w)]
        best_probas = sorted(probas, key=lambda x: x[1], reverse=True)
        # best_words = [w[0][-1] for w in best_probas]
        if return_ngram:
            return [w[0] for w in best_probas][:n_words]
        return [w[0][-1] for w in best_probas][:n_words]

In [3]:
document1 = "I love cats"

In [4]:
document2 = "I love dogs"

In [5]:
document3 = "I love cats, but I also like dogs"

In [11]:
tokenize = nltk.word_tokenize(document3)
print(tokenize)

['I', 'love', 'cats', ',', 'but', 'I', 'also', 'like', 'dogs']


In [20]:
FD = nltk.FreqDist(tokenize)

In [36]:
TF = {"love": 0, "like": 0}
for value, count in FD.items():
    if(value == "like"):
        TF["like"] = count / len(FD)
    if(value == "love"):
        TF["love"] = count / len(FD)
TF

{'love': 0.125, 'like': 0.125}

In [45]:
IDF = {"love": 0, "like": 0}

alldocs = document1,document2,document3
print(alldocs)
countterms = {"love": 0, "like": 0}
for doc in alldocs:
    for word in doc.split():
        if word == "love":
            countterms["love"] = countterms["love"] + 1
        if word == "like":
            countterms["like"] = countterms["like"] + 1
            
IDF["love"] = len(alldocs) / countterms["love"]
IDF["like"] = len(alldocs) / countterms["like"]

IDF

('I love cats', 'I love dogs', 'I love cats, but I also like dogs')


{'love': 1.0, 'like': 3.0}

In [43]:
tf_idf = {"love": 0, "like": 0}
for (word1, score1), (word2, score2) in zip(TF.items(), IDF.items()):
    print(word1)
    print(word2)
    tf_idf[word1] = float(score1 * score2)

tf_idf

love
love
like
like


{'love': 0.125, 'like': 0.375}

In [55]:
alldocs2 = document1 +" " + document2 + " " + document3
docsplit = alldocs2.split()

nltk_textcollection = TextCollection(docsplit)



unseen_sent = "I love cats, but I also like dogs"
# produce all tf_idf scores for the given sentence
tf_vector = {}
for word in unseen_sent.split():
    tf_vector[word] = (nltk_textcollection.tf_idf(word, unseen_sent))
tf_vector

{'I': 0.07592502839365867,
 'love': 0.046680152755974216,
 'cats,': 0.07997143423076541,
 'but': 0.07997143423076541,
 'also': 0.07997143423076541,
 'like': 0.07997143423076541,
 'dogs': 0.058966974213797374}

In [56]:
from nltk.tag import DefaultTagger
from nltk.tag import UnigramTagger 

In [57]:
from nltk.corpus import brown

In [62]:
sent = "I saw her duck"

In [63]:
train_data = brown.tagged_sents()

In [64]:
test_data = brown.tagged_sents()

In [65]:
tag1 = DefaultTagger('NN')

In [66]:
tag2 = UnigramTagger(train_data, backoff = tag1)

In [69]:
for word, tag in tag2.tag(sent.split()):
...     print(word, '->', tag)

I -> PPSS
saw -> VBD
her -> PP$
duck -> VB


In [94]:
doc = nlp("I saw her duck")

In [95]:
doc

I saw her duck

In [96]:
print(f"{'text':{8}} {'POS':{6}} {'TAG':{6}} {'Dep':{6}} {'POS explained':{20}} {'tag explained'} ")

text     POS    TAG    Dep    POS explained        tag explained 


In [97]:
for token in doc:
    print(f'{token.text:{8}} {token.pos_:{6}} {token.tag_:{6}} {token.dep_:{6}} {spacy.explain(token.pos_):{20}} {spacy.explain(token.tag_)}')

I        PRON   PRP    nsubj  pronoun              pronoun, personal
saw      VERB   VBD    ROOT   verb                 verb, past tense
her      PRON   PRP$   poss   pronoun              pronoun, possessive
duck     NOUN   NN     dobj   noun                 noun, singular or mass


In [72]:
corpus = nltk.corpus.brown.sents()

In [76]:
corpus[0:100]

[['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.'], ['The', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.'], ...]

In [114]:
corpus[0:10]

'The Fulton'

In [80]:
words, tags = [], []
start_words = []
for doc in nlp.pipe(corpus, batch_size=1000):
    _words, _tags = zip(*[(t.text, t.pos_) for t in doc])
    start_words.append(_words[0])
    # _words = [w.lower() for w in _words if w.isalnum()]
    words.extend(_words)
    tags.extend(_tags)


In [127]:
for token in doc[:10]:
    print(token)

The
Fulton
County
Grand
Jury
said
Friday
an
investigation
of


In [132]:
sent1 = list(doc.sents)[0]
sent1

The Fulton County Grand Jury said Friday an investigation of Atlanta's recent primary election produced `` no evidence '' that any irregularities took place .

In [460]:
nlp = spacy.load("en_core_web_md")

In [361]:
import spacy

In [353]:
import numpy as np

In [461]:
corpus = " ".join(nltk.corpus.brown.words(categories="news"))

In [373]:
doc = nlp(corpus)

In [168]:
your_word = "God"
ms = nlp.vocab.vectors.most_similar(np.asarray([nlp.vocab.vectors[nlp.vocab.strings[your_word]]]), n=10)
words = [nlp.vocab.strings[w] for w in ms[0][0]]
print(words)

['Inkblot', 'Olos', 'Jesus-', 'Christlikeness', 'Krivine', 'Midheaven', 'Allakaket', 'kopitiam', 'gracelessly', 'celestine']


In [159]:
doc1 = nlp("I like salt")
doc2 = nlp("Fast food is salty.")

print(doc1, "<->", doc2, doc1.similarity(doc2))

I like salt <-> Fast food is salty. 0.22491289330745162


In [314]:
print(doc1, "<->", nlp("The Empire State building is in New York"), doc1.similarity(nlp("The Empire State building is in New York")))

I like salt <-> The Empire State building is in New York -0.16069842020932387


In [179]:
print(len(list(doc.sents)))

4157


In [347]:
nlp.analyze_pipes()

{'summary': {'tagger': {'assigns': ['token.tag'],
   'requires': [],
   'scores': ['tag_acc'],
   'retokenizes': False}},
 'problems': {'tagger': []},
 'attrs': {'token.tag': {'assigns': ['tagger'], 'requires': []}}}

In [421]:
corpus = [preprocess(sent) for sent in nltk.corpus.brown.sents(categories="news")]

In [420]:
def preprocess(sent):
    sent = " ".join(sent)
    sent = re.sub(r"[^\w,.!?]", " ", sent)
    sent = re.sub(r"\s+", " ", sent)
    return sent.strip()


In [315]:
bigram_to_predict = ("hey", "you")

In [463]:
docs = nlp(corpus)

In [357]:
for token in docs:
    print(token.tag_)

PRP
VBP
VBG
NNS


In [269]:
class TrigramModel(NgramModel):
    def __init__(self, corpus) -> None:
        super().__init__(
            measures=nltk.collocations.TrigramAssocMeasures(),
            finder=nltk.collocations.TrigramCollocationFinder,
            corpus=corpus,
            n_gram=3,
        )

In [270]:
model = TrigramModel(corpus)

In [276]:
model.predict(("hey", "you"))

<bound method NgramModel.compare_ngram of <__main__.TrigramModel object at 0x00000182BC5C7D60>>


In [396]:
testdoc = nlp("hey you")

In [397]:
for token in testdoc:
    print(token.tag_)

UH
PRP


In [466]:
def predict(docs, bigram_to_predict):
    nlpedinput = nlp(" ".join(list(bigram_to_predict)))
    inputtags = []
    for token in nlpedinput:
        inputtags.append(token.tag_)
    tags = []
    for token in doc:
        tags.append((token.text, token.tag_))
    tagbigrams = nltk.bigrams(tags)
    tagsfreqdist = nltk.FreqDist(tagbigrams)
    print(list(tagsfreqdist))
    trigrams = nltk.trigrams(tags)
    freqdist = nltk.FreqDist(trigrams)
    freqdistlist = sorted(freqdist.items(), key = lambda kv : -kv[1])
    good_tag = ""
    for trigram, value in freqdistlist:
        trigramlist = list(trigram)
        if((trigramlist[0], trigramlist[1]) == (inputtags[0], inputtags[1])):
            good_tag = trigramlist[2]
    

            
    
predict(docs, ("hey", "there"))
        




In [259]:
nlpedinput = nlp(" ".join(list(bigram_to_predict)))
inputtags = []
for token in nlpedinput:
    inputtags.append(token.tag_)

In [260]:
inputtags

['UH', 'PRP']

In [280]:
import textacy

In [286]:
doctesting = nlp(" ".join(corpus))

In [289]:
doctesting.retokenize()

<spacy.tokens._retokenize.Retokenizer at 0x1833f4ff3d0>

In [None]:
for tag in doctesting.retokenize() as retokenizer:
    for tag in doc.tag

In [297]:
tagger = nlp.add_pipe("tagger")

ValueError: [E007] 'tagger' already exists in pipeline. Existing names: ['tok2vec', 'tagger', 'parser', 'senter', 'attribute_ruler', 'lemmatizer', 'ner']

ImportError: cannot import name 'Tagger' from 'spacy' (C:\Users\Ivar\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\spacy\__init__.py)

In [495]:
import spacy

In [496]:
import nltk

In [497]:
from nltk.collocations import *

In [498]:
corpus = nltk.corpus.brown.words(categories = "news")

In [501]:
trigram_measures = nltk.collocations.TrigramAssocMeasures()

In [499]:
docs = nlp(" ".join(corpus))

In [502]:
def predict(doc, bigram_to_predict):
    nlpedinput = nlp(" ".join(list(bigram_to_predict)))
    inputtags = []
    for token in nlpedinput:
        inputtags.append(token.tag_)
    tags = []
    texts = []
    for token in doc:
        tags.append(token.tag_)
        tags.append(token.text)

    finder = TrigramCollocationFinder.from_words(tags)
    findertexts = TrigramCollocationFinder.from_words(texts)
    scored = finder.score_ngrams(trigram_measures.raw_freq)
    scoredtexts = findertexts.score_ngrams(trigram_measures.raw_freq)
    alllist = []
    for tuple1,tuple2 in zip(scored,scoredtexts):
        alllist.append((tuple1, tuple2))
    print(alllist[0:50])
    good_tag = ""
    good_trigram = ()
    for trigram in scored:
        trigramlist = list(trigram)
        if((trigramlist[0],trigramlist[1]) == bigram_to_predict):
            good_tag = trigramlist[2]
            good_trigram = trigram
    filter_trigrams_from_words = []
    finderWords = TrigramCollocationFinder.from_words(tags)
    
        

    scored = finder.score_ngrams(trigram_measures.raw_freq)
    best = sorted(finder.nbest(trigram_measures.raw_freq, 2))
    print(best)
    
predict(doc, ("hey", "there"))
        

[]
[('DT', 'the', 'NN'), ('TO', 'to', 'VB')]
