In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import ngrams
from collections import Counter

In [None]:
# Single-document corpus: the adjacent string literals below are joined by
# implicit concatenation (no commas), so the list holds exactly ONE string.
# Every sentence except the last ends with a space so that whitespace
# tokenisation (used for the n-grams) never fuses the last word of one
# sentence with the first word of the next (e.g. "horses.Two").
corpus = [
    "King Krishnadevaraya loved horses and had the best collection of horse breeds in the Kingdom. "
    "Well, one day, a trader came to the King and told him that he had brought with him a horse of the best breed in Arabia. "
    "He invited the King to inspect the horse. "
    "King Krishnadevaraya loved the horse; so the trader said that the King could buy this one and that he had two more like this one, back in Arabia that he would go back to get. "
    "The King loved the horse so much that he had to have the other two as well. "
    "He paid the trader 5000 gold coins in advance. "
    "The trader promised that he would return within two days with the other horses. "
    "Two days turned into two weeks, and still, there was no sign of the trader and the two horses. "
    "One evening, to ease his mind, the King went on a stroll in his garden. "
    "There he spotted Tenali Raman writing down something on a piece of paper. "
    "Curious, the King asked Tenali what he was jotting down. "
    "Tenali Raman was hesitant, but after further questioning, he showed the King the paper. "
    "On the paper was a list of names, the King’s being at the top of the list. "
    "Tenali said these were the names of the biggest fools in the Vijayanagara Kingdom! "
    "As expected, the King was furious that his name was at the top and asked Tenali Raman for an explanation. "
    "Tenali referred to the horse story, saying the King was a fool to believe that the trader, a stranger, would return after receiving 5000 gold coins. "
    "Countering his argument, the King then asked, what happens if/when the trader does come back? "
    "In true Tenali humour, he replied saying, in that case, the trader would be a bigger fool, and his name would replace the King’s on the list!"
]

In [None]:
# TF-IDF
# Learn the vocabulary from `corpus` and produce one weighted row per document.
# NOTE(review): `corpus` contains a single document, and the recorded matrix
# output is made of integer multiples of one constant -- i.e. with only one
# document the scores carry no cross-document (IDF) information.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(raw_documents=corpus)

# Column labels of `tfidf_matrix`, in column order.
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

In [None]:
# N-grams
# Orders of n-gram to collect: unigrams, bigrams and trigrams.
ngram_sizes = [1, 2, 3]

# One independent (initially empty) frequency table per n-gram order.
ngram_counts = dict((size, Counter()) for size in ngram_sizes)

In [None]:
# Count n-grams of every configured size over whitespace-separated tokens.
# Each counter is cleared first so that re-running this cell recounts from
# scratch instead of adding to the totals left by a previous run.
for n in ngram_sizes:
    ngram_counts[n].clear()
    for document in corpus:  # each corpus item is a whole document
        ngram_counts[n].update(ngrams(document.split(), n))

# Calculate probabilities for N-grams: relative frequency of each n-gram
# among all n-grams of the same size.
ngram_probabilities = {}
for n in ngram_sizes:
    # Computed once per size; an empty counter yields an empty dict below,
    # so a zero total never reaches the division.
    total = sum(ngram_counts[n].values())
    ngram_probabilities[n] = {
        gram: count / total for gram, count in ngram_counts[n].items()
    }

In [None]:
# Print results
# Heading and vocabulary array, each on its own line.
print("TF-IDF Feature Names:", tfidf_feature_names, sep="\n")

TF-IDF Feature Names:
['5000' 'advance' 'after' 'an' 'and' 'arabia' 'argument' 'as' 'asked' 'at'
 'back' 'be' 'being' 'believe' 'best' 'bigger' 'biggest' 'breed' 'breeds'
 'brought' 'but' 'buy' 'came' 'case' 'coins' 'collection' 'come' 'could'
 'countering' 'curious' 'day' 'days' 'does' 'down' 'ease' 'evening'
 'expected' 'explanation' 'fool' 'fools' 'for' 'furious' 'further'
 'garden' 'get' 'go' 'gold' 'had' 'happens' 'have' 'he' 'hesitant' 'him'
 'his' 'horse' 'horses' 'humour' 'if' 'in' 'inspect' 'into' 'invited'
 'jotting' 'king' 'kingdom' 'krishnadevaraya' 'like' 'list' 'loved' 'mind'
 'more' 'much' 'name' 'names' 'no' 'of' 'on' 'one' 'other' 'paid' 'paper'
 'piece' 'promised' 'questioning' 'raman' 'receiving' 'referred' 'replace'
 'replied' 'return' 'said' 'saying' 'showed' 'sign' 'so' 'something'
 'spotted' 'still' 'story' 'stranger' 'stroll' 'tenali' 'that' 'the'
 'then' 'there' 'these' 'this' 'to' 'told' 'top' 'trader' 'true' 'turned'
 'two' 'vijayanagara' 'was' 'weeks' 'well'

In [None]:
# Dense view of the sparse TF-IDF matrix, printed under its heading.
print("\nTF-IDF Matrix:", tfidf_matrix.toarray(), sep="\n")

# One section per n-gram size, one "<n-gram tuple>: <probability>" line each.
for n in ngram_sizes:
    print(f"\n{n}-gram Probabilities:")
    for gram, probability in ngram_probabilities[n].items():
        print(f"{gram}: {probability}")


TF-IDF Matrix:
[[0.03899147 0.01949573 0.03899147 0.01949573 0.13647013 0.03899147
  0.01949573 0.03899147 0.0584872  0.03899147 0.0584872  0.01949573
  0.01949573 0.01949573 0.03899147 0.01949573 0.01949573 0.01949573
  0.01949573 0.01949573 0.01949573 0.01949573 0.01949573 0.01949573
  0.03899147 0.01949573 0.01949573 0.01949573 0.01949573 0.01949573
  0.01949573 0.03899147 0.01949573 0.03899147 0.01949573 0.01949573
  0.01949573 0.01949573 0.03899147 0.01949573 0.01949573 0.01949573
  0.01949573 0.01949573 0.01949573 0.01949573 0.03899147 0.07798293
  0.01949573 0.01949573 0.21445307 0.01949573 0.03899147 0.09747867
  0.1169744  0.0584872  0.01949573 0.01949573 0.15596587 0.01949573
  0.01949573 0.01949573 0.01949573 0.27294027 0.03899147 0.03899147
  0.01949573 0.0584872  0.0584872  0.01949573 0.01949573 0.01949573
  0.03899147 0.03899147 0.01949573 0.13647013 0.07798293 0.07798293
  0.03899147 0.01949573 0.0584872  0.01949573 0.01949573 0.01949573
  0.0584872  0.01949573 0.019495