## Perplexity

In [None]:
pip install pyplexity



In [None]:
import nltk

# Fetch the NLTK resources this notebook asks for.
nltk.download('punkt')
# NOTE(review): recent NLTK releases load the word_tokenize data from
# 'punkt_tab' rather than 'punkt'; requesting both keeps the tokenizer cell
# runnable on either version — confirm against the installed NLTK.
nltk.download('punkt_tab')
# Kept as the last expression so the cell still displays its return value.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
from pyplexity import PerplexityModel, PerplexityProcessor

# Threshold handed to the processor as `perpl_limit`.
PERPLEXITY_LIMIT = 8000.0
# Demo input: an ordinary sentence followed by noisy, markup-like tokens.
noisy_text = "This is a normal sentence. Meanwhile, hjldfuia HTML BODY this one will be deleted LINK URL COUISUDOANLHJWQKEJK"

# Build the perplexity model from its identifier string.
model = PerplexityModel.from_str("bigrams-cord19")
text_processor = PerplexityProcessor(
    perpl_model=model,
    perpl_limit=PERPLEXITY_LIMIT,
)
# The processed text is kept in `clean_text`; this cell does not display it.
clean_text = text_processor.process(noisy_text)

Loading model... Done.


In [None]:
import nltk
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.lm import MLE
from nltk.lm import Vocabulary

def tokenize_lower(sentences):
    """Tokenize every sentence with NLTK's word tokenizer and lowercase each token."""
    return [
        [token.lower() for token in nltk.tokenize.word_tokenize(sentence)]
        for sentence in sentences
    ]


def padded_bigrams(tokens):
    """Return the bigrams of one tokenized sentence, padded with <s> and </s>.

    The result is a list rather than a one-shot generator, so the same
    bigrams can be scored and then reused for the perplexity computation.
    """
    return list(
        nltk.bigrams(
            tokens,
            pad_left=True,
            pad_right=True,
            left_pad_symbol="<s>",
            right_pad_symbol="</s>",
        )
    )


n = 2  # order of the language model (bigrams)

# --- Training -------------------------------------------------------------
train_sentences = ['an apple', 'an orange']
train_tokens = tokenize_lower(train_sentences)
train_data = [padded_bigrams(tokens) for tokens in train_tokens]

# The vocabulary holds every training word plus the two padding symbols.
words = [word for sent in train_tokens for word in sent]
words.extend(["<s>", "</s>"])
padded_vocab = Vocabulary(words)

model = MLE(n)
model.fit(train_data, padded_vocab)

# --- Evaluation -----------------------------------------------------------
test_sentences = ['an apple', 'an ant']
test_tokens = tokenize_lower(test_sentences)
test_data = [padded_bigrams(tokens) for tokens in test_tokens]

# Per-bigram estimates: each entry is ((word, context), score).
for test in test_data:
    print("MLE Estimates:", [((ngram[-1], ngram[:-1]), model.score(ngram[-1], ngram[:-1])) for ngram in test])

# 'an ant' contains a word never seen in training; its bigrams score 0 under
# the unsmoothed MLE model, so the reported perplexity for it is infinite.
for sentence, test in zip(test_sentences, test_data):
    print("PP({0}):{1}".format(sentence, model.perplexity(test)))

MLE Estimates: [(('an', ('<s>',)), 1.0), (('apple', ('an',)), 0.5), (('</s>', ('apple',)), 1.0)]
MLE Estimates: [(('an', ('<s>',)), 1.0), (('ant', ('an',)), 0.0), (('</s>', ('ant',)), 0)]
PP(an apple):1.2599210498948732
PP(an ant):inf


## Naive Bayes

In [None]:
import pandas as pd
import numpy as np

# Toy data set: eight animals described by four attributes plus the class label.
# `legs`, `smell` and `height` alternate row by row, in step with `species`.
N_PAIRS = 4  # each two-row pattern is repeated four times, giving eight rows

data = {
    'colour': ['white', 'white', 'green', 'green', 'white', 'green', 'green', 'white'],
    'legs': [2, 3] * N_PAIRS,
    'smell': ['yes', 'no'] * N_PAIRS,
    'height': ['tall', 'short'] * N_PAIRS,
    'species': ['mammals', 'not-mammals'] * N_PAIRS,
}

# One column per dictionary key, in insertion order.
df = pd.DataFrame.from_dict(data)
df

Unnamed: 0,colour,legs,smell,height,species
0,white,2,yes,tall,mammals
1,white,3,no,short,not-mammals
2,green,2,yes,tall,mammals
3,green,3,no,short,not-mammals
4,white,2,yes,tall,mammals
5,green,3,no,short,not-mammals
6,green,2,yes,tall,mammals
7,white,3,no,short,not-mammals


In [None]:
def calculate_frequency_table(attribute, data=None):
    """Count how often each value of ``attribute`` occurs with each species.

    Parameters
    ----------
    attribute : str
        Name of the column whose values become the rows of the table.
    data : pandas.DataFrame, optional
        Frame containing ``attribute`` and a ``species`` column. When omitted,
        the notebook-level ``df`` is used.

    Returns
    -------
    pandas.DataFrame
        Integer counts with one row per attribute value and one column per
        species; combinations that never occur are 0.
    """
    if data is None:
        data = df
    pair_counts = data.groupby([attribute, 'species']).size()
    # fill_value=0 keeps the counts integer; unstack() followed by fillna(0)
    # turns every column that had a gap into floats.
    freq_table = pair_counts.unstack(fill_value=0)
    return freq_table

# Display the value-by-species counts for the `colour` attribute.
calculate_frequency_table('colour')

species,mammals,not-mammals
colour,Unnamed: 1_level_1,Unnamed: 2_level_1
green,2,2
white,2,2


In [None]:
def calculate_likelihood_table(attribute):
    """Estimate P(attribute value | species) from the frequency table.

    Parameters
    ----------
    attribute : str
        Name of the column to build the table for.

    Returns
    -------
    pandas.DataFrame
        Same layout as the frequency table (rows: attribute values, columns:
        species), with every species column scaled to sum to 1.
    """
    freq_table = calculate_frequency_table(attribute)
    # Divide each species column by that species' total count. Normalising
    # along rows instead would give P(species | value), which is not the
    # per-class likelihood that Naive Bayes multiplies together.
    class_totals = freq_table.sum(axis=0)
    likelihood_table = freq_table.div(class_totals, axis='columns')
    return likelihood_table

# Display the likelihood table derived from the `colour` counts.
calculate_likelihood_table('colour')

species,mammals,not-mammals
colour,Unnamed: 1_level_1,Unnamed: 2_level_1
green,0.5,0.5
white,0.5,0.5


In [None]:
def predict(sample, likelihood_tables, classes=None):
    """Pick the species whose per-attribute table entries multiply to the largest value.

    Parameters
    ----------
    sample : dict
        Maps attribute name to the observed value. A ``'species'`` key, if
        present, is ignored.
    likelihood_tables : dict
        Maps attribute name to a table indexed by attribute value with one
        column per species.
    classes : iterable, optional
        Candidate species labels, in tie-breaking order. When omitted, the
        distinct values of the notebook-level ``df['species']`` are used.

    Returns
    -------
    The label with the highest product. On a tie the label that comes first
    in ``classes`` wins. No class prior is applied.

    Raises
    ------
    KeyError
        If an attribute has no table, or a value or label is missing from it.
    """
    if classes is None:
        classes = df['species'].unique()

    scores = {}
    for species in classes:
        score = 1
        for attribute, value in sample.items():
            if attribute == 'species':
                # The label itself is not a feature.
                continue
            score *= likelihood_tables[attribute].loc[value, species]
        scores[species] = score
    return max(scores, key=scores.get)


# One likelihood table per feature column; the slice leaves out the last
# column of `df`, which holds the species label.
likelihood_tables = {
    attribute: calculate_likelihood_table(attribute)
    for attribute in df.columns[:-1]
}

likelihood_tables

{'colour': species  mammals  not-mammals
 colour                       
 green        0.5          0.5
 white        0.5          0.5,
 'legs': species  mammals  not-mammals
 legs                         
 2            1.0          0.0
 3            0.0          1.0,
 'smell': species  mammals  not-mammals
 smell                        
 no           0.0          1.0
 yes          1.0          0.0,
 'height': species  mammals  not-mammals
 height                       
 short        0.0          1.0
 tall         1.0          0.0}

In [None]:
# Pick one row of the training frame to classify. A seeded generator makes
# the choice, and therefore the printed output, repeatable on a fresh kernel.
SEED = 42
rng = np.random.default_rng(SEED)
sample_index = rng.integers(0, df.shape[0])
# Drop the last column (the species label) so only the features are passed on.
sample = df.iloc[sample_index, :-1].to_dict()

# Predict output
prediction = predict(sample, likelihood_tables)
print("Sample:", sample)
print("Predicted species:", prediction)

Sample: {'colour': 'green', 'legs': 3, 'smell': 'no', 'height': 'short'}
Predicted species: not-mammals
