In [5]:
# Markov Model: Predict
import numpy as np

In [6]:
# Row-stochastic transition matrix: P[i, j] is the probability of moving
# from state i to state j in a single step. The state order matches the
# labels printed below (0 = Sunny, 1 = Rainy, 2 = Cloudy).
P = np.array([
    [0.6, 0.3, 0.1],
    [0.4, 0.5, 0.1],
    [0.2, 0.4, 0.4],
])

# A transition matrix is only valid if every row sums to 1; fail early
# if the matrix is edited into an invalid state.
assert np.allclose(P.sum(axis=1), 1.0), "each row of P must sum to 1"

# All probability mass starts on state 0 (Sunny).
initial_state = np.array([1, 0, 0])

# Number of transitions to simulate.
steps = 3

state_distribution = initial_state

# Propagate the distribution one step at a time: pi_{t+1} = pi_t . P
for _ in range(steps):
    state_distribution = np.dot(state_distribution, P)

# Report the configured step count rather than a hard-coded number so the
# message stays correct when `steps` is changed.
print(f"State distribution after {steps} steps : ")

print(f"Sunny : {state_distribution[0]}")
print(f"Rainy: {state_distribution[1]}")
print(f"Cloudy : {state_distribution[2]}")

State distribution after 3 steps : 
Sunny : 0.474
Rainy: 0.38699999999999996
Cloudy : 0.139


In [7]:
import pandas as pd
from collections import defaultdict, Counter

# Location of the movie-reviews CSV. This is an absolute path, so the cell
# raises FileNotFoundError on any machine where the file is not at exactly
# this location.
REVIEWS_CSV_PATH = '/content/movie_reviews.csv'

df = pd.read_csv(REVIEWS_CSV_PATH)
df

FileNotFoundError: [Errno 2] No such file or directory: '/content/movie_reviews.csv'

In [None]:
# Critics-consensus texts with missing entries dropped.
texts = df["critics_consensus"].dropna()

# Shape of the structure built below (current word -> counts of next words):
# transitions = {
#     'fun':{'movie':3, 'experience':2},
#     'great':{'plot':1, 'acting':2}
# }

transitions = defaultdict(Counter)

for text in texts:
    # Lower-case and split on whitespace; punctuation stays attached to tokens.
    tokens = text.lower().split()

    # Pair every token with its immediate successor and count the pair.
    for head, follower in zip(tokens, tokens[1:]):
        transitions[head][follower] += 1

# Dump the raw successor counts for every word seen.
for head, followers in transitions.items():
    print(f"{head} ----> {dict(followers)}")

In [None]:
# Convert raw successor counts into conditional probabilities:
# transition_matrix[w][v] = count(w -> v) / sum of all counts leaving w.
transition_matrix = {}

for current_word, next_words in transitions.items():

    # Total number of observed transitions out of `current_word`.
    total = sum(next_words.values())

    transition_matrix[current_word] = {
        next_word: count / total for next_word, count in next_words.items()
    }

# Sanity check: every row is a probability distribution. (This replaces a
# stray trailing `next_words.items()` expression that only displayed the
# loop variable left over from the final iteration and raised NameError
# when `transitions` was empty.)
assert all(
    abs(sum(probs.values()) - 1.0) < 1e-9 for probs in transition_matrix.values()
), "every row of transition_matrix must sum to 1"

In [None]:
# Word whose most likely successors we want to list.
current = 'thriller'

# Successor distribution for the query word; None when the word never
# appeared as the first element of a transition.
successors = transition_matrix.get(current)

if successors is None:
    print(f"The word '{current}' was not found in the transition matrix. ")
else:
    # Highest probability first.
    ranked = sorted(successors.items(), key=lambda pair: pair[1], reverse=True)

    print(f"Most likely next word(s) after '{current}' : ")

    for candidate, probability in ranked:
        print(f"{candidate} ----> {probability}")

In [None]:

def calculate_entropy(transition_matrix):
    """Compute the Shannon entropy (in bits) of each word's next-word distribution.

    Parameters
    ----------
    transition_matrix : dict
        Maps each word to a dict of ``{next_word: probability}``.

    Returns
    -------
    dict
        Maps each word to ``-sum(p * log2(p))`` over its positive
        probabilities, as a plain Python ``float``. A word with a single
        certain successor (or no successors at all) gets exactly ``0.0``.
    """
    entropy = {}

    for current_word, next_probs in transition_matrix.items():

        # Zero-probability entries are skipped: log2(0) is undefined and the
        # term's contribution is taken to be 0.
        bits = -sum(p * np.log2(p) for p in next_probs.values() if p > 0)

        # Negating a zero sum yields -0.0 (printed as "-0.0"), and an empty
        # distribution yields the int 0. Converting to float and adding 0.0
        # normalises both to a plain, non-negative 0.0 without changing any
        # non-zero value.
        entropy[current_word] = float(bits) + 0.0

    return entropy

# Entropy of the next-word distribution for every word in the matrix.
entropy = calculate_entropy(transition_matrix)

print("Entropy for each word : ")

# One line per word, in the dictionary's insertion order.
for token, bits in entropy.items():
    print(f"{token} ----> {bits}")

In [None]:
# Find the 10 highest and 10 lowest entropies, i.e. the words whose
# successor is hardest to predict and the words whose successor is easiest
# to predict.
# NOTE: this rebinds `df`, which previously held the loaded reviews table.

word_entropy_pairs = list(entropy.items())
df = pd.DataFrame(word_entropy_pairs, columns=['word', 'entropy'])

# Ten words with the highest entropy.
df_max = df.sort_values(by='entropy', ascending=False).head(10)

df_max

In [None]:
# Ten words with the lowest entropy (sort_values is ascending by default).
df_min = df.sort_values('entropy').head(10)

df_min

In [None]:
# Find the entropies for stop words

import spacy

nlp = spacy.load('en_core_web_sm')

# Flag each word by the `is_stop` attribute of its first token. Only
# tokenisation is needed to read that flag, so `nlp.make_doc` (tokenizer
# only) is used instead of `nlp(word)`, which would run every pipeline
# component once per word.
is_stop_mask = [nlp.make_doc(word)[0].is_stop for word in df['word']]

# Keep only the rows whose word is flagged as a stop word.
# NOTE: this rebinds `df` to the filtered frame.
df = df[is_stop_mask]

df

In [None]:
# Print every value of the 'word' column, one per line.
for token in df['word']:
    print(token)

In [None]:
# Find entropies of a hand-picked list of stop words.

stop_words = {
    'a', 'an', 'the', 'and', 'or', 'but',
    'if', 'to', 'in', 'on', 'at', 'by',
    'this', 'for', 'from', 'as', 'with',
}

# Build the mapping in sorted order: iteration order of a set of strings can
# differ from one interpreter run to the next, which made the printed order
# unreproducible.
# A word that is absent from `entropy` is reported as 0, which is
# indistinguishable from a word whose successor is fully determined.
stop_word_entropy = {word: entropy.get(word, 0) for word in sorted(stop_words)}

for word, H in stop_word_entropy.items():
    print(f"Entropy of '{word}' : {H}")