In [1]:
# Markov Model: Predict
import numpy as np

In [2]:
P = np.array([
    [0.6,0.3,0.1],
    [0.4,0.5,0.1],
    [0.2,0.4,0.4]
])

initial_state = np.array([1,0,0])

steps = 3

state_distribution = initial_state

for _ in range(steps):

    state_distribution = np.dot(state_distribution,P)

print("State distribution after 3 steps : ")

print(f"Sunny : {state_distribution[0]}")
print(f"Rainy: {state_distribution[1]}")
print(f"Cloudy : {state_distribution[2]}")

State distribution after 3 steps : 
Sunny : 0.474
Rainy: 0.38699999999999996
Cloudy : 0.139


In [4]:
import pandas as pd
from collections import defaultdict, Counter

df = pd.read_csv('movie_reviews.csv')
df

Unnamed: 0,movie_title,rating,genre,in_theaters_date,movie_info,directors,director_gender,tomatometer_rating,audience_rating,critics_consensus
0,A Dog's Journey,PG,"Drama, Kids & Family",5/17/19,Bailey (voiced again by Josh Gad) is living th...,Gail Mancuso,female,50,92,A Dog's Journey is as sentimental as one might...
1,A Dog's Way Home,PG,Drama,01/11/2019,"Separated from her owner, a dog sets off on an...",Charles Martin Smith,male,60,71,A Dog's Way Home may not quite be a family-fri...
2,A Tuba to Cuba,NR,"Documentary, Musical & Performing Arts",2/15/19,The leader of New Orleans' famed Preservation ...,"Danny Clinch, T.G. Herrington",male,100,82,
3,A Vigilante,R,Drama,3/29/19,"A once abused woman, Sadie (Olivia Wilde), dev...",Sarah Daggar-Nickson,female,92,50,Led by Olivia Wilde's fearless performance and...
4,After,PG-13,"Drama, Romance",04/12/2019,Based on Anna Todd's best-selling novel which ...,Jenny Gage,female,17,72,"Tepid and tired, After's fun flourishes are le..."
...,...,...,...,...,...,...,...,...,...,...
161,Velvet Buzzsaw,R,"Comedy, Drama, Mystery & Suspense",02/01/2019,Velvet Buzzsaw is a satirical thriller set in ...,Dan Gilroy,male,63,36,If you only watch one art-world satire with ho...
162,What Men Want,R,Comedy,02/08/2019,Inspired by the Nancy Meyers hit romantic come...,Adam Shankman,male,44,33,"Admittedly uneven but easy to like, What Men W..."
163,Wild Rose,R,"Comedy, Drama",6/21/19,WILD ROSE tells the complicated story of Rose-...,Tom Harper,male,93,88,"There's no shortage of star-is-born stories, b..."
164,Wine Country,R,Comedy,05/08/2019,In honor of Rebecca (Rachel Dratch)'s 50th bir...,Amy Poehler,female,66,30,Wine Country's comedy might not be quite as ro...


In [5]:
texts = df["critics_consensus"].dropna()

# transitions = {
#     'fun':{'movie':3, 'experience':2},
#     'great':{'plot':1, 'acting':2}
# }

transitions = defaultdict(Counter)

for text in texts:

    words = text.lower().split()

    for i in range(len(words)-1):

        transitions[words[i]][words[i+1]]+=1

for current_word, next_words in transitions.items():

    print(f"{current_word} ----> {dict(next_words)}")

a ----> {"dog's": 2, 'tear': 1, 'family-friendly': 1, 'vigilante': 1, 'thoughtful': 1, 'pivotal': 3, 'good': 3, 'satisfying': 2, 'smart': 1, 'diverting': 1, 'captive': 1, 'welcome': 1, 'disappointing': 3, 'liam': 1, 'sophisticated': 1, 'completely': 1, 'fun': 1, 'second': 1, 'classic': 1, 'crowded': 2, 'thoroughly': 2, 'career-best': 1, 'step': 1, 'handful': 1, 'truly': 1, 'passing': 1, 'grounded': 1, 'single': 1, 'potent': 1, 'complex': 1, 'talented': 1, 'poignant': 1, 'few': 2, 'bonkers': 1, 'reasonably': 2, 'pair': 2, 'deeply': 1, 'funnier': 1, 'sci-fi': 1, 'sequel': 1, 'tenth': 1, 'well-made': 2, 'visually': 1, 'feel-good': 1, 'star': 3, 'heist': 1, 'galvanizing': 1, 'thought-provoking': 1, 'disappointingly': 2, 'familiar': 2, 'big': 1, 'story': 4, 'grimly': 2, 'commendable': 1, 'sharp': 1, 'real-life': 1, 'stellar': 1, 'satisfyingly': 1, 'labored': 1, 'true': 1, 'franchise': 1, 'horror': 1, 'future': 2, 'compelling': 1, 'visual': 1, 'little': 1, 'solid': 1, 'towering': 1, 'documen

In [6]:
transition_matrix = {}

for current_word, next_words in transitions.items():

    total = sum(next_words.values())

    transition_matrix[current_word] = {

        next_word:count/total for next_word, count in next_words.items()
    }

next_words.items()

dict_items([('--', 1)])

In [7]:
current = 'thriller'

if current in transition_matrix:

    next_words = transition_matrix[current]

    sorted_next = sorted(next_words.items(), key=lambda x:x[1], reverse=True)

    print(f"Most likely next word(s) after '{current}' : ")

    for word, prob in sorted_next:

        print(f"{word} ----> {prob}")

else:

    print(f"The word '{current}' was not found in the transition matrix. ")

Most likely next word(s) after 'thriller' : 
that ----> 0.2
doesn't ----> 0.2
may ----> 0.2
-- ----> 0.2
from ----> 0.2


In [8]:

def calculate_entropy(transition_matrix):

    entropy = {}

    for current_word, next_probs in transition_matrix.items():

        entropy[current_word] = -sum(p*np.log2(p) for p in next_probs.values() if p > 0)

    return entropy

entropy = calculate_entropy(transition_matrix)

print("Entropy for each word : ")

for word, H in entropy.items():

    print(f"{word} ----> {H}")

Entropy for each word : 
a ----> 6.838869969720591
dog's ----> 1.0
journey ----> -0.0
is ----> 4.266764026202947
as ----> 4.335970858598519
sentimental ----> -0.0
one ----> 2.321928094887362
might ----> 2.807354922057604
expect, ----> -0.0
but ----> 4.663320189106002
even ----> 2.2516291673878226
cynical ----> -0.0
viewers ----> 2.2359263506290326
may ----> 2.999581230646064
find ----> 2.321928094887362
their ----> 1.0
ability ----> -0.0
to ----> 5.5569576083772985
resist ----> 1.0
shedding ----> -0.0
tear ----> -0.0
stretched ----> -0.0
the ----> 6.970627692490496
puppermost ----> -0.0
way ----> 2.0
home ----> 1.584962500721156
not ----> 3.418295834054489
quite ----> 0.7219280948873623
be ----> 3.94770277922009
family-friendly ----> -0.0
animal ----> -0.0
drama ----> 2.75
fan's ----> -0.0
best ----> 1.9219280948873623
friend, ----> -0.0
this ----> 4.165894208390023
canine ----> -0.0
adventure ----> -0.0
no ----> 2.321928094887362
less ----> 1.9219280948873623
heartwarming ----> -0.0
f

In [9]:
# find the 10 maximum and minimum entropies
#(i.e. word after which word is the worst to predict and which ones are the easiest to predict)


df = pd.DataFrame(list(entropy.items()), columns=['word', 'entropy'])

df_max = df.sort_values(by='entropy', ascending=False).head(10)

df_max

Unnamed: 0,word,entropy
22,the,6.970628
0,a,6.83887
42,its,5.946686
49,and,5.857225
17,to,5.556958
85,of,5.227227
54,an,4.88879
80,it,4.871178
9,but,4.66332
41,for,4.454636


In [10]:
df_min = df.sort_values(by='entropy', ascending=True).head(10)

df_min

Unnamed: 0,word,entropy
1518,nose,-0.0
1247,legend,-0.0
34,"friend,",-0.0
1249,"parts,",-0.0
36,canine,-0.0
37,adventure,-0.0
1250,powerfully,-0.0
1251,"told,",-0.0
40,heartwarming,-0.0
1252,last,-0.0


In [12]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------- ------------------------ 5.0/12.8 MB 23.2 MB/s eta 0:00:01
     --------------------------------------  12.6/12.8 MB 34.2 MB/s eta 0:00:01
     --------------------------------------- 12.8/12.8 MB 28.6 MB/s eta 0:00:00
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [13]:
# Find the entropies for stop words

import spacy

nlp = spacy.load('en_core_web_sm')

series=[]
for word in df['word']:
    series.append(nlp(word)[0].is_stop)

df = df[series]

df

Unnamed: 0,word,entropy
0,a,6.838870
3,is,4.266764
4,as,4.335971
6,one,2.321928
7,might,2.807355
...,...,...
1477,either,-0.000000
1485,you,-0.000000
1491,others,-0.000000
1509,they,-0.000000


In [14]:
for i in df['word']:
    print(i)

a
is
as
one
might
but
even
may
their
to
the
not
quite
be
this
no
less
for
its
by
and
an
that
after's
are
down
with
if
it
never
of
keep
up
still
themselves
more
than
all
at
always
my
in
own
between
his
have
done
before
isn't
some
there's
back
doesn't
much
get
when
doing
while
does
whatever
on
although
it's
enough
from
along
make
that's
just
name
across
has
seem
well
it,
nothing
what's
often
through
out
well-matched
over-the-top
over
should
five
further
few
or
almost
she's
a-list
into
made
so
which
say
everything
will
well-made
really
how
your
many
well-suited
another
most
which,
behind
whose
her
yet
above
also
against
nowhere
itself
well-acted
amount
take
well-executed
could
have,
something
about,
what
first
anywhere
anything
other
go
below
next
though
five-star
we
only
they're
who
rather
already
don't
all,
can
being
side
well-acted,
beyond
well-framed,
well-cast,
well-intentioned,
would
all-ages
sometimes
last
part
by-the-numbers
then
about
due
any
around
well-told
without
ever
full
th

In [15]:
# Find entropies of stopwords

stop_words = set(['a', 'an', 'the', 'and', 'or', 'but',
                  'if', 'or','to','in','on','at','by',
                  'this','for','from','as','for','with',])

stop_word_entropy = {word:entropy.get(word, 0) for word in stop_words}

for word, H in stop_word_entropy.items():
    print(f"Entropy of '{word}' : {H}")

Entropy of 'but' : 4.663320189106002
Entropy of 'if' : 2.1556390622295662
Entropy of 'an' : 4.8887896794220005
Entropy of 'and' : 5.857225337483041
Entropy of 'a' : 6.838869969720591
Entropy of 'the' : 6.970627692490496
Entropy of 'or' : 2.807354922057604
Entropy of 'to' : 5.5569576083772985
Entropy of 'in' : 3.7191821684284743
Entropy of 'on' : 3.1462863706621045
Entropy of 'at' : 2.6416041678685933
Entropy of 'for' : 4.454636150748477
Entropy of 'from' : 3.057476076289932
Entropy of 'as' : 4.335970858598519
Entropy of 'with' : 3.891853096329677
Entropy of 'by' : 4.152492594515677
Entropy of 'this' : 4.165894208390023
