## 1. Dataset creation

In [3]:
import pandas as pd

reviews = pd.read_csv("Movies_and_TV.csv")
reviews["label"] = reviews["rating"].apply(lambda x: 1 if x >= 4 else 0)

print(reviews["label"].value_counts(normalize=True))

label
1    0.807584
0    0.192416
Name: proportion, dtype: float64


Results

Percentage 1 = 0.807584
Percentage 0 = 0.192416

This means that there are way more positive reviews than negative ones

## 2. Preprocessing & Analysis

Tokenize and preprocess data

In [None]:
import re
import string

def clean_data(inhoud):
    
    #Check for any non-text input
    if not isinstance(inhoud, str):
        return ""   

    #Turn all text into lowercase text
    inhoud = inhoud.lower()

    #Remove numbers and any punctuation
    inhoud = re.sub(r"\d+", "", inhoud)
    inhoud = inhoud.translate(str.maketrans("", "", string.punctuation))

    #Split the text into singular words
    words = inhoud.split()

    #Put a space between the words when they get returned in the cleaned_text
    return " ".join(words)

reviews['cleaned_text'] = reviews['text'].apply(clean_data)

Descriptive statistics

Number of reviews

In [5]:
print("Number of reviews: ", len(reviews))

Number of reviews:  102024


Average words per review

In [6]:
reviews["total_words"] = reviews["cleaned_text"].apply(lambda x: len(x.split()))
print("Average words per review: ", round(reviews["total_words"].mean(), 2))

Average words per review:  52.6


Number of unique words

In [7]:
alle_woorden = " ".join(reviews["cleaned_text"]).split()
print("Number of unique words:", len(set(alle_woorden)))

Number of unique words: 103056


Show the 50 most common words in the corpus after preprocessing

In [None]:
from collections import Counter

alle_woorden = " ".join(reviews["cleaned_text"]).split()
woord_frequentie = Counter(alle_woorden)
top50 = woord_frequentie.most_common(50)

for word, freq in top50:
    print(f"{word}: {freq}")

the: 280562
and: 162310
a: 144252
to: 124728
of: 115297
i: 105824
is: 92520
this: 89461
it: 86838
in: 71284
for: 49989
that: 49909
was: 48605
movie: 47882
with: 43430
but: 38735
you: 38625
as: 37591
br: 36336
on: 34823
not: 31065
are: 30600
have: 27259
great: 24773
my: 24209
one: 23184
good: 23033
so: 22501
be: 22148
all: 21640
like: 19516
they: 19356
love: 18767
its: 17824
at: 17759
very: 17142
an: 16819
he: 16664
his: 16658
from: 16544
if: 16542
just: 16514
more: 16016
her: 14859
who: 14661
about: 14560
story: 14303
there: 14245
show: 14153
or: 14136


## 3. Bag-Of-Word Model

Remove stopwords

In [10]:
from nltk.corpus import stopwords

alle_woorden = " ".join(reviews["cleaned_text"]).split()
stop_words = set(stopwords.words("english"))

words = [w for w in alle_woorden if w not in stop_words]



Show the 100 most common words after stopword removal

In [11]:
woord_frequentie_100 = Counter(words)
top100 = woord_frequentie_100.most_common(100)

for word, freq in top100:
    print(f"{word}: {freq}")

movie: 47882
br: 36336
great: 24773
one: 23184
good: 23033
like: 19516
love: 18767
story: 14303
show: 14153
watch: 14092
series: 13533
film: 13522
time: 12676
really: 12502
would: 11952
well: 11246
see: 9960
first: 9646
much: 9568
get: 9454
movies: 9169
dvd: 9067
season: 9019
even: 8199
watching: 7932
also: 7702
dont: 7384
many: 6642
characters: 6591
people: 6559
way: 6555
best: 6552
loved: 6478
could: 6431
little: 6395
two: 6365
back: 6248
think: 6246
better: 6044
acting: 6020
fun: 5996
life: 5982
still: 5897
make: 5790
im: 5654
enjoyed: 5565
watched: 5511
know: 5446
never: 5435
funny: 5397
family: 5346
made: 5319
workout: 5309
work: 5289
new: 5211
old: 5078
want: 4832
bad: 4776
enjoy: 4762
didnt: 4736
seen: 4686
end: 4638
interesting: 4628
lot: 4546
character: 4498
excellent: 4420
years: 4374
find: 4278
times: 4260
every: 4184
minutes: 4119
recommend: 4097
go: 4086
worth: 4084
shows: 4080
always: 4073
set: 3971
going: 3965
cant: 3962
bit: 3946
actors: 3946
video: 3930
done: 3829
say:

Calculate mutual information for these 100 words with the target label and report the most informative ones

In [22]:
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_extraction.text import CountVectorizer

top100_justwords = [w for w, _ in top100]

vector = CountVectorizer(vocabulary=top100_justwords)
X = vector.fit_transform(reviews['cleaned_text'])
Y = reviews['label']

mutual_info_score = mutual_info_classif(X, Y, discrete_features=True)

mutual_info_score_dataframe = pd.DataFrame({'Word': top100_justwords, 'Mutual Information': mutual_info_score})
mutual_info_score_dataframe = mutual_info_score_dataframe.sort_values(by="Mutual Information", ascending=False)
mutual_info_score_dataframe_clean = mutual_info_score_dataframe.reset_index(drop=True)

print(mutual_info_score_dataframe_clean.head(20).to_string(index=False))

     Word  Mutual Information
    great            0.011072
      bad            0.008628
     love            0.007399
    didnt            0.005409
  minutes            0.004700
    loved            0.004484
    would            0.003707
    could            0.003451
   better            0.003246
excellent            0.003056
     like            0.002974
     dont            0.002915
     plot            0.002769
     even            0.002525
  enjoyed            0.002509
   series            0.002063
     much            0.001883
   acting            0.001773
      get            0.001672
    movie            0.001576
