In [1]:
# Data handling
import json
import numpy as np
import pandas as pd
from rich import print

# NLP
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import preprocessing
from nltk import word_tokenize

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style="whitegrid")


def extract_tf_idf(documents, ngram_range=(1, 3)):
    """ Compute the inverse document frequency of every n-gram in a corpus.

    documents = iterable of strings, one entry per document

    ngram_range = (min_n, max_n) passed straight to TfidfVectorizer

    Returns a DataFrame with one row per vocabulary entry and two columns:
    'Word' (the n-gram) and 'IDF' (its fitted idf_ weight, kept numeric so that
    sorting on 'IDF' orders by value rather than lexicographically).
    """
    tfidf_vectorizer = TfidfVectorizer(max_features=None, ngram_range=ngram_range)
    # Only the fitted vocabulary and idf_ weights are needed, not the TF-IDF matrix.
    tfidf_vectorizer.fit(documents)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
    # when it exists and fall back for older installations.
    if hasattr(tfidf_vectorizer, "get_feature_names_out"):
        tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
    else:
        tfidf_feature_names = tfidf_vectorizer.get_feature_names()

    # Build the frame column-wise: stacking names and weights in one NumPy array
    # would coerce the float weights to strings.
    df_idf = pd.DataFrame({'Word': tfidf_feature_names, 'IDF': tfidf_vectorizer.idf_})

    return df_idf

def get_top_n_words(corpus, n=None, stopwords=None, ngram_range=(1, 1)):
    """ Rank the vocabulary of a corpus by total number of occurrences.

    corpus = iterable of strings, one entry per document

    n = how many (term, count) pairs to return; None returns all of them

    stopwords = forwarded to CountVectorizer as stop_words

    ngram_range = (min_n, max_n) forwarded to CountVectorizer

    Returns a list of (term, count) tuples sorted by count, highest first.
    """
    vectorizer = CountVectorizer(ngram_range=ngram_range, stop_words=stopwords)
    vectorizer.fit(corpus)

    # Collapse the document-term matrix into a single row of per-term totals.
    totals = vectorizer.transform(corpus).sum(axis=0)

    ranked = [(term, totals[0, column]) for term, column in vectorizer.vocabulary_.items()]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:n]


def visualize_word_frequenty(data, n):
    """ Draw a horizontal bar chart of the first n (label, value) pairs in data.

    data = sliceable sequence of pairs; item [0] is used as the bar label
    and item [1] as the bar length

    n = number of leading entries to plot
    """
    top = data[:n]
    labels = [pair[0] for pair in top]
    amounts = [pair[1] for pair in top]

    # Draw on the axes created here instead of relying on pyplot's implicit
    # "current axes" state.
    _, ax = plt.subplots(figsize=(10, 4))
    sns.barplot(y=labels, x=amounts, color='b', ax=ax)

    
def c_tf_idf(documents, m, ngram_range=(1, 1)):
    """ Compute class-based TF-IDF scores.

    Every class is represented by one long string (all of its documents joined
    together), and every vocabulary term gets one score per class.

    documents = list with one string per class. With 2 classes of 200 documents
    each, this is a list of 2 strings, each the concatenation of 200 documents.

    m = total number of (un-joined) documents over all classes

    ngram_range = (min_n, max_n) forwarded to CountVectorizer

    Returns (tf_idf, count): tf_idf is an array with one row per vocabulary term
    and one column per class; count is the fitted CountVectorizer.
    """
    count = CountVectorizer(ngram_range=ngram_range)
    count.fit(documents)

    # Transposed so that rows are terms and columns are classes.
    term_counts = np.array(count.transform(documents).todense()).T

    # Add-one smoothed term frequency: (count + 1) / (words in class + 1).
    words_per_class = term_counts.sum(axis=0)
    tf = np.divide(term_counts + 1, words_per_class + 1)

    # IDF uses each term's total frequency across all classes.
    term_totals = np.array(term_counts.sum(axis=1)).T
    idf = np.log(np.divide(m, term_totals)).reshape(-1, 1)

    return np.multiply(tf, idf), count

# with open('coco_reviews.json') as f:
#     coco = json.load(f)
    
# with open('frozen_reviews.json') as f:
#     frozen = json.load(f)

## Analysis - Top n Words

In [83]:
# freq = get_top_n_words(coco, n=100, stopwords="english", ngram_range=(1, 1))

In [123]:
# visualize_word_frequenty(freq, 20)

In [485]:
# df_idf = extract_tf_idf(coco, ngram_range=(1, 3))

In [124]:
# visualize_word_frequenty([tuple(r) for r in df_idf.sort_values("IDF", ascending=False).to_numpy()], 20)

## C-TF-IDF

In [3]:
# Load the reviews: a mapping of movie title -> list of 2-item entries whose
# second item is the review text.
with open('../data/pixar_reviews.json') as f:
    reviews = json.load(f)

# One joined "class document" per movie, plus the total number of reviews (m).
titles = list(reviews)
documents = [" ".join(text for _, text in reviews[title]) for title in titles]
m = sum(len(reviews[title]) for title in titles)
tf_idf, count = c_tf_idf(documents, m, ngram_range=(1, 1))

In [5]:
# Rows = vocabulary terms, columns = movie titles.
# get_feature_names() was removed in scikit-learn 1.2; use the replacement when
# available and fall back for older installations.
feature_names = (
    count.get_feature_names_out()
    if hasattr(count, "get_feature_names_out")
    else count.get_feature_names()
)
result = pd.DataFrame(tf_idf, index=feature_names, columns=titles)

In [7]:
# Ten terms with the highest class-based TF-IDF score for "Toy Story 3".
result.sort_values("Toy Story 3", ascending=False).head(10)

Unnamed: 0,Toy Story,A Bug's Life,Toy Story 2,"Monsters, Inc.",Finding Nemo,The Incredibles,Cars,Ratatouille,WALL-E,Up,...,Brave,Monsters University,Inside Out,The Good Dinosaur,Finding Dory,Cars 3,Coco,Incredibles 2,Toy Story 4,Onward
toys,0.010536,0.000276,0.005572,0.000307,0.00019,0.000117,0.000346,0.00013,0.000119,9.2e-05,...,0.000111,9.8e-05,0.000123,9.4e-05,9.2e-05,0.000467,5.5e-05,5.6e-05,0.00472,3.3e-05
andy,0.01021,3.3e-05,0.004478,1.5e-05,6.2e-05,2.6e-05,2.5e-05,1.1e-05,1.2e-05,1.7e-05,...,1.4e-05,4e-05,3.8e-05,5e-05,1.6e-05,5.1e-05,1.1e-05,1.1e-05,0.002733,4.7e-05
lotso,0.000103,6.4e-05,6.9e-05,3e-05,2e-05,1.7e-05,2.5e-05,2.1e-05,1.2e-05,3.4e-05,...,5.6e-05,3.9e-05,3e-05,3.3e-05,3.2e-05,5e-05,2.2e-05,2.2e-05,0.000308,4.6e-05
college,2.8e-05,0.000103,5.5e-05,0.000142,9.8e-05,4.1e-05,5.9e-05,5.1e-05,2e-05,2.7e-05,...,2.2e-05,0.008532,9.5e-05,0.00013,2.6e-05,8e-05,8.8e-05,5.3e-05,0.000492,3.7e-05
woody,0.010603,0.000661,0.010158,0.000147,9.4e-05,1.8e-05,5.3e-05,3.8e-05,2.6e-05,1.8e-05,...,6e-05,5.6e-05,5.3e-05,3.5e-05,2.3e-05,1.8e-05,1.6e-05,8e-06,0.009648,3.3e-05
daycare,3.7e-05,6.8e-05,3.7e-05,3.1e-05,6.5e-05,1.8e-05,5.3e-05,2.3e-05,2.6e-05,1.8e-05,...,3e-05,4.2e-05,1.6e-05,3.5e-05,6.8e-05,5.3e-05,2.3e-05,2.4e-05,5.8e-05,4.9e-05
toy,0.004659,0.001151,0.004958,0.000879,0.000502,0.00034,0.000544,0.000281,0.000193,0.000244,...,0.000224,0.00033,0.000332,0.000185,0.00027,0.000226,0.000209,9.3e-05,0.004977,0.000215
ken,3.7e-05,6.9e-05,3.7e-05,3.2e-05,2.2e-05,0.000111,2.7e-05,2.3e-05,1.3e-05,1.8e-05,...,3e-05,4.2e-05,1.6e-05,3.5e-05,3.4e-05,5.4e-05,2.4e-05,2.4e-05,7.8e-05,4.9e-05
trilogy,0.00027,0.000168,0.000717,7.7e-05,0.000196,0.000165,0.000194,3.7e-05,8.5e-05,8.9e-05,...,0.000171,0.000375,6.5e-05,0.000114,8.4e-05,0.001527,9.6e-05,0.000116,0.001941,0.000199
buzz,0.014812,0.000221,0.008557,0.000175,0.000131,5.1e-05,0.000134,7.3e-05,3.6e-05,3.4e-05,...,9.7e-05,7.7e-05,7.3e-05,4.8e-05,3.1e-05,2.5e-05,5.4e-05,1.1e-05,0.004214,6.8e-05


In [26]:
# For every movie keep its 200 highest-scoring terms as (word, score) pairs.
top_200_words = {}
for movie in titles:
    # Sort once and read both labels and scores from the same ordering.
    ranked = result[[movie]].sort_values(movie, ascending=False)
    words = ranked.index[:200]
    values = ranked.values[:200].flatten()
    top_200_words[movie] = list(zip(words, values))

In [27]:
# Persist the per-movie top terms as JSON (tuples are written as lists).
with open('../data/pixar_tfidf.json', 'w') as f:
    json.dump(top_200_words, f)

In [29]:
# Importance = a term's WALL-E score divided by its summed score over all other movies.
# "Importance" is dropped too (when present) so that re-running this cell does not
# fold the previous result into the denominator. The keyword form of drop() is
# used because the positional axis argument was removed in pandas 2.0.
other_movies = result.drop(columns=["WALL-E", "Importance"], errors="ignore")
result["Importance"] = result["WALL-E"].values / other_movies.sum(axis=1).values

In [31]:
# Ten terms with the highest "Importance" ratio computed in the previous cell.
result.sort_values("Importance", ascending=False).head(10)

Unnamed: 0,Toy Story,A Bug's Life,Toy Story 2,"Monsters, Inc.",Finding Nemo,The Incredibles,Cars,Ratatouille,WALL-E,Up,...,Monsters University,Inside Out,The Good Dinosaur,Finding Dory,Cars 3,Coco,Incredibles 2,Toy Story 4,Onward,Importance
eve,7.2e-05,4.4e-05,7.1e-05,2e-05,1.4e-05,1.2e-05,1.7e-05,3e-05,0.009365,2.4e-05,...,2.7e-05,2.1e-05,4.5e-05,2.2e-05,3.5e-05,1.5e-05,1.5e-05,1.3e-05,6.3e-05,14.166724
robots,2.8e-05,5.2e-05,5.5e-05,2.4e-05,1.6e-05,0.000291,0.000139,3.4e-05,0.006652,0.00011,...,3.1e-05,4.8e-05,2.6e-05,5.2e-05,8.1e-05,5.3e-05,5.3e-05,1.5e-05,3.7e-05,5.124182
robot,2.4e-05,4.4e-05,2.3e-05,4e-05,1.4e-05,0.000718,5.1e-05,5.8e-05,0.008775,0.000128,...,0.000134,0.000102,4.5e-05,2.2e-05,3.4e-05,3e-05,9.1e-05,4.9e-05,3.1e-05,4.707481
cockroach,4.1e-05,7.7e-05,4.1e-05,3.5e-05,2.4e-05,2.1e-05,3e-05,2.5e-05,0.002356,4.1e-05,...,4.7e-05,1.8e-05,3.9e-05,3.8e-05,6e-05,2.6e-05,2.6e-05,2.2e-05,5.5e-05,3.05475
700,4.2e-05,7.8e-05,4.1e-05,3.6e-05,2.5e-05,2.1e-05,3e-05,2.6e-05,0.002252,2.1e-05,...,4.7e-05,1.8e-05,3.9e-05,3.9e-05,6.1e-05,2.7e-05,2.7e-05,2.2e-05,5.5e-05,2.889654
axiom,4.1e-05,7.6e-05,8.2e-05,3.5e-05,2.4e-05,2.1e-05,2.9e-05,2.5e-05,0.002352,2e-05,...,4.7e-05,1.8e-05,3.9e-05,7.6e-05,6e-05,2.6e-05,2.6e-05,2.2e-05,5.5e-05,2.83838
wall,0.000227,4e-05,8.6e-05,0.000102,5.1e-05,0.000124,0.000124,0.000167,0.015508,0.001451,...,0.000331,0.000342,0.000307,0.000231,0.000299,0.00038,2.1e-05,2.8e-05,0.00023,2.65868
earth,0.000148,0.000118,4.2e-05,0.000109,0.000138,0.000148,0.000152,0.000196,0.00982,0.000199,...,7.2e-05,9.1e-05,0.001579,3.9e-05,0.000123,0.000108,0.000122,4.4e-05,0.000112,2.53996
spaceship,0.000205,7.6e-05,4.1e-05,3.5e-05,2.4e-05,2e-05,2.9e-05,2.5e-05,0.002317,2e-05,...,4.6e-05,1.8e-05,3.9e-05,3.8e-05,6e-05,2.6e-05,2.6e-05,2.2e-05,5.4e-05,2.441615
ship,0.000129,6e-05,9.7e-05,2.8e-05,0.000211,4.9e-05,2.3e-05,6e-05,0.004305,0.000128,...,7.3e-05,7e-05,6.1e-05,9e-05,9.4e-05,4.1e-05,0.000187,6.8e-05,8.6e-05,2.244284


In [511]:
# NOTE(review): the stored output below has only Coco / Frozen / Importance columns,
# so this cell appears to date from an earlier two-movie run (see the commented-out
# coco/frozen loading code near the top). With the Pixar `result` built above, a
# "Frozen" column is presumably absent and this would raise a KeyError; confirm or remove.
result.sort_values("Frozen", ascending=False).head(10)

Unnamed: 0,Coco,Frozen,Importance
sister,6.2e-05,0.00261,0.023765
ice,7e-06,0.002563,0.00286
olaf,0.000199,0.002476,0.080221
snow,2.4e-05,0.002469,0.009812
powers,2.4e-05,0.002462,0.009904
hans,1.7e-05,0.002413,0.007067
snowman,1.7e-05,0.0024,0.007183
queen,1.8e-05,0.002382,0.007348
let,0.000433,0.002376,0.182069
princess,3.5e-05,0.002372,0.014759


## Wordcloud

In [113]:
from os import path
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import os

from wordcloud import WordCloud, STOPWORDS
import random

%matplotlib inline

def grey_color_func(word, font_size, position, orientation, random_state=None,
                    **kwargs):
    """ Colour callback for WordCloud.recolor: return an HSL colour string.

    None of the arguments are used. Both randint bounds are 255, so the
    lightness is always 255 and every call returns the same string.
    NOTE(review): 255% is above the usual 0-100% HSL lightness range; confirm
    this is intended.
    """
    lightness = random.randint(255, 255)
    return "hsl(0, 0%%, %d%%)" % lightness

In [114]:
# Rebuild the score table from scratch so that this cell does not depend on the
# "Importance" column added for another movie earlier on.
# get_feature_names() was removed in scikit-learn 1.2; fall back for older installs.
feature_names = (
    count.get_feature_names_out()
    if hasattr(count, "get_feature_names_out")
    else count.get_feature_names()
)
result = pd.DataFrame(tf_idf, index=feature_names, columns=titles)

# Importance = a term's Coco score divided by its summed score over all other movies.
# The keyword form of drop() is used because the positional axis argument was
# removed in pandas 2.0.
result["Importance"] = result["Coco"].values / result.drop(columns="Coco").sum(axis=1).values

# Sort once and take the 5000 most Coco-specific terms with their ratios.
ranked = result.sort_values("Importance", ascending=False)
words = ranked.index[:5000]
values = ranked['Importance'].values[:5000]

# word -> importance mapping, the input format of WordCloud.generate_from_frequencies.
freq = dict(zip(words, values))
text = " ".join([review[1] for review in reviews['Coco']])

# Shape mask for the word cloud, resized to a fixed 2048x2048 canvas.
mask = Image.open("../Naamloos1.png")
mask = np.array(mask.resize((2048, 2048)))

In [131]:
# `stopwords` was never defined in this notebook, so the original call only worked
# with leftover kernel state; use the STOPWORDS set imported from wordcloud.
wc = WordCloud(background_color="black", max_words=2000, mask=mask,
               stopwords=STOPWORDS, contour_width=3, contour_color=None, min_font_size=1)
wc.generate_from_frequencies(freq)


# store default colored image
default_colors = wc.to_array()
plt.title("Custom colors")
# Recolour every word through grey_color_func before displaying.
plt.imshow(wc.recolor(color_func=grey_color_func, random_state=3),
           interpolation="bilinear")
plt.axis("off")
# plt.show()
# plt.savefig("result.png", dpi=900)

In [130]:
# image = wc.to_image(); image

In [129]:
# image.save("result.png")