# Media Trope Visualization
Author: Ra Cohen
Purpose: Visualize the relationships between tropes and media in addition to local recommendations.

In [None]:
import numpy as np
import pandas as pd
pd.set_option('display.max_colwidth', None)
import re
import json

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

from sklearn.cluster import KMeans

In [None]:
import access
import recommend

Read in the data sets and collect the IDs of all entries that are films.

In [None]:
# Film/IMDb match table; its 'tconst' column supplies the film ids used in later cells.
film_csv_path = 'data/film_imdb_match.csv'
film_df = pd.read_csv(film_csv_path)

In [None]:
# Plain Python list of every value in the 'tconst' column, in row order.
film_ids = list(film_df['tconst'])

In [None]:
# Mapping keyed by trope; each value is the collection of media ids listed under that trope.
with open('data/tropes_to_media.json') as tropes_file:
    tropes_to_media_ids = json.load(tropes_file)

# Mapping keyed by media id; each value is the collection of tropes listed under that media.
with open('data/media_to_tropes.json') as media_file:
    media_ids_to_tropes = json.load(media_file)

## Media per Trope
For each Trope, how many pieces of media use that trope?

In [None]:
# One row per trope: the trope key and the number of media entries listed under it.
counts = [len(media_ids) for media_ids in tropes_to_media_ids.values()]
media_per_trope = pd.DataFrame({
    'trope': list(tropes_to_media_ids.keys()),
    'Number of Media Using Each Trope': counts,
})

In [None]:
# Histogram of the per-trope media counts, drawn with log scaling enabled.
trope_hist_fig, trope_hist_ax = plt.subplots()
sns.histplot(
    data=media_per_trope,
    x="Number of Media Using Each Trope",
    log_scale=True,
    ax=trope_hist_ax,
)
plt.show()

Now let's separate TV from films to see whether trope usage differs between them.

In [None]:
# Split each trope's usage count into films and everything else.
# A media id counts as a film when it appears in film_ids; any other id is
# counted as TV (there is no separate TV id list to check against).

# Set membership is O(1); testing against the film_ids list directly would
# rescan the whole list once per trope usage.
film_id_lookup = set(film_ids)

film_counts = []
tv_counts = []
# Iterate through all tropes (same order as the dict keys used for the 'trope' column)
for media_ids in tropes_to_media_ids.values():
    # Count the usages of this trope that are films
    film_count = sum(1 for id_ in media_ids if id_ in film_id_lookup)
    film_counts.append(film_count)
    # Every remaining usage is treated as TV
    tv_counts.append(len(media_ids) - film_count)

# Column order (trope, Film, TV) matters: later cells index this frame by position.
film_tv_per_trope = pd.DataFrame({
    'trope': list(tropes_to_media_ids.keys()),
    'Film': film_counts,
    'TV': tv_counts,
})

Let's look at the most popular tropes in film and tv to annotate our graph.

In [None]:
# Keep tropes that clear either cut-off: more than 300 TV uses or more than 700 film uses.
popular_tv_tropes_filter = film_tv_per_trope['TV'].gt(300)
popular_movie_tropes_filter = film_tv_per_trope['Film'].gt(700)
popular_tropes = film_tv_per_trope.loc[popular_movie_tropes_filter | popular_tv_tropes_filter]
popular_tropes

In [None]:
plt.figure(figsize=(16,10))
ax = sns.histplot(data=film_tv_per_trope, x="Film", y="TV", bins=50, cbar=True)
plt.title('Bivariate Histogram of the Number of TV Shows and Movies Using Each Trope')

# Annotations
# One (dx, dy) text offset per row of popular_tropes, in row order.
# Each label is placed at (Film count + dx, TV count + dy).
label_offsets = [
    (-15, 15),  # Horror Films
    (-50, 10),  # Shout Out
    (-50, 10),  # Retroactive Recognition
    (-15, 25),  # Oh Crap
    (-15, 15),  # Chekhov's Gun
    (-15, 10),  # Films of the
    (-15, 10),  # British Series
    (-15, 20),  # Short Runners
    (-15, 10),  # Short Runner
]
for row, (dx, dy) in enumerate(label_offsets):
    # Positional columns of popular_tropes: 0 = trope, 1 = Film, 2 = TV
    trope_name = popular_tropes.iloc[row, 0]
    film_uses = popular_tropes.iloc[row, 1]
    tv_uses = popular_tropes.iloc[row, 2]
    ax.text(film_uses + dx, tv_uses + dy, trope_name)

plt.show()

## Tropes per Media
For each piece of content, how many tropes does it use?

In [None]:
# One row per media id: the id and the number of tropes listed under it.
counts = [len(tropes) for tropes in media_ids_to_tropes.values()]
tropes_per_media = pd.DataFrame({
    'media': list(media_ids_to_tropes.keys()),
    'Number of Tropes Used in Media': counts,
})

In [None]:
# Histogram of the per-media trope counts, drawn with log scaling enabled.
media_hist_fig, media_hist_ax = plt.subplots()
sns.histplot(
    data=tropes_per_media,
    x="Number of Tropes Used in Media",
    log_scale=True,
    ax=media_hist_ax,
)
plt.show()

Now let's see if there are any trends in the quantity of tropes in media based on whether it is a TV show or a film.

In [None]:
# Label every media id as 'Film' when it appears in film_ids, otherwise 'TV'.
# Set membership is O(1); testing against the film_ids list directly would
# rescan the whole list once per media id.
film_id_lookup = set(film_ids)
film_vs_tv = [
    'Film' if id_ in film_id_lookup else 'TV'
    for id_ in tropes_per_media['media']
]
tropes_per_media['Film vs TV'] = film_vs_tv

Now let's grab the most trope-rich media and sort them for an annotation.

In [None]:
# Media that use more than 1500 tropes, with their titles, sorted from most to fewest tropes.
popular_tropes_per_media_filter = tropes_per_media['Number of Tropes Used in Media'] > 1500
# .copy() gives an independent frame, so adding the 'Title' column below does not
# write into a slice of tropes_per_media (which raises SettingWithCopyWarning).
popular_tropes_per_media = tropes_per_media[popular_tropes_per_media_filter].copy()
# Look up a display name for each media id via the project's access module.
names = [access.get_media_name(id_) for id_ in popular_tropes_per_media['media']]
popular_tropes_per_media['Title'] = names
sorted_popular_tropes_per_media = popular_tropes_per_media.sort_values('Number of Tropes Used in Media', ascending=False)
# Renumber rows from 0 in sorted order and keep only the two columns used for the annotation.
sptpm = sorted_popular_tropes_per_media.reset_index()[['Number of Tropes Used in Media', 'Title']]
sptpm

In [None]:
plt.figure(figsize=(14,8))
ax = sns.histplot(
    data=tropes_per_media,
    x="Number of Tropes Used in Media",
    hue="Film vs TV",
    multiple="dodge",
    log_scale=True,
)
plt.title('Histogram of the Number of Tropes Used by TV and Movies')

# Text listing of the first 12 rows of sptpm, one line per entry, stacked downwards.
ax.text(400, 1100, "Top 12 Trope-rich Media")
for rank in range(12):
    trope_count = sptpm.iloc[rank, 0]
    media_title = sptpm.iloc[rank, 1]
    line_y = 1000 - 50 * rank
    ax.text(150, line_y, "{} - {}".format(trope_count, media_title))

# Short dashed red marker at x = 1500, spanning the bottom 5% of the axes height.
ax.axvline(x=1500, ymin=0, ymax=.05, linestyle='dashed', color='red')

plt.show()

# Visualization of Recommendation
Given a recommendation grouping, does the K-means clustering provide any human-intelligible benefit? Should we include the cluster graph in the user interface?

In [None]:
# Manually go through recommendation steps, first candidate retrieval
# NOTE(review): the two lists look like media titles and trope names — confirm
# against recommend.find_candidates.
media_titles = ['The Wiggles Movie (1997)']
trope_names = ['Trust Password']
original_ids, candidates = recommend.find_candidates(media_titles, trope_names)

In [None]:
# Expanded tag data for the retrieved candidates. It is passed straight to
# TSNE.fit_transform and KMeans.fit in later cells, so presumably a
# samples-by-features matrix — TODO confirm against recommend.find_candidate_expanded_tags.
the_wiggles = recommend.find_candidate_expanded_tags(candidates)

In [None]:
# Initialize visualization
# t-SNE is stochastic; a fixed random_state makes the 2-D embedding (and every
# plot built from it) reproducible across kernel restarts.
tsne = TSNE(
    n_components=2,
    metric='manhattan',
    init='pca',
    verbose=1,
    random_state=42,
)

# Project the candidate data down to two dimensions for plotting.
tsne_data = tsne.fit_transform(the_wiggles)

In [None]:
# Scatter the two t-SNE components against each other.
tsne_fig, tsne_ax = plt.subplots(figsize=(16,12))
tsne_ax.scatter(tsne_data[:, 0], tsne_data[:, 1])
tsne_ax.set_title("tsne in 2-dims on all Content-Trope data")
plt.show()

In [None]:
# Initialize Clustering
# K-means starts from random centroids; a fixed random_state keeps the cluster
# assignments (and the colours in the plot built from them) stable across runs.
kmeans = KMeans(n_clusters=4, random_state=42)

# Fit and predict in one step on the same data
cluster_labels = kmeans.fit_predict(the_wiggles)

# Centers
centers = kmeans.cluster_centers_

In [None]:
# Visualize clustering
# Same t-SNE coordinates as the previous scatter, coloured by K-means cluster label.
cluster_fig, cluster_ax = plt.subplots(figsize=(16,12))
cluster_ax.scatter(tsne_data[:, 0], tsne_data[:, 1], c=cluster_labels)
cluster_ax.set_title("Clusters in 2-dims on all Content-Trope data")
plt.show()

I don't see any benefit in including this in the user interface.