# Market Basket Analysis Tropes
by Ra Cohen 
June 13, 2023

In [1]:
import numpy as np
import pandas as pd
import re
import json

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.preprocessing import MultiLabelBinarizer

from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

import access

In [2]:
# Read the media -> tropes mapping from disk; `with` closes the handle afterwards.
with open('data/media_to_tropes.json') as fh:
    media_to_tropes = json.load(fh)

# orient='index' makes every top-level key a row label and spreads that key's
# values across positional columns.
# NOTE(review): presumably each value is a list of trope names -- confirm
# against the JSON file.
media_to_tropes_raw = pd.DataFrame.from_dict(data=media_to_tropes, orient='index')

In [3]:
# Start from an empty frame that re-uses the row labels of the raw wide frame.
media_to_tropes_df = pd.DataFrame(index=media_to_tropes_raw.index)

# Collapse every wide row back into a single list, dropping the None cells.
# NOTE(review): the None values are presumably padding introduced when rows of
# unequal length were spread across columns -- confirm.
media_to_tropes_df['tropes'] = [
    [trope for trope in row if trope is not None]
    for row in media_to_tropes_raw.values.tolist()
]

In [4]:
# One-hot encode the trope lists: one boolean column per distinct label found
# by the binarizer, one row per entry of media_to_tropes_df.
mlb = MultiLabelBinarizer()
dum = mlb.fit_transform(media_to_tropes_df['tropes'])

df = pd.DataFrame(
    data=dum.astype(bool),
    index=media_to_tropes_df.index,
    columns=mlb.classes_,
)
df

Unnamed: 0,A Birthday Not A Break,A Bloody Mess,A Boy A Girl And A Baby Family,A Boy And His X,A Cappella,A Cat In A Gang Of Dogs,A Chat With Satan,A Child Shall Lead Them,A Crack In The Ice,A Cup Angst,...,Zombie Advocate,Zombie Apocalypse,Zombie Fic,Zombie Gait,Zombie Infectee,Zombie Mooks,Zombie Puke Attack,Zombify The Living,Zonk,Zorro Mark
tt0075617,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
tt2926810,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
tt0268978,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
tt0060165,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
tt1582465,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
tt1000694,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
tt5556434,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
tt0090553,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
tt0085118,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


## Trope Analysis

In [5]:
# Column sums of the boolean one-hot frame = number of rows in which each
# trope is present; stored as a one-column frame named 'counts'.
trope_frequencies = df.sum().to_frame(name='counts')

In [6]:
# The 20 most frequently documented tropes, highest count first.
trope_frequencies['counts'].sort_values(ascending=False).head(20)

Shout Out                     1368
Horror Films                  1071
Chekhovs Gun                   781
Films Of The                   767
Oh Crap                        752
Deadpan Snarker                749
Big Bad                        697
Foreshadowing                  687
Running Gag                    627
The Cameo                      602
Large Ham                      593
Bittersweet Ending             587
Title Drop                     568
British Series                 551
What Happened To The Mouse     547
Meaningful Name                539
Hollywood Hype Machine         517
Jerkass                        509
Berserk Button                 496
Brick Joke                     461
Name: counts, dtype: int64

## Analyzing Co-occurrence

In [7]:
# Instantiate the transaction encoder.
TE = TransactionEncoder()

# Each row's list of tropes is treated as one transaction; learn the item
# vocabulary first, then encode the same transactions into a boolean matrix.
transactions = media_to_tropes_df['tropes']
encoded_media = TE.fit(transactions).transform(transactions)

# Show the encoded matrix as 0/1 integers, which is easier to read than booleans.
encoded_media.astype(int)

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [8]:
# Wrap the encoded matrix in a DataFrame, labelling the columns with the item
# names learned by the fitted encoder.
media_df = pd.DataFrame(data=encoded_media, columns=TE.columns_)

# Preview the first five rows.
media_df.head(n=5)

Unnamed: 0,A Birthday Not A Break,A Bloody Mess,A Boy A Girl And A Baby Family,A Boy And His X,A Cappella,A Cat In A Gang Of Dogs,A Chat With Satan,A Child Shall Lead Them,A Crack In The Ice,A Cup Angst,...,Zombie Advocate,Zombie Apocalypse,Zombie Fic,Zombie Gait,Zombie Infectee,Zombie Mooks,Zombie Puke Attack,Zombify The Living,Zonk,Zorro Mark
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [9]:
# Top 10 tropes by number of rows in which they appear. These are column sums
# of the one-hot frame, i.e. single-trope frequencies, not co-occurrence counts.
(
    media_df
    .sum()
    .sort_values(ascending=False)
    .head(10)
)

Shout Out          1368
Horror Films       1071
Chekhovs Gun        781
Films Of The        767
Oh Crap             752
Deadpan Snarker     749
Big Bad             697
Foreshadowing       687
Running Gag         627
The Cameo           602
dtype: int64

## Market Basket Analysis

In [10]:
# Mine frequent itemsets from the one-hot frame.
support_df = apriori(
    media_df,
    min_support=0.001,   # keep itemsets whose support is at least 0.001
    max_len=5,           # consider itemsets of at most 5 items
    use_colnames=True,   # report itemsets by column (trope) name instead of item index
    low_memory=True,     # memory-saving mode of the algorithm (see mlxtend docs for the speed trade-off)
)

# Order the itemsets from highest to lowest support.
support_df = support_df.sort_values('support', ascending=False)

# Number of items contained in each itemset.
support_df['count'] = support_df['itemsets'].map(len)

# Display the resulting table.
support_df

Unnamed: 0,support,itemsets,count
9325,0.105499,(Shout Out),1
5099,0.082594,(Horror Films),1
1854,0.060230,(Chekhovs Gun),1
3867,0.059150,(Films Of The),1
7564,0.057993,(Oh Crap),1
...,...,...,...
599089,0.001003,"(Adult Fear, Freeze Frame Bonus, All There In ...",5
599088,0.001003,"(Adult Fear, Shout Out, All There In The Scrip...",5
599087,0.001003,"(Adult Fear, Shout Out, All There In The Scrip...",5
231776,0.001003,"(Conservation Of Ninjutsu, Shout Out, Oh Crap)",3


In [13]:
# Derive association rules from the frequent itemsets, filtering on the lift
# metric with a threshold of 1.
rules_df = association_rules(support_df, metric='lift', min_threshold=1)

# Display the rules with the highest confidence first. The sorted copy is only
# shown, not stored, so rules_df keeps its original order.
rules_df.sort_values(by='confidence', ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
9625270,"(Too Dumb To Live, Oh Crap, Call Back, Precisi...",(Shout Out),0.001003,0.105499,0.001003,1.000000,9.478801,0.000897,inf,0.895399
11448464,"(Early Bird Cameo, Big Apple Sauce, And Starri...",(Superhero),0.001003,0.018046,0.001003,1.000000,55.414530,0.000984,inf,0.982940
771005,"(Promoted To Opening Credits, Promotedto Openi...",(Promoted To Opening Titles),0.001774,0.005321,0.001774,1.000000,187.927536,0.001764,inf,0.996446
771006,"(Promoted To Opening Titles, Promotedto Openin...",(Promoted To Opening Credits),0.001774,0.005244,0.001774,1.000000,190.691176,0.001764,inf,0.996523
771009,"(Promoted To Opening Credits, Promoted To Open...","(Promotedto Opening Titles, Promotionto Openin...",0.001774,0.005244,0.001774,1.000000,190.691176,0.001764,inf,0.996523
...,...,...,...,...,...,...,...,...,...,...
10681350,(Shout Out),"(Wham Line, Mass Oh Crap)",0.105499,0.001311,0.001003,0.009503,7.248495,0.000864,1.008270,0.963710
10822389,(Shout Out),"(Big Bad, Hope Spot, Big Damn Heroes)",0.105499,0.001157,0.001003,0.009503,8.214961,0.000881,1.008426,0.981855
9987287,(Shout Out),"(Wham Line, Freeze Frame Bonus, Rule Of Symbol...",0.105499,0.001003,0.001003,0.009503,9.478801,0.000897,1.008582,1.000000
10681344,(Shout Out),"(Demoted To Extra, Pet The Dog, Foreshadowing)",0.105499,0.001388,0.001003,0.009503,6.845801,0.000856,1.008193,0.954638


In [14]:
# NOTE(review): looks like a leftover debugging/progress marker -- consider deleting this cell.
print("hii")

hii


In [15]:
# Write the rules table to a CSV file in the current working directory
# (the row index is included, as per the to_csv default).
rules_df.to_csv(path_or_buf='market_basket_rules.csv')