In [1]:
from sklearn.preprocessing import StandardScaler
from yellowbrick.cluster import KElbowVisualizer
from sklearn.cluster import KMeans
from yellowbrick.cluster import SilhouetteVisualizer
from sklearn.datasets import make_blobs
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
import plotly.express as px
import pandas as pd
import numpy as np
import os

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, fpgrowth, fpmax
from mlxtend.frequent_patterns import association_rules

In [2]:
from nltk.tokenize import RegexpTokenizer
import re
import nltk

In [3]:
import ast 
nltk.download('words')
words = set(nltk.corpus.words.words())

[nltk_data] Downloading package words to /Users/rajshah/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [4]:
# Function to filter out tokens based on English dictionary words

def cleanTokens(total_tokens):
    tokens = ast.literal_eval(total_tokens)
    return [token for token in tokens if token in words]

In [5]:
bechdelScenes = pd.read_csv('Dataset/allBechdelScenes.csv')
mediaScenes = pd.read_csv('Dataset/allMediaDiversityScenes.csv')
df = pd.concat([bechdelScenes, mediaScenes])

# Define 'passesBechdel' column based on Bechdel test criteria

df['passesBechdel'] = (df['chars'] >= 2) & (df['males'] <= 1) & (df['talking_about_men'] == False)
# Apply token cleaning function to 'total_tokens' column

df['total_tokens'] = df.apply(lambda x: cleanTokens(x.total_tokens), axis=1)
df.sample(5)

Unnamed: 0,chars,females,males,nbs,unknowns,total_tokens,total_others,talking_about_men,passesBechdel
30654,3,0,3,0,0,"[sir, find, data, ship, main, unusual, momenta...","['computer', 'data']",True,False
5758,1,0,1,0,0,"[name, picked, ever, particularly, quickly, re...","['halliday', 'her', 'she']",True,False
42734,4,2,2,0,0,"[block, river, might, find, hole, two, data, m...","['anderson', 'maya']",True,False
40199,2,1,1,0,0,"[kibble, near, nowhere, feed, big, cat, escort...","['he', 'her', 'his']",True,False
15505,1,1,0,0,0,"[give, gun]",[],False,False


In [6]:
pass_bechdel_scenes = df[df.passesBechdel]
fail_bechdel_scenes = df[df.passesBechdel == False]

In [None]:
pass_tokens = pass_bechdel_scenes['total_tokens'].tolist()
fail_tokens = fail_bechdel_scenes['total_tokens'].tolist()
# Perform Association Rule Mining on scenes that pass Bechdel test

te = TransactionEncoder()
te_ary1 = te.fit(pass_tokens).transform(pass_tokens)
pass_transactions = pd.DataFrame(te_ary1, columns=te.columns_)

In [12]:
frequent_itemsets = apriori(pass_transactions, min_support=0.01, use_colnames=True)

In [25]:
frequent_itemsets = frequent_itemsets[frequent_itemsets['itemsets'].str.len() > 1].reset_index(drop=True)
frequent_itemsets = frequent_itemsets.sort_values(by='support', ascending=False).reset_index(drop=True)
frequent_itemsets.head(10)

Unnamed: 0,index,support,itemsets
0,1433,0.102595,"(like, know)"
1,982,0.087376,"(know, get)"
2,1508,0.08194,"(know, think)"
3,1525,0.079766,"(know, want)"
4,1466,0.070254,"(know, one)"
5,988,0.069303,"(like, get)"
6,1680,0.068759,"(like, think)"
7,1619,0.064954,"(like, look)"
8,1480,0.064683,"(right, know)"
9,1694,0.064411,"(like, want)"


> Before we go further, let's remove very common words from the dataset. We'll be using the advice given here: https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24

In [7]:
# Token cleaning for more meaningful analysis

df['text'] = df.apply(lambda x: (' ').join(x.total_tokens), axis = 1)
df.sample(5)

Unnamed: 0,chars,females,males,nbs,unknowns,total_tokens,total_others,talking_about_men,passesBechdel,text
6212,1,0,1,0,0,"[guy, hard, deck, watch, interrupt, pull, alt,...",['rooster'],True,False,guy hard deck watch interrupt pull alt coming ...
5113,2,0,2,0,0,"[since, hour, come, perfect, today, much, went...",[],False,False,since hour come perfect today much went work g...
24203,3,1,2,0,0,"[get, die, run, see, bitch, away]","['raizo', 'mika']",True,False,get die run see bitch away
39410,2,1,1,0,0,"[two, find, last, companion, king, three, book...","['she', 'his', 'her', 'ofelia', 'he']",True,False,two find last companion king three book echo s...
17372,5,0,4,1,0,"[two, last, hope, would, whilst, math, crack, ...","['fischer', 'ariadne']",True,False,two last hope would whilst math crack three ef...


In [8]:
text_vals = df.text.values
countVect = CountVectorizer()
countVect = countVect.fit(text_vals)
X_cv = countVect.transform(text_vals)

In [10]:
pdCV = pd.DataFrame(X_cv.toarray(), columns=countVect.get_feature_names_out())
pdCV[pdCV > 0] = 1
pdCV = pdCV.loc[:,(pdCV.sum(axis=0) > 15)]
pdCV = pdCV.loc[:,(pdCV.sum(axis=0) < pdCV.shape[0]/5)]
good_tokens = set(pdCV.columns.values)


In [9]:
pdCV = pd.read_csv('pdCV.csv', index_col=False)
good_tokens = set(pdCV.columns.values)

  return func(*args, **kwargs)


In [17]:
pdCV.sample(5).to_csv('pdCV.csv', index_label=False)

In [10]:
# Token cleaning function for scenes failing Bechdel test

def cleanerTokens(tokens):
    return [token for token in tokens if token in good_tokens]

df['scrubbed_tokens'] = df.apply(lambda x: cleanerTokens(x.total_tokens), axis=1)

In [11]:
pass_bechdel_scenes = df[df.passesBechdel]
fail_bechdel_scenes = df[df.passesBechdel == False]

In [12]:
pass_bechdel_scenes.shape

(7359, 11)

In [13]:
fail_bechdel_scenes = fail_bechdel_scenes.sample(pass_bechdel_scenes.shape[0])

In [14]:
pass_tokens = pass_bechdel_scenes['scrubbed_tokens'].tolist()
fail_tokens = fail_bechdel_scenes['scrubbed_tokens'].tolist()


In [21]:
# Association Rule Mining on scenes failing Bechdel test

te = TransactionEncoder()
te_ary1 = te.fit(pass_tokens).transform(pass_tokens)
pass_transactions = pd.DataFrame(te_ary1, columns=te.columns_)

frequent_itemsets = apriori(pass_transactions, min_support=0.01, use_colnames=True)

In [22]:
frequent_itemsets = frequent_itemsets[frequent_itemsets['itemsets'].str.len() > 1].reset_index(drop=True)
frequent_itemsets = frequent_itemsets.sort_values(by='support', ascending=False).reset_index(drop=True)
frequent_itemsets.head(10)

Unnamed: 0,support,itemsets
0,0.052317,"(want, think)"
1,0.043756,"(right, think)"
2,0.043484,"(want, right)"
3,0.042533,"(want, going)"
4,0.042261,"(going, think)"
5,0.041854,"(well, think)"
6,0.041582,"(want, got)"
7,0.039815,"(want, see)"
8,0.039272,"(think, good)"
9,0.039272,"(right, got)"


This is much better - we can see more interesting results. Let's look at scenes that failed the Bechdel test.

In [15]:
te = TransactionEncoder()
te_ary2 = te.fit_transform(fail_tokens)
fail_transactions = pd.DataFrame(te_ary2, columns=te.columns_)

In [16]:
frequent_itemsets_fail = apriori(fail_transactions, min_support=0.05, use_colnames=True)
frequent_itemsets_fail = frequent_itemsets_fail[frequent_itemsets_fail['itemsets'].str.len() > 1].reset_index(drop=True)
frequent_itemsets_fail = frequent_itemsets_fail.sort_values(by='support', ascending=False).reset_index(drop=True)
frequent_itemsets_fail

Unnamed: 0,support,itemsets
0,0.077320,"(want, right)"
1,0.077184,"(want, think)"
2,0.075282,"(right, think)"
3,0.074874,"(right, got)"
4,0.073108,"(want, got)"
...,...,...
131,0.050414,"(could, would)"
132,0.050143,"(got, man)"
133,0.050007,"(take, thing)"
134,0.050007,"(make, going)"


In [28]:
association_pass = association_rules(frequent_itemsets, metric='confidence', min_threshold=0.001, support_only=True)
association_pass.sort_values(by='support', ascending=False).head(20)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(want),(think),,,0.052317,,,,,
1,(think),(want),,,0.052317,,,,,
2,(right),(think),,,0.043756,,,,,
3,(think),(right),,,0.043756,,,,,
4,(want),(right),,,0.043484,,,,,
5,(right),(want),,,0.043484,,,,,
6,(want),(going),,,0.042533,,,,,
7,(going),(want),,,0.042533,,,,,
8,(going),(think),,,0.042261,,,,,
9,(think),(going),,,0.042261,,,,,


In [29]:
association_fail = association_rules(frequent_itemsets_fail, metric='confidence', min_threshold=0.001, support_only=True)
association_fail.sort_values(by='support', ascending=False).head(20)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(want),(right),,,0.07732,,,,,
1,(right),(want),,,0.07732,,,,,
2,(want),(think),,,0.077184,,,,,
3,(think),(want),,,0.077184,,,,,
4,(right),(think),,,0.075282,,,,,
5,(think),(right),,,0.075282,,,,,
6,(right),(got),,,0.074874,,,,,
7,(got),(right),,,0.074874,,,,,
8,(want),(got),,,0.073108,,,,,
9,(got),(want),,,0.073108,,,,,


> Market Basket Analysis doesn't show us much - we are only able to see very common words still. Next step would be to do it by movies instead of scenes to see if that gives us better data.

In [30]:
# Next step: Analyzing by movies instead of scenes

df = [] # Initializing empty list to store scene data
for f in os.listdir('Dataset/allScenes'):
    if not f.endswith('.csv'):
        continue
    filename = f.split('_scenes.csv')[0]
    scenes = pd.read_csv('Dataset/allScenes/' + f, index_col=False).total_tokens.values
    full_tokens = []
    full_scenes = []

    for scene in scenes:
        scene_x = ast.literal_eval(scene)
        full_scenes.append([token for token in scene_x if token in words])
        full_tokens += scene_x
    full_tokens = [token for token in full_tokens if token in good_tokens]
    df.append({
        'movie_filename': filename,
        'tokens': full_tokens,
        'text': (' ').join(full_tokens),
        'scenes': full_scenes
    })
df = pd.DataFrame(df)

In [31]:
df

Unnamed: 0,movie_filename,tokens,text,scenes
0,Easy-A,"[go, two, town, every, video, confess, make, a...",go two town every video confess make account o...,"[[go, two, town, neighboring, every, video, co..."
1,Killers-Of-The-Flower-Moon-Read-The-Screenplay,"[sacred, teaching, white, bury, gave, grandfat...",sacred teaching white bury gave grandfather ta...,"[[sacred, teaching, white, bury, gave, pah, gr..."
2,Cast-Away,"[pretty, gas, way, cut, mountain, filter, engi...",pretty gas way cut mountain filter engine gues...,"[[pretty], [gas, way, get, cut, mountain, filt..."
3,Ghost-Ship,"[work, cabin, mind, friendship, find, main, ta...",work cabin mind friendship find main talk grad...,"[[work, cabin, mind, friendship, find, know, m..."
4,Downsizing,"[afraid, happen, make, right, born, thing, old...",afraid happen make right born thing old give f...,"[[afraid, know, happen, make, right, born, thi..."
...,...,...,...,...
791,Bourne-Ultimatum-The,"[radio, give, argument, gun, would, last, ago,...",radio give argument gun would last ago pam thr...,"[[radio, give, argument, gun], [would, last, a..."
792,Happy-Go-Lucky,"[bit, dance, make, test, holding, celebrate, c...",bit dance make test holding celebrate child cr...,"[[bit, dance, make, test, framing, holding, ce..."
793,Blind-Side-The,"[investigate, trouble, bit, find, granger, fil...",investigate trouble bit find granger file wind...,"[[investigate], [trouble, bit, find, know, gra..."
794,Croods-The,"[find, last, hope, would, every, three, fun, n...",find last hope would every three fun neighbor ...,"[[find, last, hope, would, every, three, fun, ..."


In [35]:
# Token filtering and cleaning for movie-level analysis

text_vals = df.text.values
countVect = CountVectorizer()
countVect = countVect.fit(text_vals)
X_cv = countVect.transform(text_vals)
pdCV = pd.DataFrame(X_cv.toarray(), columns=countVect.get_feature_names_out())
pdCV[pdCV > 0] = 1
pdCV = pdCV.loc[:,(pdCV.sum(axis=0) > 15)]
pdCV = pdCV.loc[:,(pdCV.sum(axis=0) < pdCV.shape[0]/2)]
good_tokens = set(pdCV.columns.values)

def cleanerTokens(tokens):
    return [token for token in tokens if token in good_tokens]

df['scrubbed_tokens'] = df.apply(lambda x: cleanerTokens(x.tokens), axis=1)

In [36]:
df

Unnamed: 0,movie_filename,tokens,text,scenes,scrubbed_tokens
0,Easy-A,"[go, two, town, every, video, confess, make, a...",go two town every video confess make account o...,"[[go, two, town, neighboring, every, video, co...","[video, confess, account, occasional, record, ..."
1,Killers-Of-The-Flower-Moon-Read-The-Screenplay,"[sacred, teaching, white, bury, gave, grandfat...",sacred teaching white bury gave grandfather ta...,"[[sacred, teaching, white, bury, gave, pah, gr...","[sacred, teaching, bury, grandfather, taught, ..."
2,Cast-Away,"[pretty, gas, way, cut, mountain, filter, engi...",pretty gas way cut mountain filter engine gues...,"[[pretty], [gas, way, get, cut, mountain, filt...","[gas, mountain, filter, engine, dirty, fuel, s..."
3,Ghost-Ship,"[work, cabin, mind, friendship, find, main, ta...",work cabin mind friendship find main talk grad...,"[[work, cabin, mind, friendship, find, know, m...","[cabin, friendship, main, graduate, voyage, qu..."
4,Downsizing,"[afraid, happen, make, right, born, thing, old...",afraid happen make right born thing old give f...,"[[afraid, know, happen, make, right, born, thi...","[born, colleague, agree, impressive, pleasure,..."
...,...,...,...,...,...
791,Bourne-Ultimatum-The,"[radio, give, argument, gun, would, last, ago,...",radio give argument gun would last ago pam thr...,"[[radio, give, argument, gun], [would, last, a...","[radio, argument, pam, confirmed, tape, locati..."
792,Happy-Go-Lucky,"[bit, dance, make, test, holding, celebrate, c...",bit dance make test holding celebrate child cr...,"[[bit, dance, make, test, framing, holding, ce...","[dance, test, celebrate, cross, text, apart, h..."
793,Blind-Side-The,"[investigate, trouble, bit, find, granger, fil...",investigate trouble bit find granger file wind...,"[[investigate], [trouble, bit, find, know, gra...","[investigate, file, investigator, evidence, od..."
794,Croods-The,"[find, last, hope, would, every, three, fun, n...",find last hope would every three fun neighbor ...,"[[find, last, hope, would, every, three, fun, ...","[neighbor, fat, instant, forever, breakfast, b..."
