In [1]:
import io
import networkx as nx
import netwulf as nw 
from tqdm import tqdm 

In [2]:
import sys
import os
import pandas as pd
import mmh3
import numpy as np

def shingles(string:str,q:int):
    string.replace('|','')
    string.replace(' ','')
    string.replace('/','')
    string.replace('#','')
    split = [*string.lower()]
    output = set()
    for i in range(len(split)+1):
        if i < q:
            pass
        else:
            output.add(' '.join(split[i-q:i]))
    return output

def listhash(l,seeds):
    vals = set()
    for e in l:
        val = 0
        for seed in seeds:
            val = val ^ mmh3.hash(e, seed)
        vals.add(val)
    return vals

def signatures(docs, q=9, k=20):
    sign = {}
    for key, value in docs.items():
        sign[key] = listhash(shingles(value,q=q),np.arange(k))
    return sign

def jaccard(doc1, doc2):
    doc1=set(doc1)
    doc2=set(doc2)
    intersect = doc1.intersection(doc2)
    union = doc1.union(doc2)
    if len(union) == 0:
        return 0
    return len(intersect) / len(union)

def similarity(doc1:dict,doc2:dict, accept=0.9):
    output = dict()
    for key1,value1 in doc1.items():
        similar_items = []
        for key2,value2 in doc2.items():
            jac_value = jaccard(value1,value2)
            if jac_value >= accept: #or key2 in key1:
                similar_items.append(key2)
        if len(similar_items) > 0:
            output[key1] = similar_items
    return output



In [3]:
import pandas as pd
directory = '../files/'
DC = pd.read_csv(f'{directory}dc.csv')
MARVEL = pd.read_csv(f'{directory}marvel.csv')
DC = DC[DC['WikiLink'].notna()]
MARVEL = MARVEL[MARVEL['WikiLink'].notna()]

In [4]:
DC = DC.drop('Unnamed: 0', axis=1)
MARVEL = MARVEL.drop('Unnamed: 0', axis=1)

In [5]:
MARVEL['universe'] = 'Marvel'
DC['universe'] = 'DC'

df = pd.concat([MARVEL, DC], ignore_index=True, axis=0) 

In [6]:
meta_sex = pd.read_csv('super_meta/heroesInformation.csv')
meta_sex = meta_sex.drop('Unnamed: 0', axis=1)

In [7]:
male = meta_sex[meta_sex['Gender']=='Male']['name']
female = meta_sex[meta_sex['Gender']=='Female']['name']
good = meta_sex[meta_sex['Alignment']=='good']['name']
bad = meta_sex[meta_sex['Alignment']=='bad']['name']

In [8]:
wikilinks = dict(zip(df['WikiLink'],df['WikiLink']))
q=3
k=100
wikilinks = signatures(wikilinks,q=q,k=k)
wikichars = signatures(dict(zip(df['CharacterName'],df['CharacterName'])), q=q,k=k)
male_shingles = signatures(dict(zip(male,male)),q=q,k=k)
female_shingles = signatures(dict(zip(female,female)),q=q,k=k)
good_shingles = signatures(dict(zip(good,good)),q=q,k=k)
bad_shingles = signatures(dict(zip(bad,bad)),q=q,k=k)
meta_shingles = {'male':male_shingles,'female':female_shingles,'good':good_shingles,'bad':bad_shingles}
df['asian'] = 0
df['latino'] = 0
df['woman'] = 0
df['black'] = 0
df['man'] = 0
df['good'] = 0
df['bad'] = 0
from tqdm import tqdm
for key, value in tqdm(meta_shingles.items()):
    if key == 'male':
        sim = similarity(wikilinks, value, accept=0.9)
        simchars = similarity(wikichars, value, accept=0.9)
        for i in range(len(df)):
            if df['WikiLink'][i] in list(sim.keys()) or df['CharacterName'][i] in list(simchars.keys()):
                df.loc[i,'man'] = 1
    
    
    if key == 'female':
        sim = similarity(wikilinks, value, accept=0.9)
        simchars = similarity(wikichars, value, accept=0.9)
        for i in range(len(df)):
            if df['WikiLink'][i] in list(sim.keys()) or df['CharacterName'][i] in list(simchars.keys()):
                df.loc[i,'woman'] = 1
                
    if key == 'good':
        sim = similarity(wikilinks, value, accept=0.9)
        simchars = similarity(wikichars, value, accept=0.9)
        for i in range(len(df)):
            if df['WikiLink'][i] in list(sim.keys()) or df['CharacterName'][i] in list(simchars.keys()):
                df.loc[i,'good'] = 1
                
    if key == 'bad':
        sim = similarity(wikilinks, value, accept=0.9)
        simchars = similarity(wikichars, value, accept=0.9)
        for i in range(len(df)):
            if df['WikiLink'][i] in list(sim.keys()) or df['CharacterName'][i] in list(simchars.keys()):
                df.loc[i,'bad'] = 1

100%|██████████| 4/4 [00:07<00:00,  1.86s/it]


In [9]:
meta_links = ['List of Asian superheroes', 'List of Latino superheroes', 'List of black superheroes', 'List of superheroines','List of female supervillains']

In [10]:
import json
import urllib.request
from tqdm import tqdm
from urllib.parse import quote

for i in tqdm(meta_links):
    wikipage = i.replace(" ", "_")
    if '#' in i:
        wikipage = i.split('#')[0].replace(' ','_')
        wikisection = i.split('#')[1].replace(' ','_')
        
    baseurl = 'https://en.wikipedia.org/w/api.php?'
    action = 'action=query'
    title = f'titles={quote(wikipage)}'
    content = 'prop=revisions&rvprop=content'
    dataformat = 'format=json'

    query = f'{baseurl}{action}&{title}&{content}&{dataformat}'

    wikiresponse = urllib.request.urlopen(query)
    wikidata = wikiresponse.read()
    wikitext = wikidata.decode('utf-8')
    wikijson = json.loads(wikitext)
    with open(f"meta_superheroes/{i.replace('/','=')}.txt", "w") as outfile:
        outfile.write(str(wikijson))



100%|██████████| 5/5 [00:02<00:00,  1.87it/s]


In [11]:
import re
meta = []
for source_node in meta_links:
    f = io.open('meta_superheroes/' + source_node.replace('/','=') + '.txt', 'r', encoding='utf-8').read()
    text = re.findall(r'\[\[(.*?)\]\]',f)
    meta.append(text)
    
meta = dict(zip(meta_links,meta))

In [12]:

#Super heroes
import requests
from bs4 import BeautifulSoup
males = []
male_link_list = ['https://en.wikipedia.org/w/index.php?title=Category:DC_Comics_male_superheroes&pageuntil=Tasmanian+Devil+%28DC+Comics%29#mw-pages',
                  'https://en.wikipedia.org/w/index.php?title=Category:Marvel_Comics_male_superheroes&pagefrom=Sunspot+%28Comics%29%0ASunspot+%28Marvel+Comics%29#mw-pages',
                 'https://en.wikipedia.org/w/index.php?title=Category:Marvel_Comics_male_superheroes',
                 'https://en.wikipedia.org/w/index.php?title=Category:DC_Comics_male_superheroes&pagefrom=Tasmanian+Devil+%28DC+Comics%29#mw-pages']

for i in male_link_list:
    response = requests.get(f"{i}")
    soup = BeautifulSoup(response.text, "html")
    links = (
        soup
        .find_all("a", href=True)
    )
    for link in links:
        try:
            males.append(link["title"])
        except:
            pass



    
meta['male superheroes'] = males

In [13]:

#super villains male
import requests
from bs4 import BeautifulSoup
males = []
male_link_list = ['https://en.wikipedia.org/wiki/Category:Marvel_Comics_male_supervillains',
                  'https://en.wikipedia.org/wiki/Category:DC_Comics_male_supervillains',
                 'https://en.wikipedia.org/w/index.php?title=Category:Marvel_Comics_male_supervillains&pagefrom=Nuke+%28Marvel+Comics%29#mw-pages',
                 'https://en.wikipedia.org/w/index.php?title=Category:DC_Comics_male_supervillains&pagefrom=Toyman#mw-pages']

for i in male_link_list:
    response = requests.get(f"{i}")
    soup = BeautifulSoup(response.text, "html")
    links = (
        soup
        .find_all("a", href=True)
    )
    for link in links:
        try:
            males.append(link["title"])
        except:
            pass



    
meta['male supervillains'] = males

In [14]:
#hispanic supervillains
hispanic_villains=[]
hispanic_villains_links = ['https://comicvine.gamespot.com/profile/azrael111/lists/marvel-s-hispanic-superheroes-and-supervillains/48236/',
                          'https://comicvine.gamespot.com/profile/azrael111/lists/marvel-s-hispanic-superheroes-and-supervillains/48236/?page=2',
                          'https://comicvine.gamespot.com/profile/azrael111/lists/dc-s-hispanic-superheroes-and-supervillains/48225/',
                          'https://comicvine.gamespot.com/profile/azrael111/lists/dc-s-hispanic-superheroes-and-supervillains/48225/?page=2']
for i in hispanic_villains_links:
    response = requests.get(f"{i}")
    soup = BeautifulSoup(response.text, "html")
    links = (
        soup
        .find_all("a", href=True)
    )
    for link in links:
        try:
            hispanic_villains.append(link["href"])
        except:
            pass
for i in range(len(hispanic_villains)):
    try:
        hispanic_villains[i] = hispanic_villains[i].split('/')[1]
        hispanic_villains[i] = hispanic_villains[i].replace('-',' ')
    except:
        pass
hispanic_villains = list(np.unique(hispanic_villains))
meta['latino villains'] = hispanic_villains

In [15]:
#black supervillains (not wikipedia)
#from https://en.everybodywiki.com/List_of_black_supervillains
black_villains_dc = ['Amanda Waller',
'Black Manta',
'Black Mass',
'Black Spider',
'Brick Daniel Brickwell',
'Clock King',
'Coldcast',
'Crowbar',
'Fatality',
'Flare',
'Houngan',
'Killer Croc',
'Peek-a-Boo',
'Power Ring III',
'Shadow Thief',
'Skorpio',
'Tally Man II',
'Tattooed Man III',
'Tobias Whale',
'Wunda']

black_marvel_villains = ['Advisor',
'Aries',
'Alex Wilder',
'Barracuda',
'Bedlam',
'Black Mariah',
'Bushman',
'Bushmaster',
'Cardiac',
'Chemistro III',
'Noah Black',
'Coldfire',
'Comanche',
'Condor',
'Cottonmouth',
'Cottonmouth',
'Darkoth',
'Diamondback',
'Erik Killmonger',
'Foxfire',
'Geoffrey Wilder',
'Hairbag',
'Hammer',
'Hardcore',
'Hypno-Hustler',
'King Bedlam',
'Locus',
'Man-Ape',
'Menace',
"Midnight's Fire",
'Moses Magnum',
'Nekra',
'Nightshade',
'Powerhouse',
'Prowler',
'Puff Adder',
'Rock Python',
'Rocket Racer',
'Shades',
'Slyde',
'Tempo',
'Thunderball',
'Tombstone',
'Vermin']
villains = black_marvel_villains+black_villains_dc
meta['black villains'] = villains

In [16]:

wikilinks = dict(zip(df['WikiLink'],df['WikiLink']))
q=3
k=100
wikilinks = signatures(wikilinks,q=q,k=k)
wikichars = signatures(dict(zip(df['CharacterName'],df['CharacterName'])), q=q,k=k)

for key, value in tqdm(meta.items()):
    if key == 'List of Asian superheroes':
        asians = signatures(dict(zip(meta[key],meta[key])),q=q,k=k)
        sim = similarity(wikilinks, asians)
        simchars = similarity(wikichars, asians)
        for i in range(len(df)):
            if df['WikiLink'][i] in list(sim.keys()) or df['CharacterName'][i] in list(simchars.keys()):
                df.loc[i,'asian'] = 1
    if key == 'List of Latino superheroes' or key == 'latino villains':
        latinos = signatures(dict(zip(meta[key],meta[key])),q=q,k=k)
        sim = similarity(wikilinks, latinos)
        simchars = similarity(wikichars, latinos)
        for i in range(len(df)):
            if df['WikiLink'][i] in list(sim.keys()) or df['CharacterName'][i] in list(simchars.keys()):
                df.loc[i,'latino'] = 1
    if key == 'List of black superheroes' or key == 'black villains':
        blacks = signatures(dict(zip(meta[key],meta[key])),q=q,k=k)
        sim = similarity(wikilinks, blacks)
        simchars = similarity(wikichars, blacks)
        for i in range(len(df)):
            if df['WikiLink'][i] in list(sim.keys()) or df['CharacterName'][i] in list(simchars.keys()):
                df.loc[i,'black'] = 1
    if key == 'List of superheroines' or key == 'List of female supervillains':
        women = signatures(dict(zip(meta[key],meta[key])),q=q,k=k)
        sim = similarity(wikilinks, women)
        simchars = similarity(wikichars, women)
        for i in range(len(df)):
            if df['WikiLink'][i] in list(sim.keys()) or df['CharacterName'][i] in list(simchars.keys()):
                df.loc[i,'woman'] = 1
    if key == 'male superheroes' or key == 'male supervillains':
        women = signatures(dict(zip(meta[key],meta[key])),q=q,k=k)
        sim = similarity(wikilinks, women)
        simchars = similarity(wikichars, women)
        for i in range(len(df)):
            if df['WikiLink'][i] in list(sim.keys()) or df['CharacterName'][i] in list(simchars.keys()):
                df.loc[i,'man'] = 1
    
    if key == 'male superheroes' or key == 'List of superheroines' or key == 'List of black superheroes' or key == 'List of Latino superheroes' or key == 'List of Asian superheroes':
        women = signatures(dict(zip(meta[key],meta[key])),q=q,k=k)
        sim = similarity(wikilinks, women)
        simchars = similarity(wikichars, women)
        for i in range(len(df)):
            if df['WikiLink'][i] in list(sim.keys()) or df['CharacterName'][i] in list(simchars.keys()):
                df.loc[i,'good'] = 1
    
    if key == 'male supervillains' or key == 'List of female supervillains' or key == 'black villains':
        women = signatures(dict(zip(meta[key],meta[key])),q=q,k=k)
        sim = similarity(wikilinks, women)
        simchars = similarity(wikichars, women)
        for i in range(len(df)):
            if df['WikiLink'][i] in list(sim.keys()) or df['CharacterName'][i] in list(simchars.keys()):
                df.loc[i,'bad'] = 1

100%|██████████| 9/9 [01:14<00:00,  8.32s/it]


In [17]:
df.sum(axis=0)

CharacterName    AbominationAbsorbing ManAchebeAgentAgent XAirs...
WikiLink         Abomination (character)Absorbing ManAchebe (co...
universe         MarvelMarvelMarvelMarvelMarvelMarvelMarvelMarv...
asian                                                           29
latino                                                          77
woman                                                          182
black                                                           75
man                                                            585
good                                                           468
bad                                                            404
dtype: object

In [18]:
nasians = df['asian'].sum(0)
nlatinos = df['latino'].sum(0)
nwomen = df['woman'].sum(0)
nblacks = df['black'].sum(0)
nmen = df['man'].sum(0)
ngood = df['good'].sum(0)
nbad = df['bad'].sum(0)

for idx, row in df.iterrows():
    if row['asian'] == 1 and row['latino'] == 1:
        if nasians < nlatinos:
            df.loc[idx,'latino'] = 0
            nlatinos -= 1
        else:
            df.loc[idx,'asian'] = 0
            nasians -= 1
    if row['asian'] == 1 and row['black'] == 1:
        if nasians < nblacks:
            df.loc[idx,'black'] = 0
            nblacks -= 1
        else:
            df.loc[idx,'asian'] = 0
            nasians -=1
    if row['latino'] == 1 and row['black'] == 1:
        if nlatinos < nblacks:
            df.loc[idx,'black'] = 0
            nblacks-=1
        else:
            df.loc[idx,'latino'] = 0
            nlatinos-=1
    if row['woman'] == 1 and row['man'] == 1:
        if nwomen < nmen:
            df.loc[idx,'man'] = 0
            nmen-=1
        else:
            df.loc[idx,'woman'] = 0
            nwomen-=1
    if row['good'] == 1 and row['bad'] == 1:
        if ngood < nbad:
            df.loc[idx,'bad'] = 0
            nbad-=1
        else:
            df.loc[idx,'good'] = 0
            ngood-=1

In [19]:
#write csv
df.to_csv('metadataproject.csv') 