### data exploration

In [2]:
import json
import gzip
import gc

# memes_data = "../../sourcedata/kym.json"
# memes_data_spotlight = "../../sourcedata/kym_spotlight.json"
# memes_data_vision = "../../sourcedata/kym_vision.json"
memes_data = "../../sourcedata/kym.json.gz"
memes_data_spotlight = "../../sourcedata/kym_spotlight.json.gz"
memes_data_vision = "../../sourcedata/kym_vision.json.gz"

def load_data(file):
    if file.endswith('.json'):
        with open(file, 'r', encoding='utf-8') as f:
            data = json.load(f)
    elif file.endswith('.gz'):
        # https://stackoverflow.com/questions/39450065/python-3-read-write-compressed-json-objects-from-to-gzip-file
        with gzip.open(file, 'r') as f:
            json_bytes = f.read()
            json_str = json_bytes.decode('utf-8')
            data = json.loads(json_str)
    elif file.endswith('.zip'):
        pass
        
    return data

def free_mem(d, ds, dv):
    del d
    del ds
    del dv
    gc.collect()
    
try:
    free_mem(data, data_spotlight, data_vision)
except:
    pass
            
data = load_data(memes_data)
data_spotlight = load_data(memes_data_spotlight)
data_vision = load_data(memes_data_vision)

x = ["Meme data", "Spotlight data", "Vision data"]
for i, element in enumerate([data, data_spotlight, data_vision]):
    print(f"{x[i]} if of type {type(element)}")
    print(f"{x[i]} has {len(element)} elements")



Meme data if of type <class 'list'>
Meme data has 28799 elements
Spotlight data if of type <class 'dict'>
Spotlight data has 7603 elements
Vision data if of type <class 'dict'>
Vision data has 16686 elements


### Some encoding checks

In [3]:
for _dict in data:  # for dict in data list
    print(_dict['title'])



This is Relevant To My Interests
ROFLcopter
Bitches Don't Know
Leave Britney Alone
O RLY?
Yatta!
In Soviet Russia...
Domo
I Like Turtles
Diabeetus
Star Wars Kid
A Winner Is You
G.I. Joe PSAs
Recession Raven
Dramatic Chipmunk
One Does Not Simply Walk Into Mordor
Hasbro
Viral Videos
You Have My Sword, and My Bow, and My Axe
Lord Of The Rings
Creepy Katara
PopoZão
Viral Videos
Bus Uncle (巴士阿叔)
Animals
Konami Code
Eli Porter
Cornify
Avatar: The Last Airbender / The Legend of Korra
Rule 34
Make Your Own Album Cover
Album Cover Parodies
Translator San / He Just Doesn't Give a Fuck
E3 Sony 2006 / Giant Enemy Crab
Electronic Entertainment Expo (E3)
December 21st, 2012
Electronic Entertainment Expo (E3)
Nyoro~n
Furries
Cockmongler
YTMND
EPIC Maneuvers
Yes! Yes!
Street Fighter
Sleeveface
Keyboard Cat
The Melancholy of Haruhi Suzumiya
Extreme Advertising
Cats
Otaku
Snel hest
Demotivational Posters
Download More RAM
LOLspeak / Chanspeak
Do It Faggot
Julesoogle Gets Hacked
Bill Cosby
Squidward Fad 

### Main keys in the dataset

In [4]:
_main_keys = {}

for _dict in data: # for dict in data list
    keys = _dict.keys()
    for key in keys:
        if key not in _main_keys:
            _main_keys[key] = {'count':1}
            _main_keys[key].update({'types': {}})
        else:
            _main_keys[key]['count'] += 1
            
        # types
        _type = type(_dict[key])
        if _type not in _main_keys[key]['types']:
            _main_keys[key]['types'].update({_type: 1})
        else:
            _main_keys[key]['types'][_type] += 1

print("\nMain dict:\n")
total = 0
for key in _main_keys:
    print(key, ":", _main_keys[key])
    total += 1
print("\nTotal keys:", total)



Main dict:

title : {'count': 28799, 'types': {<class 'str'>: 28799}}
url : {'count': 28799, 'types': {<class 'str'>: 28799}}
last_update_source : {'count': 28799, 'types': {<class 'int'>: 28799}}
category : {'count': 28799, 'types': {<class 'str'>: 28799}}
template_image_url : {'count': 28799, 'types': {<class 'str'>: 28799}}
meta : {'count': 28799, 'types': {<class 'dict'>: 28799}}
ld : {'count': 28798, 'types': {<class 'dict'>: 28798}}
added : {'count': 28606, 'types': {<class 'int'>: 28606}}
details : {'count': 28799, 'types': {<class 'dict'>: 28799}}
content : {'count': 15406, 'types': {<class 'dict'>: 15406}}
tags : {'count': 15406, 'types': {<class 'list'>: 15406}}
additional_references : {'count': 15406, 'types': {<class 'dict'>: 15406}}
search_keywords : {'count': 19539, 'types': {<class 'list'>: 19539}}
parent : {'count': 12203, 'types': {<class 'str'>: 12203}}
siblings : {'count': 12203, 'types': {<class 'list'>: 12203}}
children : {'count': 14161, 'types': {<class 'list'>:

### Sampling Memes

In [5]:
import random

def sample_meme(data=data, title=None, category="Meme", status='Confirmed', details_type=None):
    if title:
        for i, _dict in enumerate(data): 
            if _dict['title'] == title:
                return data[i]
    else:
        meme = random.choice(data)
        while meme['category'] != category:
            meme = random.choice(data)
        if details_type:
            while (meme['category'] != category and len(meme['details']) < 5):
                meme = random.choice(data)
        return meme

def sample_augmentation(data):
    meme = sample_meme()
    meme_vision = None
    while meme_vision is None:
        try:
            meme_vision = data[meme['url']]
        except KeyError:
            meme = sample_meme()
            
    return {meme['url']: meme_vision}


# sample_meme(title="Viral Videos")
# meme_vision = sample_augmentation(data_vision)
# sample_meme(category="Meme", details_type=True)
# sample_meme(category="Meme")

meme_vision = sample_augmentation(data_spotlight)
display(meme_vision)

# meme_spotlight = sample_augmentation(data_spotlight)
# display(meme_spotlight)



{'https://knowyourmeme.com/memes/memes': {'@text': 'Memes are broadly defined as culturally transmitted information or ideas and beliefs that can be spread from one organism, or group of organisms, to another.[2] A key component to the meme concept is that the information is able to self-replicate, and in turn undergoes a type of natural selection, much like biological genes and viruses.',
  '@confidence': '0.5',
  '@support': '0',
  '@types': '',
  '@sparql': '',
  '@policy': 'whitelist',
  'Resources': [{'@URI': 'http://dbpedia.org/resource/Dual_inheritance_theory',
    '@support': '101',
    '@types': '',
    '@surfaceForm': 'culturally transmitted',
    '@offset': '29',
    '@similarityScore': '0.9999999908891937',
    '@percentageOfSecondRank': '9.110827480847145E-9'},
   {'@URI': 'http://dbpedia.org/resource/Meme',
    '@support': '1109',
    '@types': '',
    '@surfaceForm': 'meme',
    '@offset': '184',
    '@similarityScore': '0.9999999974805291',
    '@percentageOfSecondRank'

### Categories

In [6]:
_categories = {}

for _dict in data: # for dict in data list
    keys = _dict.keys()
    for key in keys:
    
        # categories
        if key == 'category':
            if _dict[key] not in _categories:
                _categories[_dict[key]] = 1
            else:
                _categories[_dict[key]] += 1

print("\nCategories:\n")
total = 0
for key in _categories:
    print(key, ":", _categories[key])
    total += int(_categories[key])
print("\nTotal:", total)




Categories:

Meme : 15406
Subculture : 5525
Event : 1957
Culture : 2583
Site : 1402
Person : 1926

Total: 28799


In [26]:
# sample_meme()
sample_meme(title="Fan Art")

# {'title': 'I Have Drawn You',


{'title': 'Fan Art',
 'url': 'https://knowyourmeme.com/memes/cultures/fan-art',
 'last_update_source': 1620426647,
 'category': 'Culture',
 'template_image_url': 'https://i.kym-cdn.com/entries/icons/original/000/011/241/4234234434.jpg',
 'meta': {'og:title': 'Fan Art',
  'og:site_name': 'Know Your Meme',
  'og:image': 'https://i.kym-cdn.com/entries/icons/facebook/000/011/241/4234234434.jpg',
  'og:image:width': '600',
  'og:image:height': '315',
  'og:type': 'article',
  'fb:app_id': '104675392961482',
  'fb:pages': '88519108736',
  'article:publisher': 'https://www.facebook.com/knowyourmeme',
  'twitter:card': 'summary_large_image',
  'twitter:site': '@knowyourmeme',
  'twitter:creator': '@knowyourmeme',
  'twitter:title': 'Fan Art',
  'twitter:description': 'Fan Art is artwork that is based on a character or object from a well known media subject, that was created by someone other than the creator of said media subject. It comes in a large variety of designs and styles, many which ar

### Meta

In [8]:
_meta = {}
_meta2 = {}
_meta3 = {}

for _dict in data: # for dict in data list

    # meta
    for key in _dict['meta']:
        if key not in _meta3:
            _meta3[key] = 1
        else:
            _meta3[key] += 1
                
    key = 'meta'

    if _dict[key]['og:site_name'] not in _meta:
        _meta[_dict[key]['og:site_name']] = 1
    else:
        _meta[ _dict[key]['og:site_name']] += 1

    if _dict[key]['og:type'] not in _meta2:
        _meta2[_dict[key]['og:type']] = 1
    else:
        _meta2[_dict[key]['og:type']] += 1

print("\nMeta:\n")
for key in _meta:
    print(key, ":", _meta[key])
for key in _meta2:
    print(key, ":", _meta2[key])

print("\nMeta Fields:\n")

for key in _meta3:
    print(key, ":", _meta3[key])



Meta:

Know Your Meme : 28799
article : 28799

Meta Fields:

og:title : 28799
og:site_name : 28799
og:image : 28799
og:image:width : 28799
og:image:height : 28799
og:type : 28799
fb:app_id : 28799
fb:pages : 28799
article:publisher : 28799
twitter:card : 28799
twitter:site : 28799
twitter:creator : 28799
twitter:title : 28799
twitter:description : 28779
twitter:image : 28799
description : 28779
og:url : 28799
og:description : 28779


### Sets

In [9]:
_sets = {}

for _dict in data: # for dict in data list
    keys = _dict.keys()
    for key in keys:
                    
        # sets
        if key in ['title', 'url', 'template_image_url', 'parent']:
            if key not in _sets:
                _sets[key] = {'non_unique': [], 'unique':0}
                _sets[key]['non_unique'].append(_dict[key])
            else:
                _sets[key]['non_unique'].append(_dict[key])
                
for key in ['title', 'url', 'template_image_url', 'parent']:
    non_uniq = len(_sets[key]['non_unique'])
    uniq = len(set(_sets[key]['non_unique']))
    _sets[key]['non_unique'] = non_uniq
    _sets[key]['unique'] = uniq

print("\nSets:\n")
for key in _sets:
    print(key, ":", _sets[key])



Sets:

title : {'non_unique': 28799, 'unique': 16713}
url : {'non_unique': 28799, 'unique': 16713}
template_image_url : {'non_unique': 28799, 'unique': 16713}
parent : {'non_unique': 12203, 'unique': 1575}


In [10]:
sample_meme(title="Dogs")


{'title': 'Dogs',
 'url': 'https://knowyourmeme.com/memes/subcultures/dogs',
 'last_update_source': 1620733657,
 'category': 'Subculture',
 'template_image_url': 'https://i.kym-cdn.com/entries/icons/original/000/010/346/gdggfjjgfjgfgg.png',
 'meta': {'og:title': 'Dogs',
  'og:site_name': 'Know Your Meme',
  'og:image': 'https://i.kym-cdn.com/entries/icons/facebook/000/010/346/gdggfjjgfjgfgg.jpg',
  'og:image:width': '600',
  'og:image:height': '315',
  'og:type': 'article',
  'fb:app_id': '104675392961482',
  'fb:pages': '88519108736',
  'article:publisher': 'https://www.facebook.com/knowyourmeme',
  'twitter:card': 'summary_large_image',
  'twitter:site': '@knowyourmeme',
  'twitter:creator': '@knowyourmeme',
  'twitter:title': 'Dogs',
  'twitter:description': 'The dog is a furry, omnivorous mammal widely regarded as the first animal to have been domesticated by humans to assist in field labor, such as herding livestock and hunting, as well as to serve as a companion. Along with the c

### Details

In [11]:
_status = {}
_year = {}
_origin = {}
_type = {}

for _dict in data:  # for dict in data list
    keys = _dict.keys()
    for key in keys:

        # details
        if key == 'details':
            
            for subkey in _dict[key]:
                if subkey == 'status':
                    if _dict[key][subkey] not in _status:
                        _status[_dict[key][subkey]] = 1
                    else:
                        _status[_dict[key][subkey]] += 1
                if subkey == 'year':
                    if _dict[key][subkey] not in _year:
                        _year[_dict[key][subkey]] = 1
                    else:
                        _year[_dict[key][subkey]] += 1
                if subkey == 'origin':
                    if _dict[key][subkey] not in _origin:
                        _origin[_dict[key][subkey]] = 1
                    else:
                        _origin[_dict[key][subkey]] += 1
                if subkey == 'type':
                    for element in _dict[key][subkey]:
                        if element not in _type:
                            _type[element] = 1
                        else:
                            _type[element] += 1


print("\nDetails:\n")
print("\nStatus:\n")
for key in _status:
    print(key, ":", _status[key])
    
print("\nOrigin:\n")
display(dict(sorted(_origin.items(), key=lambda item: item[1], reverse=True)))

print("\nYear:\n")
display(dict(sorted(_year.items(), key=lambda item: item[1], reverse=True)))

print("\nType:\n")
display(dict(sorted(_type.items(), key=lambda item: item[1], reverse=True)))





Details:


Status:

confirmed : 11888
deadpool : 5098
submission : 11807
unlisted : 6

Origin:



{'Unknown': 4141,
 'YouTube': 1774,
 '4chan': 1382,
 'Twitter': 895,
 'ARPANET': 556,
 'Tumblr': 554,
 'Reddit': 478,
 'Facebook': 457,
 'France': 358,
 'United States': 338,
 'The Selfish Gene': 330,
 'United States of America': 301,
 'Fusajiro Yamauchi': 295,
 'Something Awful': 259,
 'niconico': 219,
 'Japan': 213,
 'Nintendo': 200,
 'Usenet': 192,
 'Walt and Roy Disney': 157,
 'Christopher Poole': 157,
 'Youtube': 148,
 'Witches and Stitches': 140,
 'USA': 138,
 'Viacom Media Networks': 135,
 'Advice Dog': 135,
 'Queens, New York City, USA': 120,
 'Chad Hurley, Steve Chen, Jawed Karim': 119,
 'Africa': 116,
 'Jack Dorsey': 109,
 'DeviantArt': 107,
 'YTMND': 106,
 'Nintendo; Game Freak; Satoshi Tajiri': 104,
 'Instagram': 103,
 'Gabe Newell and Mike Harrington': 101,
 'Hassenfeld Brothers': 94,
 'Valve': 94,
 'The Internet': 86,
 'Vine': 86,
 'Time Warner; Turner Broadcasting System': 84,
 'Urban Dictionary': 84,
 'Stephen Hillenburg': 83,
 '2channel': 79,
 'Usenet / Internet': 78,



Year:



{None: 3177,
 '2011': 2283,
 '2010': 1818,
 '2012': 1732,
 '2016': 1432,
 '2013': 1186,
 '2006': 1118,
 '2015': 1109,
 '2014': 1069,
 '2009': 1056,
 '2007': 1002,
 '2008': 926,
 '2017': 774,
 '2003': 720,
 '2004': 662,
 '2005': 627,
 '1960': 606,
 '1999': 437,
 '1996': 395,
 '2001': 381,
 '1976': 371,
 '1908': 342,
 '1889': 304,
 '1991': 279,
 '1923': 254,
 '2002': 248,
 '1977': 237,
 '1985': 235,
 '1998': 229,
 '1997': 212,
 '2000': 196,
 '1987': 181,
 '1992': 180,
 '1994': 175,
 '1995': 139,
 '1980': 131,
 '1910': 128,
 '1969': 128,
 '1946': 125,
 '1984': 113,
 '1986': 112,
 '1993': 111,
 '1982': 109,
 '1981': 103,
 '1990': 86,
 '1975': 79,
 '1776': 69,
 '1983': 64,
 '1961': 64,
 '1988': 56,
 '1989': 52,
 '1935': 51,
 '1863': 49,
 '1962': 46,
 '1947': 46,
 '1920': 46,
 '1979': 45,
 '1891': 44,
 '1971': 42,
 '1940': 41,
 '1939': 40,
 '1970': 34,
 '1978': 31,
 '1869': 29,
 '1963': 28,
 '1967': 25,
 '1753': 25,
 '1966': 22,
 '1839': 22,
 '1954': 21,
 '1955': 21,
 '1792': 19,
 '1949': 18


Type:



{'https://knowyourmeme.com/types/company': 1246,
 'https://knowyourmeme.com/types/video-game': 1172,
 'https://knowyourmeme.com/types/image-macro': 961,
 'https://knowyourmeme.com/types/slang': 715,
 'https://knowyourmeme.com/types/tv-show': 709,
 'https://knowyourmeme.com/types/exploitable': 703,
 'https://knowyourmeme.com/types/art': 697,
 'https://knowyourmeme.com/types/catchphrase': 681,
 'https://knowyourmeme.com/types/technology': 639,
 'https://knowyourmeme.com/types/viral-video': 616,
 'https://knowyourmeme.com/types/election': 513,
 'https://knowyourmeme.com/types/character': 489,
 'https://knowyourmeme.com/types/photoshop': 479,
 'https://knowyourmeme.com/types/film': 478,
 'https://knowyourmeme.com/types/cartoon': 475,
 'https://knowyourmeme.com/types/fauna': 441,
 'https://knowyourmeme.com/types/hashtag': 412,
 'https://knowyourmeme.com/types/social-network': 409,
 'https://knowyourmeme.com/types/animal': 400,
 'https://knowyourmeme.com/types/media-host': 397,
 'https://kno

### sample


In [12]:
sample_meme()


{'title': 'TopKnotKnack',
 'url': 'https://knowyourmeme.com/memes/topknotknack',
 'last_update_source': 1326425078,
 'category': 'Meme',
 'template_image_url': 'https://i.kym-cdn.com/entries/icons/original/000/008/313/Topknotknack.jpg',
 'meta': {'og:title': 'TopKnotKnack',
  'og:site_name': 'Know Your Meme',
  'og:image': 'https://i.kym-cdn.com/entries/icons/facebook/000/008/313/Topknotknack.jpg',
  'og:image:width': '600',
  'og:image:height': '315',
  'og:type': 'article',
  'fb:app_id': '104675392961482',
  'fb:pages': '88519108736',
  'article:publisher': 'https://www.facebook.com/knowyourmeme',
  'twitter:card': 'summary_large_image',
  'twitter:site': '@knowyourmeme',
  'twitter:creator': '@knowyourmeme',
  'twitter:title': 'TopKnotKnack',
  'twitter:description': "Instagram…. i'm not who started it – although the instagram blog claims it was this girl: http://web.stagram.com/n/racquelmishel/ I discovered it here: http",
  'twitter:image': 'https://i.kym-cdn.com/entries/icons/fa

### Content

In [13]:
_content = {}
_about = {}

for _dict in data:  # for dict in data list
    keys = _dict.keys()
    for key in keys:

        # content
        if key == 'content':
            for subkey in _dict[key]:
                
                # content
                if subkey not in _content:
                    _content[subkey] = 1
                else:
                    _content[subkey] += 1

                # about

display(dict(sorted(_content.items(), key=lambda item: item[1], reverse=True)))
# display(dict(sorted(_about.items(), key=lambda item: item[1], reverse=True)))


{'about': 10332,
 'origin': 9368,
 'external references': 8862,
 'spread': 8802,
 'search interest': 8330,
 'various examples': 2802,
 'notable examples': 2674,
 'related memes': 913,
 'types': 371,
 'academic research': 371,
 'dictionary recognition': 371,
 'derivatives': 341,
 'related sites': 331,
 'related subcultures': 330,
 'examples': 205,
 'history': 190,
 'periodic table of advice animals': 133,
 'template': 121,
 'references': 117,
 'usage': 104,
 'origins': 88,
 'search': 75,
 'popularity': 66,
 'background': 58,
 'videos': 53,
 'notable derivatives': 45,
 'impact': 43,
 'templates': 42,
 'notable images': 40,
 'notable videos': 40,
 'external references.': 39,
 'parodies': 35,
 'online presence': 34,
 'types of ironic memes': 34,
 'reception': 30,
 'in popular culture': 28,
 'google insights': 26,
 'overview': 26,
 'various exploitables': 25,
 'external reference': 25,
 'notable developments': 22,
 'criticism': 22,
 'highlights': 21,
 'origin:': 20,
 'external links': 20,
 

### Tags

In [14]:
_tags = {}

for _dict in data:  # for dict in data list
    keys = _dict.keys()
    for key in keys:

        # tags
        if key == 'tags':
            for element in _dict[key]:
                # tags
                if element not in _tags:
                    _tags[element] = 1
                else:
                    _tags[element] += 1

display(dict(sorted(_tags.items(), key=lambda item: item[1], reverse=True)))


{'4chan': 1036,
 'youtube': 948,
 'meme': 867,
 'image macro': 709,
 'slang': 682,
 'catchphrase': 594,
 'twitter': 489,
 'japan': 478,
 'exploitable': 473,
 'tumblr': 461,
 'memes': 451,
 'online behaviors': 446,
 'lol': 436,
 'photoshop': 414,
 'language': 398,
 '1980s': 386,
 'dictionary': 377,
 'anime': 375,
 'science': 370,
 'knowyourmeme': 336,
 'maymay': 335,
 'meme overload': 334,
 'encyclopedia dramatica': 332,
 'meme magic': 332,
 'richard dawkins': 331,
 'memeception': 330,
 'meme within a meme': 330,
 'valeri syutkin': 330,
 'the selfish gene': 330,
 'replicate': 330,
 'mimema': 330,
 'image': 329,
 'cat': 322,
 'music': 314,
 'parody': 302,
 'video': 295,
 'niconico douga': 273,
 'reddit': 272,
 'reaction image': 245,
 'funny': 241,
 'facebook': 239,
 'remix': 233,
 'macro': 230,
 'non-english': 221,
 'copypasta': 218,
 'politics': 215,
 'lolcat': 205,
 'is': 203,
 'what': 196,
 'image macros': 184,
 'trolling': 180,
 'gif': 171,
 "what's delaying": 170,
 'my dinner': 170,

In [15]:
sample_meme()

{'title': 'First Time Tasting Reaction Videos',
 'url': 'https://knowyourmeme.com/memes/first-time-tasting-reaction-videos',
 'last_update_source': 1576536629,
 'category': 'Meme',
 'template_image_url': 'https://i.kym-cdn.com/entries/icons/original/000/017/930/firstimevid.PNG',
 'meta': {'og:title': 'First Time Tasting Reaction Videos',
  'og:site_name': 'Know Your Meme',
  'og:image': 'https://i.kym-cdn.com/entries/icons/facebook/000/017/930/firstimevid.jpg',
  'og:image:width': '600',
  'og:image:height': '315',
  'og:type': 'article',
  'fb:app_id': '104675392961482',
  'fb:pages': '88519108736',
  'article:publisher': 'https://www.facebook.com/knowyourmeme',
  'twitter:card': 'summary_large_image',
  'twitter:site': '@knowyourmeme',
  'twitter:creator': '@knowyourmeme',
  'twitter:title': 'First Time Tasting Reaction Videos',
  'twitter:description': 'First Time Tasting Reaction Videos are recordings of people tasting various foods for the first time while providing commentary abo

### Additional references

In [16]:
_refs = {}

for _dict in data:  # for dict in data list
    keys = _dict.keys()
    for key in keys:

        # refs
        if key == 'additional_references':
            for subkey in _dict[key]:

                if subkey not in _refs:
                    _refs[subkey] = 1
                else:
                    _refs[subkey] += 1

                            

display(dict(sorted(_refs.items(), key=lambda item: item[1], reverse=True)))


{'Wikipedia': 3918,
 'Urban Dictionary,': 2285,
 'Encyclopedia Dramatica,': 1648,
 'Reddit,': 1347,
 'Urban Dictionary': 1022,
 'Twitter,': 844,
 'Facebook,': 838,
 'Meme Generator': 505,
 'Meme Generator,': 446,
 'Encyclopedia Dramatica': 313,
 'Twitter': 301,
 'Reddit': 274,
 'Memebase,': 141,
 'Facebook': 100,
 'Wikipedia,': 21,
 'Dictionary.com,': 20,
 'Merriam-Webster,': 14,
 'Fandom,': 13,
 'Anti-Defamation League,': 12,
 'Fandom.com,': 11,
 'IMDb,': 10,
 'ASCII Art Archive,': 8,
 'Stop-Motion Apps,': 7,
 'Fandom': 6,
 'Cheezburger,': 6,
 'Tumblr': 6,
 'Wikipedia List Of': 6,
 'Rules Of The Internet,': 6,
 'Bulbapedia,': 5,
 'Youtube Clip': 5,
 'Instagram,': 5,
 'Quora,': 4,
 'Genius,': 4,
 'Etsy,': 4,
 'YouTube': 4,
 'Touhou Wiki,': 4,
 'Website,': 4,
 'Tenor GIFs,': 4,
 'Bulbapedia': 4,
 'Tenor,': 4,
 'DeviantArt,': 4,
 'National Geographic,': 3,
 'Pinterest,': 3,
 'Somethingchans Wiki': 3,
 'IMDb': 3,
 'fandom.com': 3,
 'TV Tropes,': 3,
 'Wikitionary': 2,
 'TV Tropes': 2,
 'Go

### Search keywords

In [17]:
_keywords = {}

for _dict in data:  # for dict in data list
    keys = _dict.keys()
    for key in keys:

        # refs
        if key == 'search_keywords':
            for element in _dict[key]:

                if element not in _keywords:
                    _keywords[element] = 1
                else:
                    _keywords[element] += 1


display(
    dict(sorted(_keywords.items(), key=lambda item: item[1], reverse=True)))


{'internet': 556,
 'internet slang': 371,
 'Western Animation': 340,
 'Cartoons': 340,
 'meme': 330,
 'memes': 330,
 'trend': 330,
 'trends': 330,
 'idea': 330,
 'nintendo': 295,
 '2016 Presidential election': 228,
 '/m/0ncc_0w': 228,
 '\\"image macro\\"': 171,
 'tumblr': 162,
 '4chan': 158,
 'disney meme': 157,
 'Anonymous': 157,
 '2016 Republican Presidential Primary': 149,
 'xkcd': 142,
 'webcomics': 140,
 'Cyanide and Happiness': 140,
 'Penny Arcade': 140,
 'ctrl alt del': 140,
 'nickelodeon': 135,
 'nicktoons': 135,
 '\\"advice animals\\"': 133,
 'Anime': 128,
 '/m/01yrx': 122,
 '/m/0bt9lr': 122,
 '/m/015p6': 122,
 '/m/0ch_cf': 122,
 '/m/03k3r': 122,
 'youtube': 119,
 'donald trump': 119,
 'steam': 109,
 'twitter': 109,
 'google plus': 109,
 'pokemon': 104,
 'valve': 101,
 'hasbro': 94,
 'cartoon network': 84,
 'My Little Pony': 84,
 'Friendship is Magic': 84,
 'brony': 84,
 '/m/0czdsgs': 84,
 'MLP': 84,
 'Spongebob Squarepants': 83,
 'owling': 83,
 'Planking': 82,
 'lying down ga

In [18]:
sample_meme()

{'title': "Lil Uzi Vert's Stage Dive",
 'url': 'https://knowyourmeme.com/memes/lil-uzi-verts-stage-dive',
 'last_update_source': 1588869371,
 'category': 'Meme',
 'template_image_url': 'https://i.kym-cdn.com/entries/icons/original/000/022/950/Flying-Uzi.jpg',
 'meta': {'og:title': "Lil Uzi Vert's Stage Dive",
  'og:site_name': 'Know Your Meme',
  'og:image': 'https://i.kym-cdn.com/entries/icons/facebook/000/022/950/Flying-Uzi.jpg',
  'og:image:width': '600',
  'og:image:height': '315',
  'og:type': 'article',
  'fb:app_id': '104675392961482',
  'fb:pages': '88519108736',
  'article:publisher': 'https://www.facebook.com/knowyourmeme',
  'twitter:card': 'summary_large_image',
  'twitter:site': '@knowyourmeme',
  'twitter:creator': '@knowyourmeme',
  'twitter:title': "Lil Uzi Vert's Stage Dive",
  'twitter:description': 'Lil Uzi Vert’s Stage Dive refers to jokes and photoshops made about a photograph of rapper Lil Uzi Vert jumping into the crowd during a show in Miami.',
  'twitter:image'

### Meme sampling | country of origin check

In [19]:
for i in range(20):
    meme = sample_meme()
    print(meme['url'])

https://knowyourmeme.com/memes/its-a-fucking-hammer
https://knowyourmeme.com/memes/liam-neesons-cock
https://knowyourmeme.com/memes/marauder-shields
https://knowyourmeme.com/memes/menat
https://knowyourmeme.com/memes/os-tan
https://knowyourmeme.com/memes/internet-slang
https://knowyourmeme.com/memes/stoner-dog
https://knowyourmeme.com/memes/proud-bat-is-proud
https://knowyourmeme.com/memes/eyebrows-on-fleek
https://knowyourmeme.com/memes/true-story
https://knowyourmeme.com/memes/the-deliciousness-is-gone
https://knowyourmeme.com/memes/i-hate-my-sleeping-subs-muh-bedtime
https://knowyourmeme.com/memes/name-dissociation
https://knowyourmeme.com/memes/sandy-is-dead
https://knowyourmeme.com/memes/viral-videos
https://knowyourmeme.com/memes/gorillaz-demon-days-cover-parodies
https://knowyourmeme.com/memes/console-wars-console-debates
https://knowyourmeme.com/memes/jimmy-fallon-pictionary-results
https://knowyourmeme.com/memes/issa
https://knowyourmeme.com/memes/beating-a-dead-horse


### JSON summary script

In [20]:
#!/usr/bin/env python
# Gives a quick summary of the structure of a JSON file, including the keys, object types, and
# leaf node types. Provides a count of each data type so you can quickly tell which data points
# are common.
#
# Example:
#
# $ curl -sS 'https://raw.githubusercontent.com/johan/world.geo.json/master/countries.geo.json' | python summarizejson
#      9191 {features}.[].{geometry}.{coordinates}.[].[].[].[].float
#        41 {features}.[].{geometry}.{coordinates}.[].[].[].[].int
#     12171 {features}.[].{geometry}.{coordinates}.[].[].[].float
#        25 {features}.[].{geometry}.{coordinates}.[].[].[].int
#       180 {features}.[].{geometry}.{type}.unicode
#       180 {features}.[].{id}.unicode
#       180 {features}.[].{properties}.{name}.unicode
#       180 {features}.[].{type}.unicode
#         1 {type}.unicode
#
# From which I can see that I can extract data like this:
#
# $ curl -sS 'https://raw.githubusercontent.com/johan/world.geo.json/master/countries.geo.json' | jq '.features[0] | [ .id, .geometry.coordinates[0][0][0:2] ]'
# [
#  "AFG",
#  [
#    61.210817,
#    35.650072
#  ]
# ]

import json
import fileinput
import re
from collections import defaultdict

# Look for JSONP wrapped JSON - JSON wrapped in a JavaScript function call
# like:
#    my_callback1 ( [ 1, 2, 3, 4 ] ) ;
#
# This doesn't really respect every possible function name, but it'll catch most
# common ones.


def strip_jsonp(raw):
    start = re.match(r'^( *[$a-z0-9A-Z_]+ *[(] *)[[{]', raw[0:50])
    end = re.search(r"[]}] *([)][ ;]*)$", raw[-10:])

    if start and end:
        raw = raw[len(start.group(1)): -1 * len(end.group(1))]

    return raw


raw = "\n".join(fileinput.input(memes_data))
# raw = "\n".join( memes_data )
raw = strip_jsonp(raw)
data = json.loads(raw)


def yieldkeys(data, parent_key=None):
    parent_key = '%s.' % (parent_key) if parent_key else ''

    if isinstance(data, list):
        for i, item in enumerate(data):
            for y in yieldkeys(item, '%s[]' % (parent_key)):
                yield(y)
    elif isinstance(data, dict):
        for i, item in data.items():
            for y in yieldkeys(item, '%s{%s}' % (parent_key, i)):
                yield(y)
    else:
        yield('%s%s' % (parent_key, type(data).__name__))


keycount = defaultdict(lambda: 0)
for a in yieldkeys(data):
    keycount[a] += 1

for key in sorted(keycount.keys()):
    print("%4d %s" % (keycount[key], key))


UnicodeDecodeError: 'utf-8' codec can't decode byte 0x8b in position 1: invalid start byte