In [22]:
import pickle
from collections import defaultdict
import requests, re
from tqdm import tqdm
import numpy as np

In [2]:
# Load the pickled word dictionary; only its keys are needed as the set of
# subtopic names to look for.
# NOTE: pickle.load can run arbitrary code - only open files from a trusted source.
with open('word_dict.pickle', 'rb') as word_dict_file:
    word_dict = pickle.load(word_dict_file)
common_subtopics = set(word_dict.keys())

In [14]:
# Load the pickled clustering and keep just the cluster member collections.
# NOTE: pickle.load can run arbitrary code - only open files from a trusted source.
with open('clusters.pickle', 'rb') as clusters_file:
    clusters = pickle.load(clusters_file)
clustered_subtopics = list(clusters.values())

In [4]:
URL = "https://en.wikipedia.org/w/api.php"
TITLE = "Wikipedia:Lists of popular pages by WikiProject"
# Raw string so the backslashes reach the regex engine intact (no invalid
# escape sequences); matches '[', ']' and newline characters.
WIKI_MARKUP = re.compile(r"\[|\]|\n")


def _fetch_table_rows(session, page):
    """Fetch section 1 of `page` as wikitext and split it on '|-'.

    Parameters:
        session: the requests.Session used to issue the GET request.
        page: title of the wiki page to parse.

    Returns:
        list of str, the wikitext chunks between '|-' markers.

    Raises:
        requests.HTTPError: the API answered with a 4xx/5xx status.
        KeyError: the JSON reply carries no 'parse' payload.
    """
    params = {
        'action': "parse",
        'page': page,
        'prop': 'wikitext',
        'section': 1,
        'format': "json"
    }
    res = session.get(url=URL, params=params)
    # Fail here with a clear HTTP error instead of a confusing JSON/Key error below.
    res.raise_for_status()
    return res.json()['parse']['wikitext']['*'].split('|-')


def _strip_markup(cell):
    """Return `cell` with every '[', ']' and newline character removed."""
    return WIKI_MARKUP.sub('', cell)


S = requests.Session()

# Step 1: from the index page, keep the popular-page lists whose last cell
# mentions 'Computing'.
popular_pages = []
for row in _fetch_table_rows(S, TITLE):
    cells = row.split("|")
    if len(cells) < 2:
        # Chunk without any '|' has no second cell to read; skip it.
        continue
    popular_list = _strip_markup(cells[1])
    subject = _strip_markup(cells[-1])
    if 'Computing' in subject:
        popular_pages.append(popular_list)

# Step 2: from every selected list, collect the article titles whose last
# cell (the importance rating) is neither 'Low' nor 'Unknown'.
entries = []
for pages in tqdm(popular_pages):
    for row in _fetch_table_rows(S, pages):
        cells = row.split("|")
        if len(cells) < 3:
            continue
        page_title = _strip_markup(cells[2])
        importance = _strip_markup(cells[-1])
        if importance not in ('Low', 'Unknown'):
            # The first character is dropped (presumably leading whitespace
            # from the table markup - TODO confirm), then two encoded
            # sequences are replaced with plain characters.
            # NOTE(review): '%E2%80%93' is a percent-encoded en dash but is
            # replaced with a hyphen - confirm the resulting titles resolve.
            entries.append(page_title[1:].replace('%E2%80%93', '-').replace('&#039;', "'"))

100%|█████████████████████████████████████| 601/601 [00:00<00:00, 198236.61it/s]
100%|███████████████████████████████████████████| 14/14 [00:19<00:00,  1.36s/it]


In [9]:
# Nested counter indexed as co_occurence[a][b]; missing keys default to 0 so
# counts can be incremented without initialising them first.
co_occurence = defaultdict(lambda: defaultdict(int))

In [10]:
URL = "https://en.wikipedia.org/w/api.php"
S = requests.Session()

# Section anchors that are never counted, even if they are common subtopics.
# Built once (as a set) instead of on every loop iteration.
bad_sections = {'See_also', 'Notes_and_references', 'External_links', 'References',
                'Further_reading', 'Notes', 'Bibliography'}

# Start from an empty table so re-running this cell does not double the counts.
co_occurence = defaultdict(lambda: defaultdict(int))

for entry in tqdm(entries):
    PARAMS = {
        'action': "parse",
        'page': entry,
        'prop': 'sections',
        'format': "json"
    }
    res = S.get(url=URL, params=PARAMS)
    data = res.json()
    try:
        page_sections = data['parse']['sections']
    except (KeyError, TypeError):
        # Reply has no usable 'parse' payload: skip this title (best effort).
        # A bare `except:` would also have swallowed KeyboardInterrupt.
        continue

    # Top-level section anchors of this page that are known common subtopics.
    list_of_sections = [
        section['anchor']
        for section in page_sections
        if int(section['toclevel']) == 1
        and section['anchor'] not in bad_sections
        and section['anchor'] in common_subtopics
    ]

    # Count every unordered pair once, recording it in both directions so the
    # table stays symmetric.
    for position, first in enumerate(list_of_sections):
        for second in list_of_sections[position + 1:]:
            co_occurence[first][second] += 1
            co_occurence[second][first] += 1

100%|███████████████████████████████████████| 4470/4470 [08:01<00:00,  9.28it/s]


In [11]:
# Number of subtopics that have an entry in the co-occurrence table.
len(co_occurence)

148

In [12]:
# Display the full nested co-occurrence table for inspection.
co_occurence

defaultdict(<function __main__.<lambda>()>,
            {'Components': defaultdict(int,
                         {'Platforms': 2,
                          'Criticism': 3,
                          'Version_history': 2,
                          'Origins': 6,
                          'Use': 2,
                          'History': 35,
                          'Versions': 2,
                          'Examples': 5,
                          'Market_share': 2,
                          'Terminology': 4,
                          'Development': 4,
                          'Sources': 4,
                          'Features': 8,
                          'Adoption': 2,
                          'Explanatory_notes': 2,
                          'Overview': 2,
                          'Standards': 2,
                          'Impact': 2,
                          'Motivation': 1,
                          'Etymology': 3,
                          'Applications': 5,
                        

In [15]:
# Display the clusters loaded from clusters.pickle (one collection of subtopic names each).
clustered_subtopics

[['Components',
  'Features',
  'Applications',
  'Supported_devices',
  'Advantages',
  'System_features',
  'App_features',
  'Main_features'],
 ['Platforms',
  'Products',
  'Issues',
  'Problems',
  'Syntax',
  'Functions',
  'Gameplay',
  'Other',
  'Details',
  'Plot',
  'Function',
  'Games',
  'Name',
  'Mascot'],
 ['Criticism',
  'History',
  'Background',
  'Personal_life',
  'Books',
  'Early_life',
  'Philanthropy',
  'Early_life_and_education',
  'Changes',
  'History_and_development',
  'Etymology',
  'Origins',
  'Origin',
  'Other_uses',
  'Introduction',
  'Criticisms',
  'Education',
  'Biography',
  'Death',
  'Philosophy',
  'Libraries',
  'Uses',
  'Structure',
  'Topics',
  'Theory',
  'Derivatives',
  'Career_and_research'],
 ['Version_history',
  'Release_history',
  'Releases',
  'Timeline',
  'Release',
  'Editions',
  'Versions'],
 ['Hardware',
  'Development',
  'Security_and_privacy',
  'Devices',
  'Architecture',
  'Functionality',
  'Production',
  'Soft

In [13]:
# Collects one summary dict per subtopic with the keys
# 'subtopic', 'total_freq', 'in_cluster' and 'not_in_cluster'.
cooccurence_list = []

In [16]:
# Rebuild from scratch so re-running this cell does not append duplicate rows.
cooccurence_list = []

for key, dic in co_occurence.items():
    # Find the cluster this subtopic belongs to. If it appears in several
    # clusters, the last one in clustered_subtopics is used.
    own_cluster = None
    for cluster in clustered_subtopics:
        if key in cluster:
            own_cluster = cluster
    if own_cluster is None:
        # Subtopic is in no cluster, so an in/out-of-cluster split is undefined.
        # Skip it rather than record the previous subtopic's numbers.
        continue

    # Split this subtopic's co-occurrence counts by whether the partner
    # subtopic sits in the same cluster.
    total_freq = sum(dic.values())
    in_cluster = sum(count for partner, count in dic.items() if partner in own_cluster)
    cooccurence_list.append({'subtopic': key, 'total_freq': total_freq,
                             'in_cluster': in_cluster,
                             'not_in_cluster': total_freq - in_cluster})

In [17]:
# Display the per-subtopic in-cluster / out-of-cluster summary records.
cooccurence_list

[{'subtopic': 'Components',
  'total_freq': 132,
  'in_cluster': 14,
  'not_in_cluster': 118},
 {'subtopic': 'Platforms',
  'total_freq': 87,
  'in_cluster': 0,
  'not_in_cluster': 87},
 {'subtopic': 'Criticism',
  'total_freq': 393,
  'in_cluster': 152,
  'not_in_cluster': 241},
 {'subtopic': 'Version_history',
  'total_freq': 114,
  'in_cluster': 2,
  'not_in_cluster': 112},
 {'subtopic': 'History',
  'total_freq': 3523,
  'in_cluster': 351,
  'not_in_cluster': 3172},
 {'subtopic': 'Architecture',
  'total_freq': 156,
  'in_cluster': 17,
  'not_in_cluster': 139},
 {'subtopic': 'Features',
  'total_freq': 1047,
  'in_cluster': 29,
  'not_in_cluster': 1018},
 {'subtopic': 'Release_history',
  'total_freq': 189,
  'in_cluster': 1,
  'not_in_cluster': 188},
 {'subtopic': 'Reception',
  'total_freq': 905,
  'in_cluster': 8,
  'not_in_cluster': 897},
 {'subtopic': 'Functionality',
  'total_freq': 49,
  'in_cluster': 11,
  'not_in_cluster': 38},
 {'subtopic': 'Implementation',
  'total_freq

In [19]:
# Collects one in_cluster / total_freq ratio per subtopic record.
cooccurence_score = []

In [20]:
# Share of each subtopic's co-occurrences that fall inside its own cluster.
# Built as a fresh list so re-running this cell cannot append duplicate scores.
cooccurence_score = [
    record['in_cluster'] / record['total_freq']
    for record in cooccurence_list
]

In [23]:
# Mean, over subtopics, of the share of co-occurrences that stay inside the
# subtopic's own cluster. Lower is better (presumably because headings grouped
# into one cluster are expected to be alternatives that rarely share an
# article - TODO confirm).
np.average(cooccurence_score)

0.14201563648524493