#### To do:
1. Minimalize pull request functions, streamline and clean

In [10]:
import requests
import pymongo
import time

#### Build functions to search wikipedia and retrieve list of pages by top category

In [11]:
def pull_subcats(category, categories_list):
    category = '+'.join(category.split()).lower()
    
    api_head = 'https://en.wikipedia.org/w/api.php?'
    api_action = 'action=query&'
    api_list = 'list=categorymembers&'
    api_cmtitle = 'cmtitle=Category:' + category + '&'
    api_cmtype = 'cmtype=subcat&'
    api_cmlimit = 'cmlimit=250&'
    api_format = 'format=json'
    
    http = api_head + api_action + api_list + api_cmtitle + api_cmtype + api_cmlimit + api_format
    
    response = requests.get(http)
    json = response.json()

    subcats = []
    for cat in json['query']['categorymembers']:
        sc = cat['title'].split(':')[1]
        subcats.append(sc)
    
    subs = subcats.copy()
    categories_list.extend(subs)
    
    for sub in subs:
        if sub not in categories_list:
            sub_cats = pull_subcats(sub, categories_list)
            if len(sub_cats) > 0:
                subcats.extend(sub_cats)
    
    return subcats

In [12]:
def pull_pages(category):
    category = '+'.join(category.split()).lower()
    
    api_head = 'https://en.wikipedia.org/w/api.php?'
    api_action = 'action=query&'
    api_list = 'list=categorymembers&'
    api_cmtitle = 'cmtitle=Category:' + category + '&'
    api_cmtype = 'cmtype=page&'
    api_cmlimit = 'cmlimit=250&'
    api_format = 'format=json'
    
    http = api_head + api_action + api_list + api_cmtitle + api_cmtype + api_cmlimit + api_format
    
    response = requests.get(http)
    json = response.json()
    
    pages = set()
    for page in json['query']['categorymembers']:
        title = page['title']
        page_id = page['pageid']
        pages.add((page_id, title))
    
    return pages

In [13]:
def all_subpages(categories_list):    
    pages = set()
    for cat in categories_list:
        pages = pages.union(pull_pages(cat))
    
    return pages

In [14]:
def get_cat_page(page, pageid):    
    page = page.replace('+', '%2B')
    page = '%20'.join(page.split())
    
    api_head = 'https://en.wikipedia.org/w/api.php?'
    api_action = 'action=query&'
    api_titles = 'titles=' + page + '&'
    api_pageid = 'pageids=' + str(pageid) + '&'
    api_prop = 'prop=categories&'
    
    api_format = 'format=json'
    
    http = api_head + api_action + api_action + api_titles + api_prop + api_format
    
    response = requests.get(http)
    data = response.json()
    
    try:
        cats = data['query']['pages'][str(pageid)]['categories']
    except:
        cats = {}
    
    cats_list = []
    
    for cat in cats:
        cats_list.append(cat['title'])
    
    return cats_list

In [34]:
def get_page_text(title, page_id):
    title = title.replace('+', '%2B')
    ptitle = '+'.join(title.split())
    
    api_head = 'https://en.wikipedia.org/w/api.php?'
    api_action = 'action=query&'
    api_prop = 'prop=revisions&'
    api_rvprop = 'rvprop=content&'
    api_titles = 'titles=' + ptitle + '&'
    api_pageid = 'pageids=' + page_id + '&'
    api_format2 = 'formatversion=2&'
    api_format = 'format=json&'
    
    
    http = api_head + api_action + api_prop + api_rvprop + api_titles + api_format2 + api_format
    response = requests.get(http)
    json = response.json()
    
    try:
        text = json['query']['pages'][0]['revisions'][0]['content']
    except:
        text = None
    
    return text

#### Master function, pulls and orders data

In [16]:
def get_wiki_category_pages(top_category):
    start = time.time()
    categories = [top_category]
    categories.extend(pull_subcats(top_category, categories))

    pages_list = list(all_subpages(categories))

    all_pages_dict = dict()
    
    for pageid, topic in pages_list:
        all_pages_dict[pageid] = {'title': topic}
        
    for pageid, title_dict in all_pages_dict.items():
        title_dict['categories'] = get_cat_page(title_dict['title'], pageid)
        title_dict['page_id'] = str(pageid)

    all_lod = []
    
    for pageid, title_dict in all_pages_dict.items():
        all_lod.append(title_dict)
        
    for page_dict in all_lod:
        page_dict['text'] = get_page_text(page_dict['title'], page_dict['page_id'])
        page_dict['Category'] = top_category
        
    print(time.time() - start)
    return all_lod

In [57]:
# top_category = "Machine Learning"

In [63]:
# start = time.time()
# categories = [top_category]
# categories.extend(pull_subcats(top_category, categories))

# pages_list = list(all_subpages(categories))

# all_pages_dict = dict()

# print(time.time() - start)

In [62]:
# categories

In [64]:
# len(pages_list)

In [17]:
# for pageid, topic in pages_list:
#     all_pages_dict[pageid] = {'title': topic}

In [55]:
# len(all_pages_dict)

1394

In [54]:
# for pageid, title_dict in all_pages_dict.items():
#     title_dict['categories'] = get_cat_page(title_dict['title'], pageid)
#     title_dict['page_id'] = str(pageid)

# all_lod = []

In [57]:
# for pageid, title_dict in all_pages_dict.items():
#     all_lod.append(title_dict)

In [60]:
# for page_dict in all_lod:
#     page_dict['text'] = get_page_text(page_dict['title'], page_dict['page_id'])
#     page_dict['Category'] = top_category

In [1]:
#all_lod

#### MongoDB Upload

In [66]:
def post_to_mongo(category, page_list):
    cli = pymongo.MongoClient('34.209.242.27', 27016)
    wikidb = cli.wikipedia
    coll_ref = wikidb.get_collection(category)
    coll_ref.insert_many(page_list)

In [7]:
cli = pymongo.MongoClient('34.209.242.27', 27016)
wikidb = cli.wikipedia
cli.database_names()

['admin', 'local', 'my_database', 'test', 'wikipedia']

In [72]:
wikidb.collection_names()

['bussof', 'Machine Learning 2', 'ml_col', 'Business Software 2']

In [64]:
# coll_ref = wikidb.bussof

In [65]:
# for page in all_lod:
#     coll_ref.insert_one(page)

In [66]:
# wikidb.collection_names()

['bussof', 'ml_col']

In [73]:
col_ref = wikidb.get_collection('Business Software 2')
col_ref.count()

1393

In [69]:
#coll_ref.count()

In [70]:
wikidb.bussof.count(), wikidb.ml_col.count()

(1394, 1107)

#### Pull pages and populate MongoDB Server

Page pull procedure currently takes about 8~11 minutes depending on category. 
Proceed with care. 

Business Software timeit: 680.58

Machine Learning timeit: 445.4

In [35]:
bus_sof_list = get_wiki_category_pages('Business Software')

680.5887079238892


In [40]:
ml_list = get_wiki_category_pages('Machine Learning')

445.40201592445374


In [67]:
post_to_mongo('Machine Learning 2', ml_list)

In [71]:
post_to_mongo('Business Software 2', bus_sof_list)

In [75]:
# cats_checker = []
# for page in ml_list:
#     cats_checker.append(page['categories'])

In [76]:
# for lists in cats_checker:
#     if len(lists) == 1:
#         if lists == ['Category:Machine learning']:
#             print(lists)

In [74]:
# cats_checker

#### Rudimentary UX

In [87]:
def pull_and_pop():
    print("Warning, the following script could take up to 20 minutes to complete. Are you sure you wish to continue?")
    contin = input("yes/no: ")
    if contin == 'no':
        return "Quit."
    category_one = input("Input Category 1: ")
    category_two = input("Input Category 2: ")
    #print(category_one, category_two)
    
    category_one_list = get_wiki_category_pages(category_one)
    category_two_list = get_wiki_category_pages(category_two)
    
    post_to_mongo(category_one, category_one_list)
    post_to_mongo(category_two, category_two_list)
    
    return [category_one, category_two]

In [83]:
#pull_and_pop()