# Start MongoDB Server

In [4]:
!conda install --yes --quiet pymongo



# All requested packages already installed.
# packages in environment at /opt/conda:
#
pymongo                   3.4.0                    py36_0    defaults


In [5]:
import pymongo
import re
import requests
import pandas as pd
import numpy as np
from string import punctuation

client = pymongo.MongoClient('54.200.30.7', 27016)

# Wikipedia General Functions

In [391]:
class functions():
    
    def clean_text(self, string):
        string = functions().delete_periods(string)
        new_string = string.replace(' ', '_')
        new_string = new_string.lower()
        return new_string
    
    def pageids(self, category):
        category = category.replace(' ', '+')
        category = category.lower()
        r = requests.get('http://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category%3A+' + category + '&cmlimit=max')
        pageids = pd.DataFrame(r.json()['query']['categorymembers'])['pageid']
        return pageids

    def find_title(self, page_number):
        query = 'http://en.wikipedia.org/w/api.php?action=query&prop=extracts&\
             rvprop=content&rvsection=0&format=json&pageids={}'.format(page_number)
        my_request = requests.get(query).json()
        return my_request['query']['pages'][str(page_number)]['title']

    def strip_punctuation(self, s):
        return ''.join(c for c in s if c not in punctuation or c is '.')
    
    def delete_periods(self, s):
        return ''.join(c for c in s if c not in punctuation)

    def striphtml(self, data):
        p = re.compile(r'<.*?>')
        return p.sub('', data)

    def get_page_contents(self, pageid, category):
        query = 'http://en.wikipedia.org/w/api.php?action=query&prop=extracts&\
             rvprop=content&rvsection=0&format=json&pageids={}'.format(pageid)
        my_request = requests.get(query).json()
        title = self.striphtml(my_request['query']['pages'][str(pageid)]['title']) 
        no_html_string = self.striphtml(my_request['query']['pages'][str(pageid)]['extract']).replace('\n', ' ')
        return {'title':title, 'raw page text':self.strip_punctuation(no_html_string), 'category':category}
    

In [377]:
class mongodb():
    wiki_db = []
    database_name_ = ''
    
    def __init__(self, database_name):
        function_init = functions()
        self.database_name_ = function_init.clean_text(database_name)
        
    def collections(self):
        return client[self.database_name_].collection_names()
        
    def add_database(self):
        self.wiki_db = client[self.database_name_]
        
    def drop_database(self):
        client.drop_database(self.database_name_)
    
    def add_first_layer(self, database_name):
        function_init = functions()
        database_edited = function_init.clean_text(database_name)
        wiki_level_0 = self.wiki_db[database_edited]
        wiki_level_1 = self.wiki_db.subcategories
        these_pageids = function_init.pageids(self.database_name_)
        for pages in these_pageids:
            if 'Category:' not in function_init.find_title(pages):
                wiki_level_0.insert_one(function_init.get_page_contents(pages, self.database_name_))
            if 'Category:' in function_init.find_title(pages) :
                subcategory = function_init.find_title(pages)
                subcategory = subcategory.replace('Category:', database_name + ' -> ')
                wiki_level_1.insert_one(function_init.get_page_contents(pages, subcategory))
                
    def add_second_layer(self, database_name, subcategory_name):
        function_init = functions()
        database_edited = function_init.clean_text(database_name)
        subcat_edited = function_init.clean_text(subcategory_name)
        wiki_level_0 = self.wiki_db[database_edited][subcat_edited]
        wiki_level_1 = self.wiki_db[database_edited][subcat_edited].subcategories
        these_pageids = functions().pageids(subcategory_name)
        for pages in these_pageids:
            if 'Category:' not in functions().find_title(pages):
                wiki_level_0.insert_one(functions().get_page_contents(pages, 
                                                                      database_name + ' -> ' + subcategory_name))
            if 'Category:' in functions().find_title(pages): 
                subcategory = functions().find_title(pages)
                subcategory = subcategory.replace('Category:', 
                                                  database_name + ' -> ' + subcategory_name + ' -> ')
                wiki_level_1.insert_one(functions().get_page_contents(pages, subcategory))
                
    def add_third_layer(self, database_name, subcategory_name_0, subcategory_name_1):
        function_init = functions()
        subcategory_edited_0 = function_init.clean_text(subcategory_name_0)
        subcategory_edited_1 = function_init.clean_text(subcategory_name_1)
        database_edited = function_init.clean_text(database_name)
        wiki_level_0 = self.wiki_db[database_edited][database_edited][subcategory_edited_0][subcategory_edited_1]
        wiki_level_1 = self.wiki_db[database_edited][database_edited][subcategory_edited_0][subcategory_edited_1].subcategories
        these_pageids = function_init.pageids(subcategory_name_1)
        for pages in these_pageids:
            if 'Category:' not in function_init.find_title(pages):
                wiki_level_0.insert_one(function_init.get_page_contents(pages,
                                                                        database_name + ' -> ' + subcategory_name_0 +
                                                                       ' -> ' + subcategory_name_1))
            if 'Category:' in function_init.find_title(pages) :
                subcategory = function_init.find_title(pages)
                subcategory = subcategory.replace('Category:', database_name + ' -> ' + subcategory_name_0 + ' -> '
                                                 + subcategory_name_1 + ' -> ')
                wiki_level_1.insert_one(function_init.get_page_contents(pages, subcategory))
                
    def add_fourth_layer(self, database_name, subcategory_name_0, subcategory_name_1, subcategory_name_2):
        function_init = functions()
        subcategory_edited_0 = function_init.clean_text(subcategory_name_0)
        subcategory_edited_1 = function_init.clean_text(subcategory_name_1)
        subcategory_edited_2 = function_init.clean_text(subcategory_name_2)
        database_edited = function_init.clean_text(database_name)
        wiki_level_0 = self.wiki_db[database_edited][database_edited][subcategory_edited_0][subcategory_edited_1][subcategory_edited_2]
        wiki_level_1 = self.wiki_db[database_edited][database_edited][subcategory_edited_0][subcategory_edited_1][subcategory_edited_2].subcategories
        these_pageids = function_init.pageids(subcategory_name_2)
        for pages in these_pageids:
            if 'Category:' not in function_init.find_title(pages):
                wiki_level_0.insert_one(function_init.get_page_contents(pages,
                                                                        database_name + ' -> ' + subcategory_name_0 +
                                                                       ' -> ' + subcategory_name_1 + ' -> ' 
                                                                        + subcategory_name_2))
            if 'Category:' in function_init.find_title(pages) :
                subcategory = function_init.find_title(pages)
                subcategory = subcategory.replace('Category:', database_name + ' -> ' + subcategory_name_0 + ' -> '
                                                 + subcategory_name_1 + ' -> ' + subcategory_name_2 + ' -> ')
                wiki_level_1.insert_one(function_init.get_page_contents(pages, subcategory))

In [378]:
def add_new(database_name):
    database_edited = functions().clean_text(database_name)
    database = mongodb(database_name)
    database.drop_database()
    database.add_database()
    database.add_first_layer(database_name)
    for articles in list(client[database_edited].subcategories.find()):
        category = articles['title'].replace('Category:', '')
        category_edited = functions().clean_text(category)
        database.add_second_layer(database_name, category)
        for categories in list(client[database_edited][database_edited][category_edited].subcategories.find()):
            new_subcat = categories['title'].replace('Category:', '')
            new_subcat_edited = functions().clean_text(new_subcat)
            database.add_third_layer(database_name, category, new_subcat)
            for subcategories in list(client[database_edited][database_edited][category_edited][new_subcat_edited].subcategories.find()):
                final_subcat = subcategories['title'].replace('Category:', '')
                database.add_fourth_layer(database_name, category, new_subcat, final_subcat)

In [379]:
add_new('Lunch')

In [380]:
client.lunch.collection_names()

['lunch.lunch.lunch_dishes.sandwiches',
 'lunch.lunch_dishes',
 'lunch.lunch.lunch_dishes.sandwiches.subcategories',
 'lunch.lunch_dishes.subcategories',
 'subcategories',
 'lunch',
 'lunch.lunch_counters']

In [384]:
list(client.lunch.lunch.lunch.lunch_dishes.sandwiches.subcategories.find())

[{'_id': ObjectId('5a20998b6409d6001fd9f609'),
  'category': 'Lunch -> Lunch dishes -> Sandwiches -> American sandwiches',
  'raw page text': '',
  'title': 'Category:American sandwiches'},
 {'_id': ObjectId('5a20998c6409d6001fd9f60a'),
  'category': 'Lunch -> Lunch dishes -> Sandwiches -> Bacon sandwiches',
  'raw page text': '',
  'title': 'Category:Bacon sandwiches'},
 {'_id': ObjectId('5a20998e6409d6001fd9f60b'),
  'category': 'Lunch -> Lunch dishes -> Sandwiches -> Breakfast sandwiches',
  'raw page text': '',
  'title': 'Category:Breakfast sandwiches'},
 {'_id': ObjectId('5a20998f6409d6001fd9f60c'),
  'category': 'Lunch -> Lunch dishes -> Sandwiches -> British sandwiches',
  'raw page text': '',
  'title': 'Category:British sandwiches'},
 {'_id': ObjectId('5a2099916409d6001fd9f60d'),
  'category': 'Lunch -> Lunch dishes -> Sandwiches -> Chilean sandwiches',
  'raw page text': '',
  'title': 'Category:Chilean sandwiches'},
 {'_id': ObjectId('5a2099926409d6001fd9f60e'),
  'category

In [392]:
mule = functions().clean_text('m.u.l.e.')
mule

'mule'