In [201]:
import sys

# ***** change path below as necessary *****
# Extends the module search path before the imports that follow it.
# NOTE(review): this is an absolute, machine-specific path; presumably it is
# where acronym_extract lives -- confirm and adjust per machine.
sys.path.append("/Users/herman/Documents/CDIPS-DSW-2015/DataScience-2015_self/")

import acronym_extract
import dewiki, re

In [2]:
import pyspark
from pyspark.sql import SQLContext
# NOTE(review): `sc` is not defined anywhere in this notebook; presumably it is
# the SparkContext injected by the PySpark shell/kernel -- confirm before
# running in a different environment.
sqlCtx = SQLContext(sc)

In [3]:
from pyspark.sql import Row

In [4]:
# Read the Parquet dataset at ./wiki_parquet; the path is relative, so it is
# resolved against the directory the notebook is run from.
wikiData = sqlCtx.read.parquet("./wiki_parquet")

In [5]:
# Show the column layout. title/text/username are reported as binary, which is
# why later cells wrap those fields in str().
wikiData.printSchema()

root
 |-- id: integer (nullable = true)
 |-- title: binary (nullable = true)
 |-- modified: long (nullable = true)
 |-- text: binary (nullable = true)
 |-- username: binary (nullable = true)



In [6]:
# Number of rows in the loaded dataset.
wikiData.count()

39365

In [7]:
# Register the DataFrame under the table name "wikiData" so the SQL query in
# the next cell can refer to it.
wikiData.registerTempTable("wikiData")

In [8]:
# Select every row whose title contains the phrase
# "University of California, Berkeley" (LIKE with % on both sides).
resultUCB = sqlCtx.sql("SELECT * FROM wikiData WHERE title LIKE '%University of California, Berkeley%'")

In [23]:
# Collect the article bodies onto the driver. `text` is a binary column, so
# each value is wrapped in str(). Mapping through `.rdd` (instead of calling
# map() on the DataFrame itself) keeps this working on Spark releases where
# DataFrame no longer exposes map() directly.
UCB_Text = resultUCB.rdd.map(lambda p: str(p.text)).collect()

In [10]:
# Collect the matching titles onto the driver. `title` is a binary column, so
# each value is wrapped in str(). Mapping through `.rdd` (instead of calling
# map() on the DataFrame itself) keeps this working on Spark releases where
# DataFrame no longer exposes map() directly.
UCB_Title = resultUCB.rdd.map(lambda p: str(p.title)).collect()

In [215]:
# Pick one article body to experiment with.
# NOTE(review): 8 is a hard-coded position in the collected list; the matching
# title is presumably UCB_Title[8] -- confirm, since the index depends on the
# order in which rows were collected.
UCB = UCB_Text[8]

In [220]:
# Preview the first 1000 characters of the raw, uncleaned markup.
UCB[:1000]

'{{infobox University|name = University of California, Berkeley School of Public Health|image_size = 223|image_name =[[Image:UCBerkeley SPH Logo.jpeg|250px]]|established = 1943 <ref name="School of Public Health History">{{cite web|title=School of Public Health History|url=http://sunsite.berkeley.edu/~ucalhist/general_history/campuses/ucb/colleges.html#public_health|publisher=UC Berkeley|accessdate=April 21, 2012}}</ref>|type = [[Public university|Public]]|dean = Stefano Bertozzi, MD, PhD|faculty =|students = 410 <ref name="UC Berkeley">{{cite web|title=School of Public Health, University of California, Berkeley|url=http://sph.berkeley.edu/students/pdf/Announcement11-12.pdf|publisher=UC Berkeley|accessdate=April 21, 2012}}</ref>|city = [[Berkeley, California|Berkeley]]|state = [[California]]|country = {{USA}}|website = http://www.sph.berkeley.edu|logo           }}The \'\'\'University of California, Berkeley School of Public Health,\'\'\' commonly called the \'\'\'Berkeley School of Pub

In [216]:
# FUNCTIONS FOR CLEANING WIKI MARKUP

def get_reducer_ranges(braces_list_signed, braces_list_positions, length_of_braces):
    """Compute the spans of top-level brace groups from a stream of brace tokens.

    The tokens are walked in order while a running nesting depth is kept. A
    span starts at the token seen while the depth is zero and ends (token
    width included) at the token that brings the depth back to zero, so
    nested groups are folded into their outermost span.

    Parameters
    ----------
    braces_list_signed : sequence of int
        +1 for every opening token and -1 for every closing token, in text order.
    braces_list_positions : sequence of int
        Start offset of each token; must have the same length as
        ``braces_list_signed``.
    length_of_braces : int
        Width of one token; added to the closing token's offset so that the
        span covers the closer as well.

    Returns
    -------
    numpy.ndarray of shape (k + 1, 2), or None
        Row 0 is the sentinel ``[0, 0]``; each following row is
        ``[start, end]`` of one top-level group. ``None`` is returned (after
        printing a message) when the two lists differ in length or when the
        depth does not return to zero at the end.

    Notes
    -----
    The depth is only ever compared with zero, so a closing token met at
    depth zero also opens a span (the depth goes negative and the span ends
    when it climbs back to zero).
    """
    # Local import: the notebook never imports numpy in its import cells, so a
    # bare `np` would raise NameError on a fresh kernel.
    import numpy as np

    if len(braces_list_signed) != len(braces_list_positions):
        print("Lists of Unequal Lengths. Return Empty!")
        return None

    # Collect rows in a plain list and convert once at the end; appending to a
    # numpy array inside the loop would copy the whole array on every span.
    spans = [[0, 0]]
    depth = 0
    span_start = 0
    for sign, position in zip(braces_list_signed, braces_list_positions):
        if depth == 0:
            span_start = position
        depth += sign
        if depth == 0:
            spans.append([span_start, position + length_of_braces])

    if depth != 0:
        print("Marker did not get back to zero indicating unpaired brackets!")
        return None
    return np.array(spans, dtype='i8')

def apply_excise_list_to_text(excise_list,oldstring):
    """Return `oldstring` with every span listed in `excise_list` cut out.

    `excise_list` is indexed as a 2-D array (``excise_list[row, col]``) whose
    rows are ``[start, end]`` pairs. The text kept is whatever lies between
    the end of one row and the start of the next, plus everything after the
    end of the last row.
    """
    last_row = len(excise_list) - 1
    # Gaps between consecutive spans: end of row k up to start of row k + 1.
    kept_pieces = [
        oldstring[excise_list[row, 1]:excise_list[row + 1, 0]]
        for row in range(last_row)
    ]
    # Tail of the text after the final span.
    kept_pieces.append(oldstring[excise_list[last_row, 1]:])
    return ''.join(kept_pieces)

def cleanWikiMarkup(text):
    """Strip wiki markup from `text` and return what is left.

    Part 1 replaces file/category links, link targets before a pipe, link
    brackets, quote runs, comments, runs of '=', <ref> tags and any other
    tags with a space, then tidies the whitespace that leaves behind.
    Part 2 removes every top-level ``{{...}}`` group, nested groups included.

    If the ``{{`` / ``}}`` tokens are not balanced, get_reducer_ranges reports
    the problem and returns None; in that case the Part 1 result is returned
    with the ``{{...}}`` groups left in place instead of failing.
    """
    # Part 1: clean all markup except for {{}}
    markup_re = re.compile(r'''\[{2}(File|Category):.+?\]{2}|        #  [[File: ]]
                                    [\s\w#()]+?\||            #  text directly before a pipe, pipe included
                                    (\[{2}|\]{2})|            #  [[ or ]]
                                    \'{2,5}|                  #  two to five occurences of '
                                    (<s>|<!--).+?(</s>|-->)|  #  comments
                                    ={1,6}|                   #  one to six occurences of =
                                    \<ref.+?(\/>|\<\/ref\>)|  #  <ref /> and <ref></ref> tags
                                    \<.+?\>                   #  any tags
                                    ''',
                           re.X)
    text = markup_re.sub(' ', text)
    # Collapse the whitespace left by the substitutions and re-attach
    # punctuation that now has a stray space next to it.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\( ', '(', text)
    text = re.sub(r' \)', ')', text)
    text = re.sub(r'\|', '| ', text)
    text = re.sub(r' \.', '.', text)

    # Part 2: clean {{}}
    # Scan once and derive both the signs and the offsets from the same matches.
    brace_matches = list(re.finditer('{{|}}', text))
    braces_list_signed = [+1 if m.group(0) == '{{' else -1 for m in brace_matches]
    braces_list_positions = [m.start(0) for m in brace_matches]
    excise_list = get_reducer_ranges(braces_list_signed, braces_list_positions, 2)
    if excise_list is None:
        # Unbalanced braces: nothing can be excised safely, keep Part 1 output.
        return text
    return apply_excise_list_to_text(excise_list, text)


In [217]:
# Run the markup cleaner defined above on the selected article.
UCB_clean = cleanWikiMarkup(UCB)

In [218]:
# Display the cleaned text in full (unlike the 1000-character preview of the
# raw markup earlier).
UCB_clean

"The University of California, Berkeley School of Public Health, commonly called the Berkeley School of Public Health or UC Berkeley School of Public Health, is one of 14 schools and colleges at the University of California, Berkeley . The School of Public Health is consistently rated among the best in the nation, with recent rankings placing its doctoral programs in Epidemiology and Environmental Health Sciences 1st, its doctoral program in Health Policy 2nd, and its Master of Public Health program 8th in their respective categories. Established in 1943, it was the first school of public health west of the Mississippi River . The school is currently accredited by the Council on Education for Public Health . History The School of Public Health has its origins in the Department of Hygiene, which pioneered much of California's start of the 20th century public health endeavors. It was Karl F. Meyer , however, whose compelling 1930s Public Health curriculum demonstrated a pressing need for

In [219]:
# extract_acronym comes from the local acronym_extract module imported in the
# first cell; its implementation is not part of this notebook.
# NOTE(review): the recorded output is a list of three-string lists, presumably
# [acronym, expansion, preceding word] -- confirm against the module.
acronym_extract.extract_acronym(UCB_clean)

[['MBA', 'Master of Business Administration', 'The'],
 ['MCP', 'Master of City Planning', 'The'],
 ['MPP', 'Master of Public Policy', 'The'],
 ['MSW', 'Master of Social Work', 'The'],
 ['MCRP', 'Master of City and Regional Planning', 'The'],
 ['UCSF', 'University of California, San Francisco', 'rankings']]