## Parsing

#### This script parses scraped data to retrieve dependencies, POS tags, and lemmatized tokens.

spaCy is required for this. Use the following commands to install spaCy and download its English model:
```
conda install -c conda-forge spacy

python -m spacy download en
```

## Import required packages

In [1]:
import pandas as pd
import json
import spacy
import pickle
from spacy import displacy

## Functions for loading data and creating output

In [9]:
def load_data(picklePath = None):
    """Load the parsed documents, either from a pickle or by parsing the JSON dump.

    Args:
        picklePath: path to a pickle holding a list of already-parsed
            documents. When None, the raw JSON file is read instead, its
            first 100 entries with non-empty text are parsed with spaCy, and
            the result is pickled to 'NLPifiedDocs-first100.pkl' in the
            current working directory.

    Returns:
        The list of parsed documents.
    """
    if picklePath is not None:
        # NOTE: pickle can execute arbitrary code while loading - only point
        # this at pickle files you created yourself.
        # The context manager guarantees the handle is closed after loading.
        with open(picklePath, 'rb') as f:
            l_docs = pickle.load(f)
    else:
        nlp = spacy.load('en')
        #if above doesn't work, load english model from local
        #nlp = spacy.load('E:/Users/nasser_qadri/AppData/Local/conda/conda/envs/multivac/Lib/site-packages/en_core_web_sm/en_core_web_sm-2.0.0')

        # Read JSON data into the datastore variable - this comes from Peter
        # and Domonique's effort.
        # Explicit UTF-8 so the result does not depend on the platform's
        # default text encoding.
        with open('../../data/20181212.json', 'r', encoding='utf-8') as f:
            datastore = json.load(f)

        ## Create nlpified object (first 100 entries only, skipping empty texts)
        l_docs = [nlp(value['text']) for key,value in list(datastore.items())[0:100] if value['text']]

        ## Save pickle of nlpified
        with open('NLPifiedDocs-first100.pkl', 'wb') as f:
            pickle.dump(l_docs, f)

    print('# of documents: ', len(l_docs))
    return l_docs
    
    

    
def retrieve_JSON_output(l_docs):
    """Create a JSON-style output of dependency trees, one entry per document.

    This has been replaced with a different function; it is kept for reference.

    Args:
        l_docs: sequence of parsed documents. Each document must expose
            `.sents`; each sentence must expose `.text` and iterate over
            tokens that provide `text`, `tag_`, `dep_` and `head`.

    Returns:
        A list of dicts of the form
        {'id': <document index>, 'sentences': [{'sentence': <text>, 'words': [...]}]},
        where each word dict holds the token text/tag/dependency label and
        the text/tag of its head token.
    """
    dependencyDocuments = []

    for di, doc in enumerate(l_docs):
        # A fresh list per document: sharing a single list across documents
        # would make every document report the sentences of all documents.
        sentences = []

        for sent in doc.sents:
            words = [
                {
                    'tokenText': token.text,
                    'tokenTag': token.tag_,
                    'tokenDep': token.dep_,
                    'tokenHeadText': token.head.text,
                    'tokenHeadTag': token.head.tag_,
                }
                for token in sent
            ]
            sentences.append({'sentence': sent.text, 'words': words})

        dependencyDocuments.append({'id': di, 'sentences': sentences})

    return dependencyDocuments



def create_parse_files(l_docs, writeFile = True, pathToFolders=''):
    """Build dependency, POS and lemma strings for every document and optionally write them to disk.

    Three kinds of output are produced per document:
        * dep   -- dependencies, one "label(head-pos, child-pos)" line per
                   token (ROOT and punctuation tokens are left out);
                   positions are 1-based within the sentence
        * input -- POS tagging, one "text_TAG" line per token
        * morph -- lemmatized words, one lemma per line

    Within a document, sentences are separated by a blank line.

    Args:
        l_docs: iterable of parsed documents exposing `.sents`.
        writeFile: when True, write '<pathToFolders>/dep/NNNN.dep',
            '<pathToFolders>/input/NNNN.input' and
            '<pathToFolders>/morph/NNNN.morph' (NNNN = zero-padded document
            index), creating the folders if needed.
        pathToFolders: base output folder, e.g. the data/processed folder.

    Returns:
        Dict with keys 'depData', 'posData' and 'morData'; each value is a
        list (one item per document) of lists of per-sentence strings.
    """
    # Local import keeps this function self-contained; os is only needed to
    # build platform-independent output paths.
    import os

    d_documentData = {
        'depData' : [],
        'posData' : [],
        'morData' : []
    }

    for di, doc in enumerate(l_docs):

        l_depSentences = [] # for dependencies
        l_posSentences = [] # for POS tagging
        l_morSentences = [] # for morphology/lemmatization

        for sent in doc.sents:

            l_depTokens = []
            l_posTokens = []
            l_morTokens = []

            for token in sent:

                ## For dependency trees: token.i is the position in the whole
                ## document, so shift it to a 1-based position in the sentence
                childTokenPosition = token.i - sent.start + 1
                headTokenPosition = token.head.i - sent.start + 1

                if token.dep_ not in ('ROOT', 'punct'):
                    l_depTokens.append("{0}({1}-{2}, {3}-{4})".format(
                        token.dep_, token.head.text, headTokenPosition,
                        token.text, childTokenPosition))

                ## For POS
                l_posTokens.append("{0}_{1}".format(token.text, token.tag_))

                ## For Morphologies
                l_morTokens.append(token.lemma_)

            l_depSentences.append("\n".join(l_depTokens))
            l_posSentences.append("\n".join(l_posTokens))
            l_morSentences.append("\n".join(l_morTokens))

        d_documentData['depData'].append(l_depSentences)
        d_documentData['posData'].append(l_posSentences)
        d_documentData['morData'].append(l_morSentences)

        if writeFile:
            # The folder name doubles as the file extension.
            outputs = (
                ('dep', l_depSentences),
                ('input', l_posSentences),
                ('morph', l_morSentences),
            )
            for kind, l_sentences in outputs:
                folder = os.path.join(pathToFolders, kind)
                # Create the target folder on first use instead of failing.
                os.makedirs(folder, exist_ok=True)
                filePath = os.path.join(folder, '{0:04d}.{1}'.format(di, kind))
                with open(filePath, "w", encoding='utf8') as text_file:
                    text_file.write('\n\n'.join(l_sentences))

            print('Files written to folder:', pathToFolders)
    return d_documentData

## Load and Parse data

In [5]:
# Load the previously pickled parses, then build the dep/input/morph strings
# and write them under the processed-data folder.
# Forward slashes are used for both paths: they are accepted on Windows as
# well as on POSIX systems, whereas backslash separators are Windows-only.
allDocs = load_data(picklePath='../../data/pickle/NLPifiedDocs-first100.pkl')
documentData = create_parse_files(allDocs, writeFile=True, pathToFolders='../../data/processed')

# of documents:  100
