# Parsing 

In this project we use Python's `sqlite3` module to query a SQLite database of political party convention speeches. We then use spaCy to parse the speeches and extract the most frequent Subject-Verb-Object (SVO) phrases for each party.

In [1]:
import os
import random
import nltk
import sqlite3
import spacy
import numpy as np
from spacy import displacy

from subject_object_extraction import findSVOs
from collections import Counter
from collections import defaultdict
from string import punctuation
from pprint import pprint
from nltk.corpus import stopwords
from collections import OrderedDict

In [2]:
#English stop-word list from NLTK's stopwords corpus (the corpus must already be available locally)
#NOTE(review): sw is not referenced again in the visible cells - confirm it is still needed
sw = stopwords.words('english')

## Connect to Database

In [3]:
#open the ConventionSpeeches SQLite file and keep a cursor around for running queries
DB_PATH = "ConventionSpeeches.db"
convention_parties_db = sqlite3.connect(DB_PATH)
con_cur = convention_parties_db.cursor()

In [4]:
#pull the speech text and party for every row whose speaker is known
#'Unknown' uses single quotes: in SQL double quotes denote an identifier, and
#SQLite only treats "Unknown" as a string through a legacy fallback that
#applies when no column of that name exists
#fetchall() materializes the rows into a list, so later cells can iterate
#over query_results more than once (a bare cursor is empty after one pass)
query_results = con_cur.execute("""
                SELECT text, party
                FROM conventions
                WHERE speaker != 'Unknown'""").fetchall()

In [5]:
#turn the punctuation string into a set of characters for fast membership tests,
#making sure the apostrophe is part of it
punctuation = set(punctuation) | {"'"}

In [6]:
#create defaultdict, so a party that has no entry yet reads as an empty string
con_data = defaultdict(str)

#collect each party's cleaned rows in a list and join once at the end,
#instead of growing one large string with += on every row
party_chunks = defaultdict(list)

#process text for parties
for text, party in query_results:

    #a NULL text column comes back as None; there is nothing to clean
    if text is None:
        continue

    #drop punctuation characters, then keep only purely alphabetic tokens, lowercased
    no_punct = "".join(ch for ch in text if ch not in punctuation)
    words = [w.lower() for w in no_punct.split() if w.isalpha()]

    #same layout as before: words separated by spaces, one trailing space per row
    party_chunks[party].append(" ".join(words) + " ")

#one long string per party, parties in the order they were first seen
for party, chunks in party_chunks.items():
    con_data[party] = "".join(chunks)

The code above processes each row of the query result, which holds a speech's text and the speaker's party. It strips out punctuation, splits the text on whitespace, keeps only purely alphabetic words, and lowercases them. The cleaned words are then appended to that party's entry in `con_data`, so each party ends up with one long string of space-separated words.

In [7]:
#pull one corpus per party out of con_data, capped at MAX_CHARS characters each
#NOTE(review): this is a plain character slice, not a whitespace split, so the
#last word may be cut in half; 1,000,000 presumably mirrors spaCy's default
#max_length limit - confirm
MAX_CHARS = 1000000

repubs = con_data['Republican'][:MAX_CHARS]
democs = con_data['Democratic'][:MAX_CHARS]

In [8]:
#load the spaCy pipeline named en_core_web_md (the model package must be installed separately)
#the following cells run text through the pipeline via this name
parser = spacy.load('en_core_web_md')

In [9]:
#run the spaCy pipeline over the Republican corpus
rdoc = parser(repubs)

In [10]:
#extract the subject-verb-object triples from the parsed Republican text
#NOTE(review): "reubs_res" looks like a typo for "repubs_res"; the counting cell
#uses the same spelling, so rename both together
reubs_res = findSVOs(rdoc)

In [11]:
#tally how often each SVO triple occurs in the Republican results
#Counter keeps keys in first-seen order; dict() keeps the result a plain dict
repubs_svo_count = dict(Counter(reubs_res))
    

In [12]:
#the 25 most frequent Republican SVO triples, highest count first
#(ties stay in first-seen order)
Counter(repubs_svo_count).most_common(25)


[(('god', 'bless', 'you'), 54),
 (('i', 'tell', 'you'), 52),
 (('you', 'thank', 'you'), 51),
 (('we', 'need', 'president'), 20),
 (('god', 'bless', 'states'), 19),
 (('god', 'bless', 'america'), 16),
 (('me', 'tell', 'you'), 15),
 (('god', 'thank', 'you'), 12),
 (('we', 'did', 'it'), 10),
 (('i', 'talk', 'you'), 10),
 (('email', 'submit', 'stay'), 10),
 (('i', 'thank', 'you'), 9),
 (('we', 'thank', 'you'), 9),
 (('we', 'have', 'president'), 9),
 (('they', 'get', 'it'), 9),
 (('we', 'need', 'leader'), 8),
 (('we', 'cut', 'taxes'), 8),
 (('we', 'need', 'leaders'), 8),
 (('i', 'stand', 'you'), 8),
 (('there', 'are', 'people'), 8),
 (('i', 'had', 'privilege'), 8),
 (('he', 'do', 'it'), 8),
 (('news', 'enable', 'javascript'), 8),
 (('news', 'enable', 'browser'), 8),
 (('we', 'throw', 'him'), 7)]

In [13]:
#run the spaCy pipeline over the Democratic corpus
ddoc = parser(democs)

In [14]:
#extract the subject-verb-object triples from the parsed Democratic text
democs_res = findSVOs(ddoc)

In [15]:
#tally how often each SVO triple occurs in the Democratic results
#Counter keeps keys in first-seen order; dict() keeps the result a plain dict
democs_svo_count = dict(Counter(democs_res))
    

In [16]:
#the 25 most frequent Democratic SVO triples, highest count first
#(ties stay in first-seen order)
Counter(democs_svo_count).most_common(25)

[(('god', 'bless', 'you'), 32),
 (('we', 'need', 'president'), 29),
 (('i', 'tell', 'you'), 29),
 (('you', 'thank', 'you'), 27),
 (('we', 'do', 'it'), 26),
 (('me', 'tell', 'you'), 21),
 (('i', 'seen', 'it'), 17),
 (('we', '!afford', 'years'), 15),
 (('we', 'reelect', 'president'), 14),
 (('we', 'elect', 'president'), 14),
 (('we', 'have', 'president'), 13),
 (('we', '!afford', 'more'), 13),
 (('god', 'bless', 'states'), 11),
 (('we', 'need', 'change'), 11),
 (('we', 'elect', 'barack'), 10),
 (('i', 'stand', 'you'), 10),
 (('we', 'elect', 'obama'), 10),
 (('he', 'cut', 'taxes'), 9),
 (('god', 'bless', 'america'), 9),
 (('we', 'have', 'choice'), 8),
 (('we', 'keep', 'america'), 8),
 (('president', 'barack', 'obama'), 8),
 (('obama', 'took', 'office'), 8),
 (('we', 'need', 'leader'), 8),
 (('that', 's', 'change'), 8)]

## Visualizing the Entity Recognizer
This next section has a lengthy output, but it shows the power of spaCy. 

In [25]:
#take the first 9000 characters of the Republican corpus as the sample to visualize
#NOTE(review): 9000 is presumably chosen to keep the rendered output readable - confirm
repubs_v = repubs[:9000]

In [26]:
#annotate the Republican sample with the pipeline loaded earlier as parser
#(no nlp name is defined in the visible cells, so calling it raises NameError on a fresh kernel)
doc = parser(repubs_v)
#NOTE: displacy.serve starts a local web server and keeps this cell busy until
#it is interrupted; displacy.render is the in-notebook alternative
displacy.serve(doc, style="ent")


Using the 'ent' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [27]:
#take the first 9000 characters of the Democratic corpus as the sample to visualize
democs_v = democs[:9000]
#annotate the sample with the pipeline loaded earlier as parser
#(no nlp name is defined in the visible cells, so calling it raises NameError on a fresh kernel)
doc = parser(democs_v)
#NOTE: displacy.serve starts a local web server and keeps this cell busy until
#it is interrupted; displacy.render is the in-notebook alternative
displacy.serve(doc, style="ent")


Using the 'ent' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.
