In [2]:
from pymongo import MongoClient
client = MongoClient('localhost', 27017)
db = client['test_database']
tab = db['test_table']

In [3]:
from bs4 import BeautifulSoup
from collections import Counter
import warnings
import pprint
import copy
import pandas as pd
import numpy as np
import requests

from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize

from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

from os import path
import json
import re
import unicodedata
import string

In [4]:
vectorizer = TfidfVectorizer(stop_words='english', max_features = 5000)
ss = SnowballStemmer(language='english')
def preprocess(arr):
    '''
    removes puncuation from reviews, puts in lower case
    '''
    REPLACE_NO_SPACE = re.compile("[.;:!\'?,\"()\[\]]")
    REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")
    step1 = [REPLACE_NO_SPACE.sub("", line.lower()) for line in arr]
    step2 = [REPLACE_WITH_SPACE.sub(" ", line) for line in step1]
    return step2

def stemmer(arr):
    '''
    takes a corpus in an array and returns a simillar arr of stemmed words
    '''
    output = list()
    for text in arr:
        current = ""
        for word in text.split():
            current += ss.stem(word) + " "
        output.append(current)
    return output

def pipeline(arr):
    step1 = preprocess(arr)
    step2 = stemmer(step1)
    step3 = vectorizer.fit_transform(np.array(step2))
    return step3

# Democratic Primary 1

In [5]:
dem1_url = 'https://www.presidency.ucsb.edu/documents/democratic-candidates-debate-las-vegas-nevada'

r_d1 = requests.get(dem1_url)
db_d1 = client.dem1
pages_d1 = db_d1.pages
pages_d1.insert_one({'html': r_d1.content})
soup_d1 = BeautifulSoup(r_d1.content, "html")

div_d1 = soup_d1.find("div", {"class": "wrapper"})
row_d1 = div_d1.find("div", {"class": "row"})
region_d1 = row_d1.find("div", {"class": "region region-content"})
docs_d1 = region_d1.find("div", {"class": "field-docs-content"})

rows = docs_d1.find_all('p')[2:]
all_rows = []
empty_row = {
    "Speaker":None, "Line":None
}
for i, row in enumerate(rows):
    new_row = copy.copy(empty_row)
    if rows[i].find('strong') != None:
        a = rows[i].find('strong').text
        new_row['Speaker'] = a
    elif rows[i].find('strong') == None:
        new_row['Speaker'] = a
    new_row['Line'] = rows[i].text
    all_rows.append(new_row)
    
for i in all_rows:
    if i['Line'][:8] == 'COOPER: ':
        i['Line'] = i['Line'][8:]
    if i['Line'][:8] == 'CHAFEE: ':
        i['Line'] = i['Line'][8:]
    if i['Line'][:6] == 'WEBB: ':
        i['Line'] = i['Line'][6:]
    if i['Line'][:10] == "O'MALLEY: ":
        i['Line'] = i['Line'][10:]
    if i['Line'][:9] == 'SANDERS: ' or i['Line'][:9] == 'SANDERS:-':
        i['Line'] = i['Line'][9:]
    if i['Line'][:9] == 'CLINTON: ':
        i['Line'] = i['Line'][9:]
        
for row in all_rows:
    pages_d1.insert_one(row)
    
rows = pages_d1.find()
dem1 = pd.DataFrame(list(rows))

dem1 = dem1[1:]
dem1 = dem1.drop(['_id', 'html'], axis=1)
dem1 = dem1.dropna()
speakers = [i[:-1].lower() for i in dem1['Speaker']]
dem1['Speaker'] = speakers
dem1 = dem1[-734:]
dem1 = dem1[dem1['Line'] != 'Advertisement']
dem1.index = range(len(dem1))
dem1

Unnamed: 0,Speaker,Line
0,cooper,I'm Anderson Cooper. Thanks for joining us. We...
1,cooper,"As the moderator, I'll ask questions, followup..."
2,cooper,Our viewers should know that we have lights th...
3,cooper,I want the candidates to be able to introduce ...
4,cooper,Let's begin with Governor Chafee.
...,...,...
729,clinton,"Now, I revere my late mother, and she gave me ..."
730,clinton,America's been knocked down. That Great Recess...
731,clinton,My mission as president will be to raise incom...
732,clinton,Please join me in this campaign. Please come a...


# Democratic Primary 2

In [6]:
dem2_url = 'https://www.presidency.ucsb.edu/documents/democratic-candidates-debate-des-moines-iowa'

r_d2 = requests.get(dem2_url)
db_d2 = client.deme2
pages_d2 = db_d2.pages
pages_d2.insert_one({'html': r_d2.content})
soup_d2 = BeautifulSoup(r_d2.content, "html")

div_d2 = soup_d2.find("div", {"class": "wrapper"})
row_d2 = div_d2.find("div", {"class": "row"})
region_d2 = row_d2.find("div", {"class": "region region-content"})
docs_d2 = region_d2.find("div", {"class": "field-docs-content"})

rows = docs_d2.find_all('p')[2:]
all_rows = []
empty_row = {
    "Speaker":None, "Line":None
}
for i, row in enumerate(rows):
    new_row = copy.copy(empty_row)
    if rows[i].find('strong') != None:
        a = rows[i].find('strong').text
        new_row['Speaker'] = a
    elif rows[i].find('strong') == None:
        new_row['Speaker'] = a
    new_row['Line'] = rows[i].text
    all_rows.append(new_row)
    
for i in all_rows:
    if i['Line'][:11] == 'DICKERSON: ':
        i['Line'] = i['Line'][11:]
    if i['Line'][:8] == 'CORDES: ':
        i['Line'] = i['Line'][8:]
    if i['Line'][:8] == 'COONEY: ':
        i['Line'] = i['Line'][8:]
    if i['Line'][:10] == "O'MALLEY: ":
        i['Line'] = i['Line'][10:]
    if i['Line'][:9] == 'SANDERS:-' or i['Line'][:9] == 'SANDERS:-':
        i['Line'] = i['Line'][9:]
    if i['Line'][:9] == 'CLINTON: ':
        i['Line'] = i['Line'][9:]
    if i['Line'][:12] == 'OBRADOVICH: ':
        i['Line'] = i['Line'][12:]
        
for row in all_rows:
    pages_d2.insert_one(row)
    
rows = pages_d2.find()
dem2 = pd.DataFrame(list(rows))
dem2 = dem2.drop(['_id', 'html'], axis=1)
dem2 = dem2.dropna()
speakers = [i[:-1].lower() for i in dem2['Speaker']]
dem2['Speaker'] = speakers

In [7]:
dem2 = dem2[dem2['Line'] != '[commercial break]']
dem2.index = range(len(dem2))
dem2

Unnamed: 0,Speaker,Line
0,dickerson,Before we start the debate here are the rules....
1,dickerson,And when time's up the light turns red. That m...
2,sanders,"SANDERS: Well, John, let me concur with you an..."
3,sanders,"Together, leading the world, this country will..."
4,sanders,"I'm running for president, because as I go aro..."
...,...,...
450,clinton,And that's what I will do as your president. I...
451,dickerson,Senator Sanders?
452,sanders,"SANDERS: John -- John, this country today has ..."
453,sanders,That's not the America that I think we should ...


# Democratic Primary 3

In [8]:
dem3_url = 'https://www.presidency.ucsb.edu/documents/democratic-candidates-debate-manchester-new-hampshire'

r_d3 = requests.get(dem3_url)
db_d3 = client.dem3
pages_d3 = db_d3.pages
pages_d3.insert_one({'html': r_d3.content})
soup_d3 = BeautifulSoup(r_d3.content, "html")

div_d3 = soup_d3.find("div", {"class": "wrapper"})
row_d3 = div_d3.find("div", {"class": "row"})
region_d3 = row_d3.find("div", {"class": "region region-content"})
docs_d3 = region_d3.find("div", {"class": "field-docs-content"})

rows = docs_d3.find_all('p')[2:]
all_rows = []
empty_row = {
    "Speaker":None, "Line":None
}
for i, row in enumerate(rows):
    new_row = copy.copy(empty_row)
    if rows[i].find('strong') != None:
        a = rows[i].find('strong').text
        new_row['Speaker'] = a
    elif rows[i].find('strong') == None:
        new_row['Speaker'] = a
    new_row['Line'] = rows[i].text
    all_rows.append(new_row)
    
for i in all_rows:
    if i['Line'][:6] == 'MUIR: ':
        i['Line'] = i['Line'][6:]
    if i['Line'][:9] == 'RADDATZ: ':
        i['Line'] = i['Line'][9:]
    if i['Line'][:10] == "O'MALLEY: ":
        i['Line'] = i['Line'][10:]
    if i['Line'][:9] == 'SANDERS: ':
        i['Line'] = i['Line'][9:]
    if i['Line'][:9] == 'CLINTON: ':
        i['Line'] = i['Line'][9:]
        
for row in all_rows:
    pages_d3.insert_one(row)
    
rows = pages_d3.find()
dem3 = pd.DataFrame(list(rows))
dem3 = dem3.drop(['_id', 'html'], axis=1)
dem3 = dem3.dropna()
speakers = [i[:-1].lower() for i in dem3['Speaker']]
dem3['Speaker'] = speakers

In [9]:
dem3.index = range(len(dem3))
dem3

Unnamed: 0,Speaker,Line
0,raddatz,Good evening to you all. The rules for tonight...
1,muir,We will be tackling many critical issues right...
2,clinton,"Well, thank you. And I'm delighted to be here ..."
3,clinton,"You know, the American president has to both k..."
4,clinton,I'm very clear that we have a distinct differe...
...,...,...
575,clinton,"On January 20th, 2017, the next president of t..."
576,clinton,"Social Security, which Republicans call a Ponz..."
577,clinton,"You know, everybody says every election's impo..."
578,clinton,"You know, I became a grandmother 15 months ago..."


# Democratic Primary 4

In [10]:
dem4_url = 'https://www.presidency.ucsb.edu/documents/democratic-candidates-debate-charleston-south-carolina'

r_d4 = requests.get(dem4_url)
db_d4 = client.dem4
pages_d4 = db_d4.pages
pages_d4.insert_one({'html': r_d4.content})
soup_d4 = BeautifulSoup(r_d4.content, "html")

div_d4 = soup_d4.find("div", {"class": "wrapper"})
row_d4 = div_d4.find("div", {"class": "row"})
region_d4 = row_d4.find("div", {"class": "region region-content"})
docs_d4 = region_d4.find("div", {"class": "field-docs-content"})

rows = docs_d4.find_all('p')[2:]
all_rows = []
empty_row = {
    "Speaker":None, "Line":None
}
for i, row in enumerate(rows):
    new_row = copy.copy(empty_row)
    if rows[i].find('strong') != None:
        a = rows[i].find('strong').text
        new_row['Speaker'] = a
    elif rows[i].find('strong') == None:
        new_row['Speaker'] = a
    new_row['Line'] = rows[i].text
    all_rows.append(new_row)
    
for i in all_rows:
    if i['Line'][:6] == 'HOLT: ':
        i['Line'] = i['Line'][6:]
    if i['Line'][:10] == 'MITCHELL: ':
        i['Line'] = i['Line'][10:]
    if i['Line'][:10] == "O'MALLEY: ":
        i['Line'] = i['Line'][10:]
    if i['Line'][:9] == 'SANDERS: ':
        i['Line'] = i['Line'][9:]
    if i['Line'][:9] == 'CLINTON: ':
        i['Line'] = i['Line'][9:]
        
for row in all_rows:
    pages_d4.insert_one(row)
    
rows = pages_d4.find()
dem4 = pd.DataFrame(list(rows))
dem4 = dem4.drop(['_id', 'html'], axis=1)
dem4 = dem4.dropna()
speakers = [i[:-1].lower() for i in dem4['Speaker']]
dem4['Speaker'] = speakers

In [11]:
dem4.index = range(len(dem4))
dem4

Unnamed: 0,Speaker,Line
0,holt,Good evening and welcome to the NBC News Youtu...
1,holt,Tonight will be the final opportunity to see t...
2,holt,"Please welcome Secretary Hillary Clinton, Sena..."
3,holt,"Well, welcome to all of you. Hope you're excit..."
4,holt,This is a critical point in the race. You've b...
...,...,...
533,sanders,"Well, Secretary Clinton was right and what I d..."
534,sanders,"Now, we are a great nation — and we've heard a..."
535,sanders,"We've got to get rid of Super PACs, we've got ..."
536,holt,All right. Well thank you and thanks to all of...


# Democratic Primary 5

In [12]:
dem5_url = 'https://www.presidency.ucsb.edu/documents/democratic-candidates-debate-durham-new-hampshire'

r_d5 = requests.get(dem5_url)
db_d5 = client.dem5
pages_d5 = db_d5.pages
pages_d5.insert_one({'html': r_d5.content})
soup_d5 = BeautifulSoup(r_d5.content, "html")

div_d5 = soup_d5.find("div", {"class": "wrapper"})
row_d5 = div_d5.find("div", {"class": "row"})
region_d5 = row_d5.find("div", {"class": "region region-content"})
docs_d5 = region_d5.find("div", {"class": "field-docs-content"})

rows = docs_d5.find_all('p')[2:]
all_rows = []
empty_row = {
    "Speaker":None, "Line":None
}
for i, row in enumerate(rows):
    new_row = copy.copy(empty_row)
    if rows[i].find('strong') != None:
        a = rows[i].find('strong').text
        new_row['Speaker'] = a
    elif rows[i].find('strong') == None:
        new_row['Speaker'] = a
    new_row['Line'] = rows[i].text
    all_rows.append(new_row)
    
for i in all_rows:
    if i['Line'][:6] == 'TODD: ':
        i['Line'] = i['Line'][6:]
    if i['Line'][:8] == 'MADDOW: ':
        i['Line'] = i['Line'][8:]
    if i['Line'][:8] == 'SANDERS:':
        i['Line'] = i['Line'][9:]
    if i['Line'][:9] == 'CLINTON: ':
        i['Line'] = i['Line'][9:]
        
for row in all_rows:
    pages_d5.insert_one(row)
    
rows = pages_d5.find()
dem5 = pd.DataFrame(list(rows))
dem5 = dem5.drop(['_id', 'html'], axis=1)
dem5 = dem5.dropna()
speakers = [i[:-1].lower() for i in dem5['Speaker']]
dem5['Speaker'] = speakers

In [13]:
dem5.index = range(len(dem5))
dem5

Unnamed: 0,Speaker,Line
0,todd,"Good evening, and welcome to the MSNBC Democra..."
1,maddow,We are super excited to be here at the Univers...
2,todd,And neither party has seen this yet. These can...
3,maddow,And we do hope that the candidates will take t...
4,maddow,We're not here for talking points. We're to le...
...,...,...
468,todd,Our debate coverage—[inaudible] debate coverag...
469,todd,And we want to thank all of you for being here...
470,maddow,"We also want to thank our host, the University..."
471,todd,We'll see you in a few minutes. Thank you.


# Democratic Primary 6

In [14]:
dem6_url = 'https://www.presidency.ucsb.edu/documents/democratic-candidates-debate-milwaukee-wisconsin'

r_d6 = requests.get(dem6_url)
db_d6 = client.dem6
pages_d6 = db_d6.pages
pages_d6.insert_one({'html': r_d6.content})
soup_d6 = BeautifulSoup(r_d6.content, "html")

div_d6 = soup_d6.find("div", {"class": "wrapper"})
row_d6 = div_d6.find("div", {"class": "row"})
region_d6 = row_d6.find("div", {"class": "region region-content"})
docs_d6 = region_d6.find("div", {"class": "field-docs-content"})

rows = docs_d6.find_all('p')[2:]
all_rows = []
empty_row = {
    "Speaker":None, "Line":None
}
for i, row in enumerate(rows):
    new_row = copy.copy(empty_row)
    if rows[i].find('strong') != None:
        a = rows[i].find('strong').text
        new_row['Speaker'] = a
    elif rows[i].find('strong') == None:
        new_row['Speaker'] = a
    new_row['Line'] = rows[i].text
    all_rows.append(new_row)
    
for i in all_rows:
    if i['Line'][:7] == 'IFILL: ':
        i['Line'] = i['Line'][7:]
    if i['Line'][:10] == 'WOODRUFF: ':
        i['Line'] = i['Line'][10:]
    if i['Line'][:9] == 'SANDERS: ':
        i['Line'] = i['Line'][9:]
    if i['Line'][:9] == 'CLINTON: ':
        i['Line'] = i['Line'][9:]
        
for row in all_rows:
    pages_d6.insert_one(row)
    
rows = pages_d6.find()
dem6 = pd.DataFrame(list(rows))
dem6 = dem6.drop(['_id', 'html'], axis=1)
dem6 = dem6.dropna()
speakers = [i[:-1].lower() for i in dem6['Speaker']]
dem6['Speaker'] = speakers
dem6.index = range(len(dem6))
dem6

Unnamed: 0,Speaker,Line
0,woodruff,"Good evening, and thank you. We are happy to w..."
1,woodruff,We are especially pleased to thank our partner...
2,ifill,We want to also extend our warm thanks to Milw...
3,woodruff,"Welcome, Senator, great to see you. And former..."
4,clinton,Thank you.
...,...,...
150,ifill,Thank you. Thank you Senator Clinton. Thank yo...
151,woodruff,"And we want to thank our audience, our quiet a..."
152,ifill,I'm going to remain here in Milwaukee tomorrow...
153,woodruff,And I'm going to be returning to Washington. I...


# Democratic Primary 7

In [15]:
dem7_url = 'https://www.presidency.ucsb.edu/documents/democratic-candidates-debate-flint-michigan'

r_d7 = requests.get(dem7_url)
db_d7 = client.dem7
pages_d7 = db_d7.pages
pages_d7.insert_one({'html': r_d7.content})
soup_d7 = BeautifulSoup(r_d7.content, "html")

div_d7 = soup_d7.find("div", {"class": "wrapper"})
row_d7 = div_d7.find("div", {"class": "row"})
region_d7 = row_d7.find("div", {"class": "region region-content"})
docs_d7 = region_d7.find("div", {"class": "field-docs-content"})

rows = docs_d7.find_all('p')[2:]
all_rows = []
empty_row = {
    "Speaker":None, "Line":None
}
for i, row in enumerate(rows):
    new_row = copy.copy(empty_row)
    if rows[i].find('strong') != None:
        a = rows[i].find('strong').text
        new_row['Speaker'] = a
    elif rows[i].find('strong') == None:
        new_row['Speaker'] = a
    new_row['Line'] = rows[i].text
    all_rows.append(new_row)
    
for i in all_rows:
    if i['Line'][:7] == 'LEMON: ':
        i['Line'] = i['Line'][7:]
    if i['Line'][:8] == 'COOPER: ':
        i['Line'] = i['Line'][8:]
    if i['Line'][:9] == 'SANDERS: ':
        i['Line'] = i['Line'][9:]
    if i['Line'][:9] == 'CLINTON: ':
        i['Line'] = i['Line'][9:]
        
for row in all_rows:
    pages_d7.insert_one(row)
    
rows = pages_d7.find()
dem7 = pd.DataFrame(list(rows))
dem7 = dem7.drop(['_id', 'html'], axis=1)
dem7 = dem7.dropna()
speakers = [i[:-1].lower() for i in dem7['Speaker']]
dem7['Speaker'] = speakers
dem7.index = range(len(dem7))
dem7

Unnamed: 0,Speaker,Line
0,cooper,And welcome to The Whiting Auditorium on the c...
1,cooper,I'm Anderson Cooper. I want to welcome our vie...
2,cooper,And now we want to welcome the Democratic cand...
3,cooper,Please welcome former Secretary of State Hilla...
4,cooper,"Ladies and gentlemen, the Democratic candidate..."
...,...,...
495,clinton,I don't intend to get into the gutter with who...
496,cooper,I want to thank both the candidates. While we'...
497,cooper,"I want to thank the candidates, the Democratic..."
498,cooper,On Wednesday we'll simulcast the next Democrat...


# Democratic Primary 8

In [16]:
dem8_url = 'https://www.presidency.ucsb.edu/documents/democratic-candidates-debate-miami-florida'

r_d8 = requests.get(dem8_url)
db_d8 = client.dem8
pages_d8 = db_d8.pages
pages_d8.insert_one({'html': r_d8.content})
soup_d8 = BeautifulSoup(r_d8.content, "html")

div_d8 = soup_d8.find("div", {"class": "wrapper"})
row_d8 = div_d8.find("div", {"class": "row"})
region_d8 = row_d8.find("div", {"class": "region region-content"})
docs_d8 = region_d8.find("div", {"class": "field-docs-content"})

rows = docs_d8.find_all('p')[2:]
all_rows = []
empty_row = {
    "Speaker":None, "Line":None
}
for i, row in enumerate(rows):
    new_row = copy.copy(empty_row)
    if rows[i].find('strong') != None:
        a = rows[i].find('strong').text
        new_row['Speaker'] = a
    elif rows[i].find('strong') == None:
        new_row['Speaker'] = a
    new_row['Line'] = rows[i].text
    all_rows.append(new_row)
    
for i in all_rows:
    if i['Line'][:7] == 'RAMOS: ':
        i['Line'] = i['Line'][7:]
    if i['Line'][:9] == 'SALINAS: ':
        i['Line'] = i['Line'][9:]
    if i['Line'][:9] == 'TUMULTY: ':
        i['Line'] = i['Line'][9:]
    if i['Line'][:9] == 'SANDERS: ':
        i['Line'] = i['Line'][9:]
    if i['Line'][:9] == 'CLINTON: ':
        i['Line'] = i['Line'][9:]
    if i['Line'][:30] == 'SALINAS [through translator]: ':
        i['Line'] = i['Line'][30:]
    if i['Line'][:28] == 'RAMOS [through translator]: ':
        i['Line'] = i['Line'][28:]
        
for row in all_rows:
    pages_d8.insert_one(row)
    
rows = pages_d8.find()
dem8 = pd.DataFrame(list(rows))
dem8 = dem8.drop(['_id', 'html'], axis=1)
dem8 = dem8.dropna()
speakers = [i[:-1].lower() for i in dem8['Speaker']]
dem8['Speaker'] = speakers
dem8 = dem8[-471:]
dem8.index = range(len(dem8))
dem8

Unnamed: 0,Speaker,Line
0,salinas [through translator],This will be the first and only debate the can...
1,ramos [through translator],"Here with us tonight is Karen Tumulty, Washing..."
2,salinas,"Welcome, Karen."
3,salinas [through translator],"Now, we're going to welcome the protagonists o..."
4,salinas [through translator],And Senator from Vermont Bernie Sanders. [appl...
...,...,...
466,tumulty,And on behalf of The Washington Post and Univi...
467,ramos [through translator],"We want to use these last moments, it's very i..."
468,ramos [through translator],"So you know it perfectly, who does not vote do..."
469,salinas [through translator],"And as you know, nobody can reach the White Ho..."


# Presidential Debate 1

In [17]:
pres_url = 'https://www.politico.com/story/2016/09/full-transcript-first-2016-presidential-debate-228761'

r_pres = requests.get(pres_url)
db_pres = client.presidente1
pages_pres = db_pres.pages
pages_pres.insert_one({'html': r_pres.content})
soup_pres = BeautifulSoup(r_pres.content, "html")

div = soup_pres.find("div", {"class": "global-wrapper"})
main = div.find("main", {"class": "super-duper"})
article = main.find("article", {"class": "story-main-content"})
text = article.find("div", {"class": "story-text"})

rows = text.find_all('p')[2:-1]
all_rows = []
empty_row = {
    "Speaker":None, "Line":None
}
for i, row in enumerate(rows):
    new_row = copy.copy(empty_row)
    if rows[i].find('b') != None:
        a = rows[i].find('b').text
        new_row['Speaker'] = a
    elif rows[i].find('b') == None:
        new_row['Speaker'] = a
    new_row['Line'] = rows[i].text
    all_rows.append(new_row)
    
for i in all_rows:
    if i['Line'] == '(APPLAUSE)':
        i['Speaker'] = 'AUDIENCE'
        i['Line'] = 'Applause'
    if i['Line'][:5] == 'HOLT:':
        i['Line'] = i['Line'][5:]
    if i['Line'][:8] == 'CLINTON:':
        i['Line'] = i['Line'][8:]
    if i['Line'][:6] == 'TRUMP:':
        i['Line'] = i['Line'][6:]
        
for row in all_rows:
    pages_pres.insert_one(row)
    
rows = pages_pres.find()
pres1 = pd.DataFrame(list(rows))
pres1 = pres1.drop(['_id', 'html'], axis=1)
pres1 = pres1[1:]

speakers = [i.lower() for i in pres1['Speaker']]
pres1['Speaker'] = speakers
pres1.index = range(len(pres1))

In [18]:
pres1 = pres1[pres1['Line'] != 'Advertisement']
pres1

Unnamed: 0,Speaker,Line
0,holt,Good evening from Hofstra University in Hemps...
1,holt,The participants tonight are Donald Trump and ...
3,holt,The 90-minute debate is divided into six segme...
4,holt,The questions are mine and have not been share...
5,holt,"I will invite you to applaud, however, at this..."
...,...,...
440,holt,"Mr. Trump, very quickly, same question: Will ..."
441,trump,I want to make America great again. We are a ...
442,trump,The other day we were deporting 800 people. An...
443,holt,Will you accept the outcome of the election?


# Presidential Debate 2

In [19]:
pres2_url = 'https://www.politico.com/story/2016/10/2016-presidential-debate-transcript-229519'
r_pres2 = requests.get(pres2_url)
db_pres = client.president2
pages_pres = db_pres.pages
pages_pres.insert_one({'html': r_pres2.content})
soup_pres = BeautifulSoup(r_pres2.content, "html")

div = soup_pres.find("div", {"class": "global-wrapper"})
main = div.find("main", {"class": "super-duper"})
article = main.find("article", {"class": "story-main-content"})
text = article.find("div", {"class": "story-text"})

rows = text.find_all('p')[2:-1]
all_rows2 = []
empty_row = {
    "Speaker":None, "Line":None
}
for i, row in enumerate(rows):
    new_row = copy.copy(empty_row)
    if rows[i].find('b') != None:
        a = rows[i].find('b').text
        new_row['Speaker'] = a
    elif rows[i].find('b') == None:
        new_row['Speaker'] = a
    new_row['Line'] = rows[i].text
    all_rows2.append(new_row)
    
for i in all_rows2:
    if i['Line'] == '[Applause]':
        i['Speaker'] = 'audience'
        i['Line'] = 'Applause'
    if i['Line'][:8] == 'Raddatz:':
        i['Line'] = i['Line'][8:]
    if i['Line'][:7] == 'Cooper:':
        i['Line'] = i['Line'][7:]
    if i['Line'][:6] == 'Brock:':
        i['Line'] = i['Line'][6:]
    if i['Line'][:8] == 'Clinton:':
        i['Line'] = i['Line'][8:]
    if i['Line'][:6] == 'Trump:':
        i['Line'] = i['Line'][6:]
        
for row in all_rows2:
    pages_pres.insert_one(row)
    
rows = pages_pres.find()
pres2 = pd.DataFrame(list(rows))
pres2 = pres2.drop(['_id', 'html'], axis=1)
pres2 = pres2[2:]

speakers = [i.lower() for i in pres2['Speaker']]
pres2['Speaker'] = speakers
pres2 = pres2[pres2['Line'] != 'Advertisement']
pres2.index = range(len(pres2))

In [20]:
pres2

Unnamed: 0,Speaker,Line
0,raddatz,Good evening. I'm Martha Raddatz from ABC news.
1,cooper,"And I'm Anderson Cooper from CNN, we want to ..."
2,cooper,Thank you very much for being here. We will b...
3,brock,Thank you and good evening. The last presiden...
4,clinton,"Thank you. Are you a teacher? Yes, I think th..."
...,...,...
201,raddatz,Mr. Trump?
202,trump,"Well, I consider her statement about my child..."
203,raddatz,Thanks to both of you.
204,cooper,I want to thank the university here and this ...


# Presidential Debate 3

In [21]:
pres3_url = 'https://www.politico.com/story/2016/10/full-transcript-third-2016-presidential-debate-230063'
r_pres3 = requests.get(pres3_url)
db_pres = client.presidente3
pages_pres = db_pres.pages
pages_pres.insert_one({'html': r_pres3.content})
soup_pres = BeautifulSoup(r_pres3.content, "html")

div = soup_pres.find("div", {"class": "global-wrapper"})
main = div.find("main", {"class": "super-duper"})
article = main.find("article", {"class": "story-main-content"})
text = article.find("div", {"class": "story-text"})

rows = text.find_all('p')[2:-1]
all_rows = []
empty_row = {
    "Speaker":None, "Line":None
}
for i, row in enumerate(rows):
    new_row = copy.copy(empty_row)
    if rows[i].find('b') != None:
        a = rows[i].find('b').text
        new_row['Speaker'] = a
    elif rows[i].find('b') == None:
        new_row['Speaker'] = a
    new_row['Line'] = rows[i].text
    all_rows.append(new_row)
    
for i in all_rows:
    if i['Line'] == '(Applause)':
        i['Speaker'] = 'AUDIENCE'
        i['Line'] = 'APPLAUSE'
    if i['Line'][:14] == 'Chris Wallace:':
        i['Line'] = i['Line'][14:]
    if i['Line'][:8] == 'Wallace:':
        i['Line'] = i['Line'][8:]
    if i['Line'][:7] == 'Cooper:':
        i['Line'] = i['Line'][7:]
    if i['Line'][:6] == 'Brock:':
        i['Line'] = i['Line'][6:]
    if i['Line'][:8] == 'Clinton:':
        i['Line'] = i['Line'][8:]
    if i['Line'][:6] == 'Trump:':
        i['Line'] = i['Line'][6:]
        
for row in all_rows:
    pages_pres.insert_one(row)
    
rows = pages_pres.find()
pres3 = pd.DataFrame(list(rows))
pres3 = pres3.drop(['_id', 'html'], axis=1)
pres3 = pres3[1:]

speakers = [i.lower() for i in pres3['Speaker']]
pres3['Speaker'] = speakers
pres3 = pres3[pres3['Line'] != 'Advertisement']
pres3.index = range(len(pres3))
pres3['Speaker'][0] = 'wallace'
pres3

Unnamed: 0,Speaker,Line
0,wallace,Good evening from the Thomas and Mack Center ...
1,wallace,"Secretary Clinton, Mr. Trump, welcome. Let's ..."
2,clinton,Thank you very much Chris and thanks to UNLV ...
3,wallace,"Secretary Clinton, thank you. Mr. Trump, same..."
4,trump,"Well, first of all, it’s so great to be with ..."
...,...,...
252,wallace,"This is a final time, probably to both of you..."
253,clinton,Well I would like to say to everyone watching...
254,wallace,"Secretary Clinton, thank you. Mr. Trump?"
255,trump,She's raising the money from the people she w...


# Merging Tables

In [40]:
all_primaries = dem1.append(dem2).append(dem3).append(dem4).append(dem5).append(dem6).append(dem7).append(dem8)
all_pres = pres1.append(pres2).append(pres3)

In [23]:
all_primaries

Unnamed: 0,Speaker,Line
0,cooper,I'm Anderson Cooper. Thanks for joining us. We...
1,cooper,"As the moderator, I'll ask questions, followup..."
2,cooper,Our viewers should know that we have lights th...
3,cooper,I want the candidates to be able to introduce ...
4,cooper,Let's begin with Governor Chafee.
...,...,...
466,tumulty,And on behalf of The Washington Post and Univi...
467,ramos [through translator],"We want to use these last moments, it's very i..."
468,ramos [through translator],"So you know it perfectly, who does not vote do..."
469,salinas [through translator],"And as you know, nobody can reach the White Ho..."


In [24]:
all_pres

Unnamed: 0,Speaker,Line
0,holt,Good evening from Hofstra University in Hemps...
1,holt,The participants tonight are Donald Trump and ...
3,holt,The 90-minute debate is divided into six segme...
4,holt,The questions are mine and have not been share...
5,holt,"I will invite you to applaud, however, at this..."
...,...,...
252,wallace,"This is a final time, probably to both of you..."
253,clinton,Well I would like to say to everyone watching...
254,wallace,"Secretary Clinton, thank you. Mr. Trump?"
255,trump,She's raising the money from the people she w...


# Preparation

In [25]:
all_primaries = all_primaries[(all_primaries['Speaker'] == 'clinton') | (all_primaries['Speaker'] == 'sanders') | (all_primaries['Speaker'] == "o'malley")]
all_primaries.index = range(len(all_primaries))
bools =[]
for i in all_primaries['Speaker']:
    if i == 'clinton':
        bools.append(1)
    else:
        bools.append(0)
all_primaries['Labels'] = bools

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [26]:
all_pres = all_pres[(all_pres['Speaker'] == 'clinton') | (all_pres['Speaker'] == 'trump')]
all_pres.index = range(len(all_pres))
boolean = []
for i in all_pres['Speaker']:
    if i == 'clinton':
        boolean.append(1)
    else:
        boolean.append(0)
all_pres['Labels'] = boolean

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [27]:
all_primaries['tokenized'] = all_primaries['Line'].apply(sent_tokenize)

s = all_primaries.apply(lambda x: pd.Series(x['tokenized']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'tokenized'
all_primaries2 = all_primaries.drop('tokenized', axis=1).join(s)
all_primaries2['tokenized'] = pd.Series(all_primaries2['tokenized'], dtype=object)
all_primaries2 = all_primaries2.drop('Line', axis=1)
all_primaries2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Speaker,Labels,tokenized
0,o'malley,0,"My name is Martin O'Malley, former Mayor of Ba..."
1,o'malley,0,"My wife Katie and I have four great kids, Grac..."
1,o'malley,0,"And, like you, there is nothing we wouldn't do..."
1,o'malley,0,There are some things that I have learned to d...
1,o'malley,0,"And, after 15 years of executive experience, I..."
...,...,...,...
2382,sanders,0,Which is why I believe we've got to overturn C...
2383,sanders,0,"Is it right that in the greatest, wealthiest c..."
2383,sanders,0,"If we stand up, fight back, we can do a lot be..."
2383,sanders,0,That's why I'm running for president.


In [28]:
all_pres['tokenized'] = all_pres['Line'].apply(sent_tokenize)

s = all_pres.apply(lambda x: pd.Series(x['tokenized']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'tokenized'
all_pres2 = all_pres.drop('tokenized', axis=1).join(s)
all_pres2['tokenized'] = pd.Series(all_pres2['tokenized'], dtype=object)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [29]:
all_primaries2.index = range(len(all_primaries2))

In [30]:
all_pres2.index = range(len(all_pres2))

In [31]:
all_primaries2['tokenized'] = preprocess(all_primaries2['tokenized'])

In [32]:
all_pres2['tokenized'] = preprocess(all_pres2['tokenized'])

In [33]:
stop = stopwords.words('english')
stop.append('get')
stop.append('—')
stop.append('thats')
stop.append('dont')
stop.append('us')
stop.append('im')

all_primaries2['tokenized'] = all_primaries2['tokenized'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

In [34]:
all_pres2['tokenized'] = all_pres2['tokenized'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

In [35]:
all_primaries2 = all_primaries2[all_primaries2['tokenized'] != '']
all_pres2 = all_pres2[all_pres2['tokenized'] != '']

In [36]:
all_primaries2.to_csv('Primaries', index=False)
all_pres2.to_csv('Presidential', index=False)

In [38]:
all_pres2

Unnamed: 0,Speaker,Line,Labels,tokenized
0,clinton,"How are you, Donald?",1,donald
1,clinton,"Well, thank you, Lester, and thanks to Hofstr...",1,well thank lester thanks hofstra hosting
2,clinton,The central question in this election is reall...,1,central question election really kind country ...
3,clinton,The central question in this election is reall...,1,today granddaughters second birthday think lot
4,clinton,The central question in this election is reall...,1,first build economy works everyone top
...,...,...,...,...
3030,trump,She's raising the money from the people she w...,0,education
3031,trump,She's raising the money from the people she w...,0,jobs
3032,trump,She's raising the money from the people she w...,0,african americans latinos ten lifetimes
3033,trump,She's raising the money from the people she w...,0,shes done talk african americans latinos vote ...


In [39]:
all_pres

Unnamed: 0,Speaker,Line,Labels,tokenized
0,clinton,"How are you, Donald?",1,"[ How are you, Donald?]"
1,clinton,"Well, thank you, Lester, and thanks to Hofstr...",1,"[ Well, thank you, Lester, and thanks to Hofst..."
2,clinton,The central question in this election is reall...,1,[The central question in this election is real...
3,clinton,I want us to invest in you. I want us to inves...,1,"[I want us to invest in you., I want us to inv..."
4,clinton,I also want to see more companies do profit-s...,1,[ I also want to see more companies do profit-...
...,...,...,...,...
613,clinton,Replenish the trust fund by making sure that ...,1,[ Replenish the trust fund by making sure that...
614,trump,Your husband disagrees with you.,0,[ Your husband disagrees with you.]
615,clinton,We’ve got to go after the long-term health ca...,1,[ We’ve got to go after the long-term health c...
616,clinton,Well I would like to say to everyone watching...,1,[ Well I would like to say to everyone watchin...


In [44]:
all_pres

Unnamed: 0,Speaker,Line
0,holt,Good evening from Hofstra University in Hemps...
1,holt,The participants tonight are Donald Trump and ...
3,holt,The 90-minute debate is divided into six segme...
4,holt,The questions are mine and have not been share...
5,holt,"I will invite you to applaud, however, at this..."
...,...,...
252,wallace,"This is a final time, probably to both of you..."
253,clinton,Well I would like to say to everyone watching...
254,wallace,"Secretary Clinton, thank you. Mr. Trump?"
255,trump,She's raising the money from the people she w...


In [47]:
all_pres.index = range(len(all_pres))

In [59]:
all_pres['Line'][905]

" She's raising the money from the people she wants to control. Doesn't work that way. But when I started this campaign, I started it very strongly. It's called Make America Great Again. We're going to make America great. We have a depleted military. It has to be helped. It has to be fixed. We have the greatest people on Earth in our military. We don't take care of our veterans. We take care of illegal immigrants, people that come into our country illegally better than we take care of our vets. That can’t happen. Our policemen and women are disrespected. We need law and order, but we need justice too. Our inner cities are a disaster. You get shot walking to the store. They have no education. They have no jobs. I will do more for African-Americans and Latinos that she can do for ten lifetimes. All she's done is talk to the African-Americans and to the Latinos, but they get the vote and then they come back, they say ‘we’ll see you in four years.’ We are going to make America strong again