**Goal:** Extract information from the web and store it in a desired format.


In [64]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
# Download the article. The timeout stops a stalled connection from hanging the
# kernel indefinitely, and raise_for_status() turns an HTTP error response
# (4xx/5xx) into an exception instead of letting an error page be parsed below.
r = requests.get('https://www.nytimes.com/interactive/2017/06/23/opinion/trumps-lies.html', timeout=30)
r.raise_for_status()

In [6]:
# Peek at the first 500 characters of the response text as a quick sanity check
r.text[:500]

u'<!DOCTYPE html>\n<!--[if (gt IE 9)|!(IE)]> <!--><html lang="en" class="no-js page-interactive section-opinion page-theme-standard tone-opinion page-interactive-default limit-small layout-xlarge app-interactive" itemid="https://www.nytimes.com/interactive/2017/06/23/opinion/trumps-lies.html" itemtype="http://schema.org/NewsArticle" itemscope xmlns:og="http://opengraphprotocol.org/schema/"><!--<![endif]-->\n<!--[if IE 9]> <html lang="en" class="no-js ie9 lt-ie10 page-interactive section-opinion page'

In [10]:
# Parse the downloaded HTML into a navigable tree using the 'html.parser' backend
soup = BeautifulSoup(r.text, features='html.parser')

## Take advantage of the pattern in the article's markup

<span class="short-desc"><strong>Jan. 21&nbsp;</strong>“I wasn't a fan of Iraq. I didn't want to go into Iraq.” <span class="short-truth"><a href="https://www.buzzfeed.com/andrewkaczynski/in-2002-donald-trump-said-he-supported-invading-iraq-on-the" target="_blank">(He was for an invasion before he was against it.)</a></span></span>&nbsp;&nbsp;

In [47]:
# Collect every <span> carrying the "short-desc" class; each one holds one entry
results = soup.find_all('span', class_='short-desc')
results[0]

<span class="short-desc"><strong>Jan. 21\xa0</strong>\u201cI wasn't a fan of Iraq. I didn't want to go into Iraq.\u201d <span class="short-truth"><a href="https://www.buzzfeed.com/andrewkaczynski/in-2002-donald-trump-said-he-supported-invading-iraq-on-the" target="_blank">(He was for an invasion before he was against it.)</a></span></span>

In [33]:
# Break a single result into its components, starting with the date,
# which is the text of the <strong> tag
first_result = results[0]
date = first_result.find('strong').get_text()
date

u'Jan. 21\xa0'

In [85]:
#remove the trailing '\xa0' (last character) and append the year, using the
#same ', 2017' suffix (comma + space) as the loop that builds the full records list
date = first_result.find('strong').text[:-1]+', 2017'
date

u'Jan. 21, 2017'

In [44]:
#extract a lie: the quoted statement is the second child node of the span
#(index 1), between the <strong> date tag and the nested short-truth span
first_result.contents[1]

u"\u201cI wasn't a fan of Iraq. I didn't want to go into Iraq.\u201d "

In [55]:
# Slice off the opening curly quote (first character) and the closing curly
# quote plus the trailing space (last two characters)
first_result.contents[1][1:-2]

u"I wasn't a fan of Iraq. I didn't want to go into Iraq."

In [58]:
# Explanation: text of the third child node (the nested short-truth span),
# with the surrounding parentheses sliced off
first_result.contents[2].get_text()[1:-1]

u'He was for an invasion before he was against it.'

In [59]:
# Link: the href attribute of the first <a> tag inside the result
first_result.a['href']

u'https://www.buzzfeed.com/andrewkaczynski/in-2002-donald-trump-said-he-supported-invading-iraq-on-the'

In [103]:
# Apply the single-result extraction steps to every matched span, collecting
# one (date, lie, explanation, link) tuple per entry.
records = []
for result in results:
    strong_tag = result.find('strong')
    quote_node = result.contents[1]
    truth_span = result.contents[2]
    anchor = result.find('a')
    records.append((
        strong_tag.text[:-1] + ', 2017',  # drop the trailing '\xa0', add the year
        quote_node[1:-2],                  # drop opening quote and closing quote + space
        truth_span.text[1:-1],             # drop the surrounding parentheses
        anchor['href'],
    ))
records[0]

(u'Jan. 21, 2017',
 u"I wasn't a fan of Iraq. I didn't want to go into Iraq.",
 u'He was for an invasion before he was against it.',
 u'https://www.buzzfeed.com/andrewkaczynski/in-2002-donald-trump-said-he-supported-invading-iraq-on-the')

In [104]:
# Build the table in one shot from the list of tuples, naming the four fields
df = pd.DataFrame(
    data=records,
    columns=['Date', 'Lie', 'Explanation', 'URL'],
)
df.head()

Unnamed: 0,Date,Lie,Explanation,URL
0,"Jan. 21, 2017",I wasn't a fan of Iraq. I didn't want to go in...,He was for an invasion before he was against it.,https://www.buzzfeed.com/andrewkaczynski/in-20...
1,"Jan. 21, 2017",A reporter for Time magazine — and I have been...,Trump was on the cover 11 times and Nixon appe...,http://nation.time.com/2013/11/06/10-things-yo...
2,"Jan. 23, 2017",Between 3 million and 5 million illegal votes ...,There's no evidence of illegal voting.,https://www.nytimes.com/2017/01/23/us/politics...
3,"Jan. 25, 2017","Now, the audience was the biggest ever. But th...",Official aerial photos show Obama's 2009 inaug...,https://www.nytimes.com/2017/01/21/us/politics...
4,"Jan. 25, 2017",Take a look at the Pew reports (which show vot...,The report never mentioned voter fraud.,https://www.nytimes.com/2017/01/24/us/politics...


In [105]:
# Convert the Date column from strings to datetime values
df['Date'] = df['Date'].pipe(pd.to_datetime)
df.head(n=3)

Unnamed: 0,Date,Lie,Explanation,URL
0,2017-01-21,I wasn't a fan of Iraq. I didn't want to go in...,He was for an invasion before he was against it.,https://www.buzzfeed.com/andrewkaczynski/in-20...
1,2017-01-21,A reporter for Time magazine — and I have been...,Trump was on the cover 11 times and Nixon appe...,http://nation.time.com/2013/11/06/10-things-yo...
2,2017-01-23,Between 3 million and 5 million illegal votes ...,There's no evidence of illegal voting.,https://www.nytimes.com/2017/01/23/us/politics...


In [106]:
# Persist the table as UTF-8 encoded CSV; index=False leaves the row index out of the file
df.to_csv(
    'Trump_lies.csv',
    encoding='utf-8',
    index=False,
)