# Reading the web page into Python

#### Using the `requests` library, read the HTML of the article into Python

In [48]:
import requests

In [49]:
# Fetch the article. The timeout keeps the notebook from hanging forever on a
# stalled connection, and raise_for_status() surfaces HTTP errors (4xx/5xx)
# here instead of letting an error page be parsed as if it were the article.
r = requests.get(
    "https://www.nytimes.com/interactive/2017/06/23/opinion/trumps-lies.html",
    timeout=30,
)
r.raise_for_status()

In [50]:
# Preview the first 500 characters of the response body
print(r.text[:500])

<!DOCTYPE html>
<!--[if (gt IE 9)|!(IE)]> <!--><html lang="en" class="no-js page-interactive section-opinion page-theme-standard tone-opinion page-interactive-default limit-small layout-xlarge app-interactive" itemid="https://www.nytimes.com/interactive/2017/06/23/opinion/trumps-lies.html" itemtype="http://schema.org/NewsArticle" itemscope xmlns:og="http://opengraphprotocol.org/schema/"><!--<![endif]-->
<!--[if IE 9]> <html lang="en" class="no-js ie9 lt-ie10 page-interactive section-opinion page


In [14]:
#r1 = requests.get("http://www.nature.nps.gov/air/WebCams/parks/grsmcam/grsmcam.cfm")

In [15]:
#print(r1.text)

In [17]:
# The current humidity level at Great Smoky Mountains National Park
from lxml import html
import requests
url = "http://www.nature.nps.gov/air/WebCams/parks/grsmcam/grsmcam.cfm"
# The timeout stops this request from blocking the notebook indefinitely if
# the host does not respond; `doc` is only used by the commented-out line below.
doc = html.fromstring(requests.get(url, timeout=30).text)
#print(doc.cssselect('#CollapsiblePanel6 div div div')[3].text_content())

### Parsing the HTML using the Beautiful Soup library

In [47]:
from bs4 import BeautifulSoup

In [21]:
# Build the parse tree from the downloaded page using the 'html.parser' backend
soup = BeautifulSoup(markup=r.text, features='html.parser')

#### Find all the records

In [22]:
# Each record on the page is a <span class="short-desc"> element
results = soup.find_all('span', class_='short-desc')

#### First 3 results

In [23]:
results[:3]

[<span class="short-desc"><strong>Jan. 21 </strong>“I wasn't a fan of Iraq. I didn't want to go into Iraq.” <span class="short-truth"><a href="https://www.buzzfeed.com/andrewkaczynski/in-2002-donald-trump-said-he-supported-invading-iraq-on-the" target="_blank">(He was for an invasion before he was against it.)</a></span></span>,
 <span class="short-desc"><strong>Jan. 21 </strong>“A reporter for Time magazine — and I have been on their cover 14 or 15 times. I think we have the all-time record in the history of Time magazine.” <span class="short-truth"><a href="http://nation.time.com/2013/11/06/10-things-you-didnt-know-about-time/" target="_blank">(Trump was on the cover 11 times and Nixon appeared 55 times.)</a></span></span>,
 <span class="short-desc"><strong>Jan. 23 </strong>“Between 3 million and 5 million illegal votes caused me to lose the popular vote.” <span class="short-truth"><a href="https://www.nytimes.com/2017/01/23/us/politics/donald-trump-congress-democrats.html" target="_

#### Last result

In [30]:
results[-1]

<span class="short-desc"><strong>Nov. 11 </strong>“I'd rather have him  – you know, work with him on the Ukraine than standing and arguing about whether or not  – because that whole thing was set up by the Democrats.” <span class="short-truth"><a href="https://www.nytimes.com/interactive/2017/12/10/us/politics/trump-and-russia.html" target="_blank">(There is no evidence that Democrats "set up" Russian interference in the election.)</a></span></span>

#### Length of the results

In [31]:
len(results)

180

### Extracting the Date

In [56]:
first_result = results[0]

In [57]:
print(first_result)

<span class="short-desc"><strong>Jan. 21 </strong>“I wasn't a fan of Iraq. I didn't want to go into Iraq.” <span class="short-truth"><a href="https://www.buzzfeed.com/andrewkaczynski/in-2002-donald-trump-said-he-supported-invading-iraq-on-the" target="_blank">(He was for an invasion before he was against it.)</a></span></span>


In [58]:
first_result.find('strong')

<strong>Jan. 21 </strong>

In [59]:
first_result.find('strong').text

'Jan. 21\xa0'

In [60]:
# Drop the trailing non-breaking space (\xa0)
first_result.find('strong').text[:-1]

'Jan. 21'

#### Add the year

In [61]:
# Same ',2017' suffix that the dataset-building loop appends (no space before the comma)
first_result.find('strong').text[0:-1] + ',2017'

'Jan. 21,2017'

### Extracting the lie

In [62]:
first_result.contents

[<strong>Jan. 21 </strong>,
 "“I wasn't a fan of Iraq. I didn't want to go into Iraq.” ",
 <span class="short-truth"><a href="https://www.buzzfeed.com/andrewkaczynski/in-2002-donald-trump-said-he-supported-invading-iraq-on-the" target="_blank">(He was for an invasion before he was against it.)</a></span>]

In [63]:
first_result.contents[1]

"“I wasn't a fan of Iraq. I didn't want to go into Iraq.” "

In [67]:
# Slice off the opening curly quote and the closing quote plus the trailing space
first_result.contents[1][1:-2]

"I wasn't a fan of Iraq. I didn't want to go into Iraq."

### Extracting the explanation

In [69]:
first_result.find('a').text

'(He was for an invasion before he was against it.)'

In [74]:
# Slice off the surrounding parentheses
first_result.find('a').text[1:-1]

'He was for an invasion before he was against it.'

### Extracting the URL

In [75]:
first_result

<span class="short-desc"><strong>Jan. 21 </strong>“I wasn't a fan of Iraq. I didn't want to go into Iraq.” <span class="short-truth"><a href="https://www.buzzfeed.com/andrewkaczynski/in-2002-donald-trump-said-he-supported-invading-iraq-on-the" target="_blank">(He was for an invasion before he was against it.)</a></span></span>

In [76]:
first_result.find('a')

<a href="https://www.buzzfeed.com/andrewkaczynski/in-2002-donald-trump-said-he-supported-invading-iraq-on-the" target="_blank">(He was for an invasion before he was against it.)</a>

In [78]:
first_result.find('a')['href']

'https://www.buzzfeed.com/andrewkaczynski/in-2002-donald-trump-said-he-supported-invading-iraq-on-the'

### Building a Dataset

In [83]:
def _parse_record(result):
    """Turn one ``short-desc`` span into a ``(date, lie, explanation, url)`` tuple.

    Expects the layout of the records shown above: a <strong> date, then the
    quoted statement as a bare string, then a link holding the rebuttal.
    """
    # strip() removes the trailing non-breaking space (\xa0) if present, so a
    # date that lacks it does not lose its last character to a fixed slice.
    date = result.find('strong').text.strip() + ',2017'
    # Trim surrounding whitespace first, then drop the opening/closing curly quotes.
    lie = result.contents[1].strip()[1:-1]
    # Look the anchor up once; it carries both the explanation and the source URL.
    link = result.find('a')
    explanation = link.text.strip()[1:-1]  # drop the surrounding parentheses
    return (date, lie, explanation, link['href'])


# Parsing inside a helper keeps per-record temporaries out of the notebook
# namespace (the old loop overwrote the `url` variable set in an earlier cell).
records = [_parse_record(result) for result in results]

In [86]:
records[:3]

[('Jan. 21,2017',
  "I wasn't a fan of Iraq. I didn't want to go into Iraq.",
  'He was for an invasion before he was against it.',
  'https://www.buzzfeed.com/andrewkaczynski/in-2002-donald-trump-said-he-supported-invading-iraq-on-the'),
 ('Jan. 21,2017',
  'A reporter for Time magazine — and I have been on their cover 14 or 15 times. I think we have the all-time record in the history of Time magazine.',
  'Trump was on the cover 11 times and Nixon appeared 55 times.',
  'http://nation.time.com/2013/11/06/10-things-you-didnt-know-about-time/'),
 ('Jan. 23,2017',
  'Between 3 million and 5 million illegal votes caused me to lose the popular vote.',
  "There's no evidence of illegal voting.",
  'https://www.nytimes.com/2017/01/23/us/politics/donald-trump-congress-democrats.html')]

#### Add the records to a pandas DataFrame for a tabular structure

In [87]:
import pandas as pd

In [90]:
# One row per record; the column order follows the (date, lie, explanation, url) tuples
df = pd.DataFrame(
    data=records,
    columns=['Date', 'Lie', 'Explanation', 'Url'],
)

In [97]:
# Peek at the last five rows
df.tail(n=5)

Unnamed: 0,Date,Lie,Explanation,Url
175,"Oct. 25,2017",We have trade deficits with almost everybody.,We have trade surpluses with more than 100 cou...,https://www.bea.gov/newsreleases/international...
176,"Oct. 27,2017","Wacky & totally unhinged Tom Steyer, who has b...",Steyer has financially supported many winning ...,https://www.opensecrets.org/donor-lookup/resul...
177,"Nov. 1,2017","Again, we're the highest-taxed nation, just ab...",We're not.,http://www.politifact.com/truth-o-meter/statem...
178,"Nov. 7,2017",When you look at the city with the strongest g...,"Several other cities, including New York and L...",http://www.politifact.com/truth-o-meter/statem...
179,"Nov. 11,2017","I'd rather have him – you know, work with him...","There is no evidence that Democrats ""set up"" R...",https://www.nytimes.com/interactive/2017/12/10...


In [103]:
#df['Date'] = pd.to_datetime(df['Date'])


#### Exporting the dataset to a CSV file

In [None]:
# Write the table without the integer row index
df.to_csv(path_or_buf='trump_lies.csv', index=False)