# Working with BeautifulSoup

In [30]:
##https://www.dataquest.io/blog/web-scraping-tutorial-python/

In [17]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

In [18]:
page=urlopen("http://dataquestio.github.io/web-scraping-pages/simple.html")

In [23]:
soup = BeautifulSoup(page, 'html.parser')

In [27]:
print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  <title>
   A simple example page
  </title>
 </head>
 <body>
  <p>
   Here is some simple content for this page.
  </p>
 </body>
</html>


In [26]:
print(soup)

<!DOCTYPE html>

<html>
<head>
<title>A simple example page</title>
</head>
<body>
<p>Here is some simple content for this page.</p>
</body>
</html>


In [28]:
list(soup.children)

['html', '\n', <html>
 <head>
 <title>A simple example page</title>
 </head>
 <body>
 <p>Here is some simple content for this page.</p>
 </body>
 </html>]

In [29]:
[type(item) for item in list(soup.children)] 

[bs4.element.Doctype, bs4.element.NavigableString, bs4.element.Tag]

In [31]:
html = list(soup.children)[2]

In [32]:
list(html.children)

['\n', <head>
 <title>A simple example page</title>
 </head>, '\n', <body>
 <p>Here is some simple content for this page.</p>
 </body>, '\n']

In [33]:
body = list(html.children)[3]

In [34]:
list(body.children)

['\n', <p>Here is some simple content for this page.</p>, '\n']

In [41]:
p = list(body.children)[1]

In [42]:
print(p)

<p>Here is some simple content for this page.</p>


In [43]:
p.get_text()

'Here is some simple content for this page.'

In [45]:
p_new=soup.find_all('p')

In [48]:
print(p.get_text())

Here is some simple content for this page.


In [49]:
page2=urlopen("http://dataquestio.github.io/web-scraping-pages/ids_and_classes.html")

In [50]:
soup2 = BeautifulSoup(page2, 'html.parser')

In [51]:
print(soup2.prettify())

<html>
 <head>
  <title>
   A simple example page
  </title>
 </head>
 <body>
  <div>
   <p class="inner-text first-item" id="first">
    First paragraph.
   </p>
   <p class="inner-text">
    Second paragraph.
   </p>
  </div>
  <p class="outer-text first-item" id="second">
   <b>
    First outer paragraph.
   </b>
  </p>
  <p class="outer-text">
   <b>
    Second outer paragraph.
   </b>
  </p>
 </body>
</html>


In [58]:
print(soup2.find_all('p'))

[<p class="inner-text first-item" id="first">
                First paragraph.
            </p>, <p class="inner-text">
                Second paragraph.
            </p>, <p class="outer-text first-item" id="second">
<b>
                First outer paragraph.
            </b>
</p>, <p class="outer-text">
<b>
                Second outer paragraph.
            </b>
</p>]


In [62]:
soup2.find_all('p')[3].get_text()

'\n\n                Second outer paragraph.\n            \n'

In [65]:
soup2.find_all('p', class_='outer-text')

[<p class="outer-text first-item" id="second">
 <b>
                 First outer paragraph.
             </b>
 </p>, <p class="outer-text">
 <b>
                 Second outer paragraph.
             </b>
 </p>]

In [66]:
page3=urlopen("http://forecast.weather.gov/MapClick.php?lat=37.7772&lon=-122.4168")

In [68]:
soup3 = BeautifulSoup(page3, 'html.parser')

In [84]:
temp1=soup3.find_all('div',class_="tombstone-container")

In [82]:
seven_day = soup.find(id="seven-day-forecast")

In [89]:
temp_main=temp1[0]

In [90]:
print(temp_main.prettify())

<div class="tombstone-container">
 <p class="period-name">
  This
  <br>
   Afternoon
  </br>
 </p>
 <p>
  <img alt="This Afternoon: Sunny, with a high near 58. Northeast wind around 8 mph. " class="forecast-icon" src="newimages/medium/few.png" title="This Afternoon: Sunny, with a high near 58. Northeast wind around 8 mph. "/>
 </p>
 <p class="short-desc">
  Sunny
 </p>
 <p class="temp temp-high">
  High: 58 °F
 </p>
</div>


In [103]:
period = temp_main.find(class_="period-name").get_text()
short_desc = temp_main.find(class_="short-desc").get_text()
temp = temp_main.find(class_="temp").get_text()
img=temp_main.find(class_="forecast-icon")
desc = img['title']

In [104]:
print(period)
print("Clouds are",short_desc)
print(temp)
print(desc)


ThisAfternoon
Clouds are Sunny
High: 58 °F
This Afternoon: Sunny, with a high near 58. Northeast wind around 8 mph. 


# Print thsi in text file

In [112]:
raw_data= period +' '+ short_desc +' '+ temp +' '+ desc
raw_data

'ThisAfternoon Sunny High: 58 °F This Afternoon: Sunny, with a high near 58. Northeast wind around 8 mph. '

In [105]:
import csv  
from datetime import datetime  

In [113]:
with open('Current temp.txt', 'a') as csv_file:  
    writer = csv.writer(csv_file)
    writer.writerow([raw_data, datetime.now()])

In [118]:
seven_day = soup3.find(id="seven-day-forecast")

In [120]:
period_tags = seven_day.select(".tombstone-container .period-name")

In [121]:
periods = [pt.get_text() for pt in period_tags]
periods

['ThisAfternoon',
 'Tonight',
 'Friday',
 'FridayNight',
 'Saturday',
 'SaturdayNight',
 "NewYear'sDay",
 'SundayNight',
 'Monday']

In [122]:
short_descs = [sd.get_text() for sd in seven_day.select(".tombstone-container .short-desc")]
temps = [t.get_text() for t in seven_day.select(".tombstone-container .temp")]
descs = [d["title"] for d in seven_day.select(".tombstone-container img")]

print(short_descs)
print(temps)
print(descs)

['Sunny', 'Mostly Clear', 'Mostly Sunny', 'Partly Cloudy', 'Slight ChanceShowers', 'Partly Cloudy', 'Slight ChanceShowers', 'ChanceShowers', 'ChanceShowers']
['High: 58 °F', 'Low: 45 °F', 'High: 55 °F', 'Low: 45 °F', 'High: 53 °F', 'Low: 44 °F', 'High: 51 °F', 'Low: 43 °F', 'High: 49 °F']
['This Afternoon: Sunny, with a high near 58. Northeast wind around 8 mph. ', 'Tonight: Mostly clear, with a low around 45. North wind 3 to 6 mph. ', 'Friday: Mostly sunny, with a high near 55. Light and variable wind becoming west southwest 5 to 7 mph in the morning. ', 'Friday Night: Partly cloudy, with a low around 45. West wind around 14 mph, with gusts as high as 18 mph. ', 'Saturday: A 20 percent chance of showers.  Mostly sunny, with a high near 53. West wind around 14 mph becoming north in the morning. Winds could gust as high as 18 mph.  New precipitation amounts of less than a tenth of an inch possible. ', 'Saturday Night: Partly cloudy, with a low around 44.', "New Year's Day: A 20 percent 

In [123]:
import pandas as pd
weather = pd.DataFrame({
        "period": periods, 
        "short_desc": short_descs, 
        "temp": temps, 
        "desc":descs
    })
weather

Unnamed: 0,desc,period,short_desc,temp
0,"This Afternoon: Sunny, with a high near 58. No...",ThisAfternoon,Sunny,High: 58 °F
1,"Tonight: Mostly clear, with a low around 45. N...",Tonight,Mostly Clear,Low: 45 °F
2,"Friday: Mostly sunny, with a high near 55. Lig...",Friday,Mostly Sunny,High: 55 °F
3,"Friday Night: Partly cloudy, with a low around...",FridayNight,Partly Cloudy,Low: 45 °F
4,Saturday: A 20 percent chance of showers. Mos...,Saturday,Slight ChanceShowers,High: 53 °F
5,"Saturday Night: Partly cloudy, with a low arou...",SaturdayNight,Partly Cloudy,Low: 44 °F
6,New Year's Day: A 20 percent chance of showers...,NewYear'sDay,Slight ChanceShowers,High: 51 °F
7,Sunday Night: A chance of showers. Mostly clo...,SundayNight,ChanceShowers,Low: 43 °F
8,"Monday: A chance of showers. Partly sunny, wi...",Monday,ChanceShowers,High: 49 °F


In [128]:
weather.to_csv('weather.csv')

In [129]:
github=urlopen("https://github.com/j-delaney/easy-application")

In [130]:
github2 = BeautifulSoup(github, 'html.parser')

In [140]:
raw_companies=github2.find_all('table')[1]

In [141]:
#Generate lists
A=[]
B=[]

In [152]:
companies=raw_companies.find_all('a')

In [210]:
locations=raw_companies.find_all("td")


In [212]:
(locations)

[<td><a href="https://jobs.lever.co/21">21</a></td>,
 <td>San Francisco, CA</td>,
 <td><a href="https://www.23andme.com/careers/">23andMe</a></td>,
 <td>Mountain View, CA</td>,
 <td><a href="https://6sense.com/about-us/careers-and-culture/">6sense</a></td>,
 <td>San Francisco, CA</td>,
 <td><a href="https://www.a9.com/careers/">A9</a></td>,
 <td>Palo Alto, CA</td>,
 <td><a href="https://www.academia.edu/hiring">Academia</a></td>,
 <td>San Francisco, CA</td>,
 <td><a href="http://www.achievers.com/careers/jobs">Achievers</a></td>,
 <td>San Francisco, CA</td>,
 <td><a href="https://www.addepar.com/careers/">Addepar</a></td>,
 <td>Mountain View, CA</td>,
 <td><a href="https://www.aerofs.com/company/careers/">AeroFS</a></td>,
 <td>San Francisco, CA</td>,
 <td><a href="https://www.affirm.com/careers/">Affirm</a></td>,
 <td>San Francisco, CA</td>,
 <td><a href="https://www.airbnb.com/careers">Airbnb</a></td>,
 <td>San Francisco, CA</td>,
 <td><a href="https://airtable.com/jobs">Airtable</a><

In [187]:
print (x for x in range(0,101,2))

<generator object <genexpr> at 0x07B6B300>


In [189]:
print(*range(1,101,2))

1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31 33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63 65 67 69 71 73 75 77 79 81 83 85 87 89 91 93 95 97 99
