# Reading files

## Working with text files

In [1]:
# 'r' opens a file read-only (the file must already exist)
file_for_reading = open('test_text_file.txt', 'r')

# 'w' opens a file for writing - it will erase the file if it already exists
file_for_writing = open('writing_file.txt', 'w')

# 'a' opens a file for appending to the end of the file
file_for_appending = open('appending_file.txt', 'a')

# don't forget to close every file you opened at the end of the work;
# an unclosed handle leaks the descriptor and may leave writes unflushed
file_for_reading.close()
file_for_writing.close()
file_for_appending.close()

In [2]:
# Just stick some data there: three sample addresses, one per line
with open('email_addresses.txt', 'w') as f:
    f.write("joelgrus@gmail.com\n"
            "joel@m.datasciencester.com\n"
            "joelgrus@m.datasciencester.com\n")

def get_domain(email_address):
    """Return the lower-cased part of `email_address` after its last '@'.

    If the address contains no '@', the whole lower-cased string is returned.
    """
    # rpartition splits on the last '@'; the tail is the piece we want
    _, _, domain = email_address.lower().rpartition("@")
    return domain

In [3]:
get_domain('joelgrus@gmail.com')

'gmail.com'

In [4]:
get_domain('joel@datasciencester.com')

'datasciencester.com'

## Delimited files

In [5]:
def process(date, symbol, closing_price):
    """Placeholder row handler: only checks that the closing price is positive.

    `date` and `symbol` are accepted but not used by this stub.
    Raises AssertionError when `closing_price` is not greater than 0.0.
    """
    # Imagine that this function actually does something.
    assert 0.0 < closing_price

In [6]:
import csv

with open('tab_delimited_stock_prices.txt', 'w') as f:
    f.write("""6/20/2014\tAAPL\t90.91
6/20/2014\tMSFT\t41.68
6/20/2014\tFB\t64.5
6/19/2014\tAAPL\t91.86
6/19/2014\tMSFT\t41.51
6/19/2014\tFB\t64.34
""")

In [7]:
# newline='' leaves newline handling to the csv module, as its documentation
# requires for any file object passed to csv.reader
with open('tab_delimited_stock_prices.txt', newline='') as f:
    reader = csv.reader(f, delimiter='\t')
    for row in reader:
        # each row is a list of strings: date, symbol, closing price
        date = row[0]
        symbol = row[1]
        closing_price = float(row[2])
        process(date, symbol, closing_price)

In [8]:
with open('colon_delimited_stock_prices.txt', 'w') as f:
    f.write("""date:symbol:closing_price
6/20/2014:AAPL:90.91
6/20/2014:MSFT:41.68
6/20/2014:FB:64.5
""")

In [9]:
# newline='' leaves newline handling to the csv module, as its documentation
# requires for any file object passed to a csv reader
with open('colon_delimited_stock_prices.txt', newline='') as f:
    # DictReader uses the first line of the file as the field names
    colon_reader = csv.DictReader(f, delimiter=':')
    for dict_row in colon_reader:
        date = dict_row["date"]
        symbol = dict_row["symbol"]
        closing_price = float(dict_row["closing_price"])
        process(date, symbol, closing_price)

In [10]:
todays_prices = {'AAPL': 90.91, 'MSFT': 41.68, 'FB': 64.5 }

In [11]:
# newline='' is required when writing with the csv module: without it the
# writer's own '\r\n' terminator is translated again on Windows, which
# produces an empty line after every row
with open('comma_delimited_stock_prices.txt', 'w', newline='') as f:
    csv_writer = csv.writer(f, delimiter=',')
    # one row per stock: symbol, price
    for stock, price in todays_prices.items():
        csv_writer.writerow([stock, price])

In [12]:
results = [["test1", "success", "Monday"],
           ["test2", "success, kind of", "Tuesday"],
           ["test3", "failure, kind of", "Wednesday"],
           ["test4", "failure, utter", "Thursday"]]

In [13]:
# don't do this!
with open('bad_csv.txt', 'w') as f:
    for row in results:
        f.write(",".join(map(str, row))) # might have too many commas in it!
        f.write("\n")                    # row might have newlines as well!

# Web-scraping

## HTML code analysis

In [14]:
from bs4 import BeautifulSoup
import requests
# always pass a timeout: without one, requests can wait forever on an
# unresponsive server and hang the kernel
html = requests.get('http://www.example.com', timeout=30).text
soup = BeautifulSoup(html, 'html5lib')

In [15]:
first_paragraph = soup.find('p')             # the first tag <p> or just soup.p
first_paragraph

<p>This domain is for use in illustrative examples in documents. You may use this
    domain in literature without prior coordination or asking for permission.</p>

In [16]:
first_paragraph_text = soup.p.text           # text of the first element <p>
first_paragraph_text

'This domain is for use in illustrative examples in documents. You may use this\n    domain in literature without prior coordination or asking for permission.'

In [17]:
first_paragraph_words = soup.p.text.split()  # words of the first element
first_paragraph_words

['This',
 'domain',
 'is',
 'for',
 'use',
 'in',
 'illustrative',
 'examples',
 'in',
 'documents.',
 'You',
 'may',
 'use',
 'this',
 'domain',
 'in',
 'literature',
 'without',
 'prior',
 'coordination',
 'or',
 'asking',
 'for',
 'permission.']

In [18]:
#first_paragraph_id = soup.p['id']           # raises KeyError if no 'id'

first_paragraph_id2 = soup.p.get('id')       # returns None if no 'id'
first_paragraph_id2

In [19]:
all_paragraphs = soup.find_all('p')          # or just soup('p')
all_paragraphs

[<p>This domain is for use in illustrative examples in documents. You may use this
     domain in literature without prior coordination or asking for permission.</p>,
 <p><a href="https://www.iana.org/domains/example">More information...</a></p>]

In [20]:
paragraphs_with_ids = [p for p in soup('p') if p.get('id')]
paragraphs_with_ids

[]

In [21]:
important_paragraphs = soup('p', {'class' : 'important'})
important_paragraphs

[]

In [22]:
important_paragraphs2 = soup('p', 'important')
important_paragraphs2

[]

In [23]:
important_paragraphs3 = [p for p in soup('p')
                         if 'important' in p.get('class', [])]
important_paragraphs3

[]

In [24]:
# warning, will return the same span multiple times
# if it sits inside multiple divs
# be more clever if that's the case
spans_inside_divs = [span
                     for div in soup('div')     # for each <div> on the page
                     for span in div('span')]   # find each <span> inside it

## Example: O'Reilly books about data analysis

In [25]:
url = 'http://shop.oreilly.com/category/browse-subjects/data.do?sortby=publicationDate&page=1'
soup = BeautifulSoup(requests.get(url).text, 'html5lib')
soup

<!DOCTYPE html>
<html lang="en"><head>
    
    <meta charset="utf-8"/>
    <meta content="width=device-width, initial-scale=1" name="viewport"/>
    <meta content="#B9002D" name="theme-color"/>
        <meta content="Gain technology and business knowledge and hone your skills with learning resources created and curated by O'Reilly's experts: live online training, video, books, conferences, our platform has content from 200+ of the world's best publishers." name="description"/>
        <meta content="O'Reilly Media - Technology and Business Training" name="twitter:title"/>
        <meta content="Gain technology and business knowledge and hone your skills with learning resources created and curated by O'Reilly's experts: live online training, video, books, conferences, our platform has content from 200+ of the world's best publishers." name="twitter:description"/>
        <meta content="@OReillyMedia" name="twitter:site"/>
        <meta content="website" property="og:type"/>
        <me

In [26]:
# As of 2020, the page at the link given in the book has different markup:
# there is no <video> tag anymore, so the code below cannot be run against it.
# I can only port the code to Python 3, without executing it.

In [27]:
tds = soup('td', 'thumbtext')
tds

[]

In [28]:
def is_video(td):
    """Return True if the given <td> tag describes a video rather than a book.

    It's a video if it has exactly one pricelabel, and if the stripped
    text inside that pricelabel starts with 'Video'.
    """
    # calling a tag is shorthand for find_all; this lookup must be live,
    # otherwise `pricelabels` is undefined and every call raises NameError
    pricelabels = td('span', 'pricelabel')
    return (len(pricelabels) == 1 and
            pricelabels[0].text.strip().startswith('Video'))

len([td for td in tds if not is_video(td)])

0

In [29]:
def book_info(td):
    """Extract a book's details from a BeautifulSoup <td> tag.

    Reads the title and product link from the 'thumbheader' div, the
    author list from the 'AuthorName' div and the date from the
    'directorydate' span.

    Returns a dict with keys 'title', 'authors' (list of str), 'isbn'
    and 'date'.
    Raises AttributeError if an expected element is missing or the link
    does not look like '/product/<isbn>.do'.
    """
    # imported here because the notebook has no top-level `import re`
    import re

    header_link = td.find('div', 'thumbheader').a
    title = header_link.text
    by_author = td.find('div', 'AuthorName').text
    # drop the leading 'By ' and split the comma-separated author names
    authors = [x.strip() for x in re.sub(r'^By ', '', by_author).split(',')]
    isbn_link = header_link.get('href')
    # raw string so that '\.' is a regex escape, not an invalid string escape
    isbn = re.match(r'/product/(.*)\.do', isbn_link).groups()[0]
    date = td.find('span', 'directorydate').text.strip()

    return {
        'title' : title,
        'authors' : authors,
        'isbn' : isbn,
        'date' : date
    }

In [30]:
from time import sleep

def scrape(num_pages=30):
    """Fetch listing pages 1..num_pages and collect the info of every book.

    For each page, every <td class="thumbtext"> that is_video does not flag
    is passed to book_info and the resulting dict is collected.

    num_pages: how many listing pages to fetch, starting from page 1.
    Returns the list of dicts produced by book_info.
    """
    # the page number is appended below, so the URL must end with 'page='
    # (ending it with 'page=1' would request pages 11, 12, ..., 130)
    base_url = "http://shop.oreilly.com/category/browse-subjects/data.do?sortby=publicationDate&page="

    books = []

    for page_num in range(1, num_pages + 1):
        print('souping page', page_num)
        url = base_url + str(page_num)
        soup = BeautifulSoup(requests.get(url).text, 'html5lib')

        for td in soup('td', 'thumbtext'):
            if not is_video(td):
                books.append(book_info(td))

        # now be a good citizen and respect the robots.txt!
        sleep(30)

    return books

In [31]:
def get_year(book):
    """Return the publication year of `book` as an int.

    book["date"] looks like 'July 2020', so the year is the second
    whitespace-separated piece of that string.
    """
    date_pieces = book['date'].split()
    return int(date_pieces[1])

In [32]:
def plot_years(plt, books):
    """Draw a bar chart of the number of books per publication year.

    plt: object providing bar, xlabel, ylabel and title
         (presumably the matplotlib.pyplot module - confirm with the caller).
    books: iterable of book dicts accepted by get_year.
    """
    # imported here so the function does not depend on a later cell
    # having already imported Counter
    from collections import Counter

    # 2019 is the last complete year of data
    year_counts = Counter(year for year in map(get_year, books)
                          if year <= 2019)

    years = sorted(year_counts)
    # iterate over `years`; the name `x` was never defined at this point
    book_counts = [year_counts[year] for year in years]
    plt.bar([x - 0.5 for x in years], book_counts)
    plt.xlabel('year')
    plt.ylabel('# of data books')
    plt.title('Data is Big!')

# Using API

## JSON and XML formats

In [44]:
import json

serialized = '''{ "title" : "Data Science Book",
                  "author" : "Joel Grus",
                  "publicationYear" : "2015",
                  "topics" : [ "data", "science", "data science"] }'''

# parse the JSON to create a Python dict
deserialized = json.loads(serialized)
if "data science" in deserialized["topics"]:
    print(deserialized)

{'title': 'Data Science Book', 'author': 'Joel Grus', 'publicationYear': '2015', 'topics': ['data', 'science', 'data science']}


## Using an unauthenticated API

In [45]:
import requests
from collections import Counter

endpoint = 'https://api.github.com/users/MarkVoitov/repos'  # public repositories in my GitHub account

# use a timeout so an unresponsive server cannot hang the kernel, and fail
# right here on an HTTP error instead of parsing an error payload that
# breaks the cells below
response = requests.get(endpoint, timeout=30)
response.raise_for_status()
repos = response.json()

In [46]:
from dateutil.parser import parse

dates = [parse(repo["created_at"]) for repo in repos]
dates

[datetime.datetime(2020, 2, 29, 22, 24, 50, tzinfo=tzutc()),
 datetime.datetime(2020, 2, 24, 10, 32, 32, tzinfo=tzutc()),
 datetime.datetime(2020, 3, 23, 18, 1, 36, tzinfo=tzutc()),
 datetime.datetime(2020, 6, 19, 15, 12, 51, tzinfo=tzutc()),
 datetime.datetime(2020, 6, 30, 20, 38, 45, tzinfo=tzutc()),
 datetime.datetime(2020, 3, 10, 7, 52, 54, tzinfo=tzutc()),
 datetime.datetime(2020, 6, 30, 20, 5, 46, tzinfo=tzutc()),
 datetime.datetime(2019, 10, 21, 13, 5, 28, tzinfo=tzutc()),
 datetime.datetime(2020, 4, 4, 21, 43, 39, tzinfo=tzutc()),
 datetime.datetime(2020, 4, 22, 21, 22, 33, tzinfo=tzutc()),
 datetime.datetime(2020, 3, 5, 10, 45, 39, tzinfo=tzutc()),
 datetime.datetime(2020, 4, 11, 11, 52, 52, tzinfo=tzutc()),
 datetime.datetime(2019, 12, 10, 15, 0, 58, tzinfo=tzutc()),
 datetime.datetime(2020, 4, 18, 13, 7, 39, tzinfo=tzutc()),
 datetime.datetime(2020, 6, 21, 21, 2, 29, tzinfo=tzutc()),
 datetime.datetime(2020, 2, 27, 8, 32, 26, tzinfo=tzutc()),
 datetime.datetime(2019, 11, 22,

In [47]:
month_counts = Counter(date.month for date in dates)
month_counts

Counter({2: 3, 3: 3, 6: 4, 10: 1, 4: 4, 12: 1, 11: 1})

In [48]:
weekday_counts = Counter(date.weekday() for date in dates)
weekday_counts

Counter({5: 4, 0: 3, 4: 2, 1: 4, 2: 1, 3: 2, 6: 1})

In [51]:
last_5_repositories = sorted(repos, key=lambda r: r['created_at'], reverse=True)[:5]
last_5_repositories

[{'id': 276203647,
  'node_id': 'MDEwOlJlcG9zaXRvcnkyNzYyMDM2NDc=',
  'name': 'data-science-from-scratch',
  'full_name': 'MarkVoitov/data-science-from-scratch',
  'private': False,
  'owner': {'login': 'MarkVoitov',
   'id': 56827081,
   'node_id': 'MDQ6VXNlcjU2ODI3MDgx',
   'avatar_url': 'https://avatars1.githubusercontent.com/u/56827081?v=4',
   'gravatar_id': '',
   'url': 'https://api.github.com/users/MarkVoitov',
   'html_url': 'https://github.com/MarkVoitov',
   'followers_url': 'https://api.github.com/users/MarkVoitov/followers',
   'following_url': 'https://api.github.com/users/MarkVoitov/following{/other_user}',
   'gists_url': 'https://api.github.com/users/MarkVoitov/gists{/gist_id}',
   'starred_url': 'https://api.github.com/users/MarkVoitov/starred{/owner}{/repo}',
   'subscriptions_url': 'https://api.github.com/users/MarkVoitov/subscriptions',
   'organizations_url': 'https://api.github.com/users/MarkVoitov/orgs',
   'repos_url': 'https://api.github.com/users/MarkVoitov/r

In [52]:
last_5_languages = [repo['language'] for repo in last_5_repositories]
last_5_languages

['Jupyter Notebook',
 'Python',
 'Jupyter Notebook',
 'Python',
 'Jupyter Notebook']

## API search

In [None]:
# all API links available at https://github.com/MarkVoitov/data-science-from-scratch/blob/master/links.md

# Example: using Twitter API

## Getting credentials

In [None]:
# https://developer.twitter.com/apps

## Using Twython

In [53]:
from twython import Twython

# NOTE(review): CONSUMER_KEY and CONSUMER_SECRET are not defined in any visible
# cell; load them from the environment rather than hard-coding them here
twitter = Twython(CONSUMER_KEY, CONSUMER_SECRET)

# search for tweets including phrase 'data science'
for status in twitter.search(q='"data science"')["statuses"]:
    # keep these as str: encoding to UTF-8 makes Python 3 print b'...' literals
    user = status["user"]["screen_name"]
    text = status["text"]
    print(user, ":", text)

ModuleNotFoundError: No module named 'twython'

In [54]:
from twython import TwythonStreamer

tweets = []

class MyStreamer(TwythonStreamer):
    """Streamer that collects English-language messages in the module-level `tweets` list."""

    def on_success(self, data):
        """Store `data` if its 'lang' is 'en'; disconnect once 1000 are collected."""
        # .get avoids a KeyError on any message that carries no 'lang' key
        if data.get('lang') == 'en':
            tweets.append(data)
            print("received tweet #", len(tweets))
        # stop when we've collected enough
        if len(tweets) >= 1000:
            self.disconnect()

    def on_error(self, status_code, data):
        """Print the error details and close the stream."""
        print(status_code, data)
        self.disconnect()

ModuleNotFoundError: No module named 'twython'

In [55]:
stream = MyStreamer(CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
stream.statuses.filter(track='data')

NameError: name 'MyStreamer' is not defined

In [56]:
top_hashtags = Counter(hashtag['text'].lower()
                      for tweet in tweets
                      for hashtag in tweet["entities"]["hashtags"])
top_hashtags.most_common(5)

NameError: name 'tweets' is not defined