# Part I: Python Basics
* Basic data types in Python
* Make sense of method, loops and functions

In [7]:
import sys
import os
os.getcwd() # check the current work directory 
os.chdir("C:\\Users\\Administrator\\Desktop") # change the work directory to our desktop 

## Basic data types in Python

In [2]:
print(2+2) # integer 

print(8/2) # floating-point numbers
type(8/2)

4
4.0


float

In [3]:
text = "Text analysis is pretty cool!"
type(text) # string 

str

In [4]:
list_example = [12, "hungry", "dogs"]
type(list_example) # list
print(list_example[2]) # the index starts from 0
print(list_example[0:2]) # slice the list 

dogs
[12, 'hungry']


In [5]:
tuple_example = (12, "hungry", "dogs") # note the difference between () and []
len(tuple_example)
type(tuple_example) # tuple 
print(tuple_example(2)) # the error says the tuple object is not callable; tuple is immutable while list is mutable 

TypeError: 'tuple' object is not callable

In [None]:
dict_example = {"a": 1, "b" : 42, "text" : "hi there"}
type(dict_example) # dict
print(dict_example["text"]) # we can use key to get what we want 

## Method

In [None]:
text_list = [2, 5, "yes"]
text_list.insert(0, "no") # insert "no" to the location 0
print(text_list) 

text_list.append("whatever") # append "whatever" at the end of the list
print(text_list)

## Loops

In [None]:
test_list = ["Dr.Slater", "Dr.Pepinsky", "Lily", "Samantha"]

for element in test_list:
    if "Dr." in element:
        print("Hello " + element + "!")

In [None]:
#use a loop to apply a method to a list 5 times 
myList=[]
for element in range(5):
    myList.append(element) # append is a method or attribute of the "list" object (or module)
print(myList)

## Functions

In [None]:
def perfect(score):
    print ("I got a perfect " + score)
    
perfect(score='100') 

In [None]:
def pinfo(name, age):
   print("Name:", name)
   print("Age:", age)
   
pinfo(age=25, name="Joanne" )

## Regular expressions

* A sequence of characters that define a search pattern. For example, '\,' says look for a comma. See https://docs.python.org/3.4/library/re.html

In [9]:
#split 'happy, go lucky wherever there is a comma, or whereever #there is a space

from bs4 import re
re.split('\,', 'happy, go lucky') 

re.split('\s', 'happy, go lucky') #split whereever there is a space  \s

#play with regular expressions here  http://www.regexr.com/

['happy,', 'go', 'lucky']

In [12]:
#Data are not always well behaved. The ignore and try/except commands can be used to say in effect:
#If it works do it, if not move on to the next case!
#Here's an example that is part of a function

def divide(x, y):
    try:
        result = x / y
    except ZeroDivisionError:
        print("division by zero!")
    else:
        print("result is", result)


divide(2.0,3.0)
divide(2.0, 0)

result is 0.6666666666666666
division by zero!


In [13]:
#If you just want to ignore bad cases (rather than printing an error message):

def divide(x, y):
    try:
        result = x / y
    except:
        pass
    else:
        print("result is", result)

divide(2.0,3.0)
divide(2.0, 0)

result is 0.6666666666666666


### How to remove unicode from files?

In [21]:
# This script says convert the text to strings that are sentences in ascii format. 
# If a character can't be converted to ascii (e.g. some unicode characters), ignore it 
cleanwords=str(sentence.encode('ascii',errors='ignore'))

In [23]:
# Finally, here's a more deliberate way to remove unicode from a list of files.
# This worked when the above ignore command did not 
list2 = []
file_counter = 0
for file in list1:
    file_counter += 1
    missing_words = 0
    out_file = ''
    word_list = file.split()
    for word in word_list:
        try:
            new_word = str(word)
            out_file = '%s %s'%(out_file, new_word)
        except:
            missing_words += 1
    list2.append(out_file)
    print('%s%s%s%s'%('File: ', file_counter, '| Missing words: ', missing_words)) 

NameError: name 'list1' is not defined

# Part II: Scraping Data 
* Scraping data (using API or website)
* Parsing documents
* Tokenizing and cleaning a document
* Tokenize and parse each of the files in fomclist by sentence

## Scrape data using API
* This [tutorial](https://dlab.berkeley.edu/blog/scraping-new-york-times-articles-python-tutorial) demonstrates how to use the New York Times Articles Search API using Python. 

## Scrape a single website

In [27]:
from urllib.request import urlopen

polisci_url = urlopen('https://www.polisci.washington.edu/people')
type(polisci_url) # http.client.HTTPResponse means an object representing the connection
polisci_page = polisci_url.read()

print(polisci_page)
type(polisci_page) # Sometimes the content contains an unrecognized character and Python will convert it into bytes

# read the correct character encoding from `Content-Type` request header
charset_encoding = polisci_url.info().get_content_charset()
# apply encoding
polisci_page = polisci_url.read().decode(charset_encoding)

type(polisci_page) # Now it is a long string


b'<!DOCTYPE html>\n<!--[if IEMobile 7]><html class="iem7"  lang="en" dir="ltr"><![endif]-->\n<!--[if lte IE 6]><html class="lt-ie9 lt-ie8 lt-ie7"  lang="en" dir="ltr"><![endif]-->\n<!--[if (IE 7)&(!IEMobile)]><html class="lt-ie9 lt-ie8"  lang="en" dir="ltr"><![endif]-->\n<!--[if IE 8]><html class="lt-ie9"  lang="en" dir="ltr"><![endif]-->\n<!--[if (gte IE 9)|(gt IEMobile 7)]><!--><html  lang="en" dir="ltr"><!--<![endif]-->\n\n<head>\n  <meta charset="utf-8" />\n<link rel="shortcut icon" href="https://www.polisci.washington.edu/sites/all/themes/uwnorthanger_zen_boundless/favicon.ico" type="image/vnd.microsoft.icon" />\n  <title>People - A-Z Directory | Department of Political Science | University of Washington</title>\n\n      <meta name="MobileOptimized" content="width">\n    <meta name="HandheldFriendly" content="true">\n    <meta name="viewport" content="width=device-width">\n    <meta http-equiv="cleartype" content="on">\n\n  <link type="text/css" rel="stylesheet" href="https://www.

str

In [29]:
# A handier way to fix possible encoding issues
polisci_page = urlopen('https://www.polisci.washington.edu/people').read().decode('utf-8')
print(polisci_page)
type(polisci_page)

<!DOCTYPE html>
<!--[if IEMobile 7]><html class="iem7"  lang="en" dir="ltr"><![endif]-->
<!--[if lte IE 6]><html class="lt-ie9 lt-ie8 lt-ie7"  lang="en" dir="ltr"><![endif]-->
<!--[if (IE 7)&(!IEMobile)]><html class="lt-ie9 lt-ie8"  lang="en" dir="ltr"><![endif]-->
<!--[if IE 8]><html class="lt-ie9"  lang="en" dir="ltr"><![endif]-->
<!--[if (gte IE 9)|(gt IEMobile 7)]><!--><html  lang="en" dir="ltr"><!--<![endif]-->

<head>
  <meta charset="utf-8" />
<link rel="shortcut icon" href="https://www.polisci.washington.edu/sites/all/themes/uwnorthanger_zen_boundless/favicon.ico" type="image/vnd.microsoft.icon" />
  <title>People - A-Z Directory | Department of Political Science | University of Washington</title>

      <meta name="MobileOptimized" content="width">
    <meta name="HandheldFriendly" content="true">
    <meta name="viewport" content="width=device-width">
    <meta http-equiv="cleartype" content="on">

  <link type="text/css" rel="stylesheet" href="https://www.polisci.washington.

str

In [30]:
# Create a new list that split by words (at every \n or newline)
polisci_list = [polisci_page.split()]
polisci_list

[['<!DOCTYPE',
  'html>',
  '<!--[if',
  'IEMobile',
  '7]><html',
  'class="iem7"',
  'lang="en"',
  'dir="ltr"><![endif]-->',
  '<!--[if',
  'lte',
  'IE',
  '6]><html',
  'class="lt-ie9',
  'lt-ie8',
  'lt-ie7"',
  'lang="en"',
  'dir="ltr"><![endif]-->',
  '<!--[if',
  '(IE',
  '7)&(!IEMobile)]><html',
  'class="lt-ie9',
  'lt-ie8"',
  'lang="en"',
  'dir="ltr"><![endif]-->',
  '<!--[if',
  'IE',
  '8]><html',
  'class="lt-ie9"',
  'lang="en"',
  'dir="ltr"><![endif]-->',
  '<!--[if',
  '(gte',
  'IE',
  '9)|(gt',
  'IEMobile',
  '7)]><!--><html',
  'lang="en"',
  'dir="ltr"><!--<![endif]-->',
  '<head>',
  '<meta',
  'charset="utf-8"',
  '/>',
  '<link',
  'rel="shortcut',
  'icon"',
  'href="https://www.polisci.washington.edu/sites/all/themes/uwnorthanger_zen_boundless/favicon.ico"',
  'type="image/vnd.microsoft.icon"',
  '/>',
  '<title>People',
  '-',
  'A-Z',
  'Directory',
  '|',
  'Department',
  'of',
  'Political',
  'Science',
  '|',
  'University',
  'of',
  'Washington<

## How to make the text more readable?

In [31]:
from urllib.request import urlopen
from bs4 import BeautifulSoup, re

polisci_url = urlopen("https://www.polisci.washington.edu/people")
polisci_page = BeautifulSoup(polisci_url.read())
polisci_text = polisci_page.get_text()
print(polisci_text)
type(polisci_text) # string





People - A-Z Directory | Department of Political Science | University of Washington









.background-layer{background-image:url('https://www.polisci.washington.edu/sites/polisci/files/polisci-background-cherry-blossoms-mobile_0.jpg') !important;}@media all and (min-width:768px){.background-layer{background-image:url('https://www.polisci.washington.edu/sites/polisci/files/polisci-background-cherry-blossoms-boundless_0.jpg') !important;}}.lt-ie9 .background-layer{background-image:url('https://www.polisci.washington.edu/sites/polisci/files/polisci-background-cherry-blossoms-boundless_0.jpg') !important;}











(function(i,s,o,g,r,a,m){i["GoogleAnalyticsObject"]=r;i[r]=i[r]||function(){(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)})(window,document,"script","https://www.google-analytics.com/analytics.js","ga");ga("create", "UA-25223004-31", {"cookieDomain":"auto"});g

str

In [32]:
# remove the blank lines 
lines = (line.strip() for line in polisci_text.splitlines())
chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
politext = '\n'.join(chunk for chunk in chunks if chunk)
print(politext)

People - A-Z Directory | Department of Political Science | University of Washington
.background-layer{background-image:url('https://www.polisci.washington.edu/sites/polisci/files/polisci-background-cherry-blossoms-mobile_0.jpg') !important;}@media all and (min-width:768px){.background-layer{background-image:url('https://www.polisci.washington.edu/sites/polisci/files/polisci-background-cherry-blossoms-boundless_0.jpg') !important;}}.lt-ie9 .background-layer{background-image:url('https://www.polisci.washington.edu/sites/polisci/files/polisci-background-cherry-blossoms-boundless_0.jpg') !important;}
(function(i,s,o,g,r,a,m){i["GoogleAnalyticsObject"]=r;i[r]=i[r]||function(){(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)})(window,document,"script","https://www.google-analytics.com/analytics.js","ga");ga("create", "UA-25223004-31", {"cookieDomain":"auto"});ga("send", "pageview");ga

In [34]:
# extract the emails from the web page using a regular expression 
polisci_emails = set()
emails = set(re.findall(r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+", politext, re.I)) 
polisci_emails.update(emails)

print(polisci_emails)
type(polisci_emails)

{'switek@uw.edu', 'kamano@uw.edu', 'yuehou@sas.upenn.edu', 'mwack@uw.edu', 'pslabstaff@gmail.com', 'glovell@uw.edu', 'cadolph@uw.edu', 'goehrunr@uw.edu', 'breebj@uw.edu', 'jcbeck@uw.edu', 'echrist2@uw.edu', 'lingfu@uw.edu', 'jdlong@uw.edu', 'riddhimn@uw.edu', 'sske@uw.edu', 'gcharper@uw.edu', 'sbutorac@uw.edu', 'goldberg@uw.edu', 'meroy@uw.edu', 'winklers@uw.edu', 'kohlj@uw.edu', 'bthorpe@uw.edu', 'aseem@uw.edu', 'rmcassel@uw.edu', 'erics1@uw.edu', 'tamars@uw.edu', 'hjwang@uw.edu', 'ekier@uw.edu', 'travisn@uw.edu', 'gochberg@uw.edu', 'mdubeau@uw.edu', 'soxvlin@uw.edu', 'zhaotan@uw.edu', 'leo4@uw.edu', 'nelam@uw.edu', 'distefan@uw.edu', 'mercer@uw.edu', 'bphuang@uw.edu', 'jgrove91@uw.edu', 'mlevi@uw.edu', 'weitzen@uw.edu', 'kevina4@uw.edu', 'cainsley@uw.edu', 'acancino@uw.edu', 'magistro@uw.edu', 'vmenaldo@uw.edu', 'sechrest@uw.edu', 'azelenz@uw.edu', 'dclucas@uw.edu', 'masmith@uw.edu', 'kylemurphy514@gmail.com', 'jasonm@uw.edu', 'buscherf@uw.edu', 'cpcoll93@uw.edu', 'jtonghan@uw.edu', 

set

In [38]:
# convert the set to a list of emails 
email_list = list(polisci_emails)
print(email_list)

['switek@uw.edu', 'kamano@uw.edu', 'yuehou@sas.upenn.edu', 'mwack@uw.edu', 'pslabstaff@gmail.com', 'glovell@uw.edu', 'cadolph@uw.edu', 'goehrunr@uw.edu', 'breebj@uw.edu', 'jcbeck@uw.edu', 'echrist2@uw.edu', 'lingfu@uw.edu', 'jdlong@uw.edu', 'riddhimn@uw.edu', 'sske@uw.edu', 'gcharper@uw.edu', 'sbutorac@uw.edu', 'goldberg@uw.edu', 'meroy@uw.edu', 'winklers@uw.edu', 'kohlj@uw.edu', 'bthorpe@uw.edu', 'aseem@uw.edu', 'rmcassel@uw.edu', 'erics1@uw.edu', 'tamars@uw.edu', 'hjwang@uw.edu', 'ekier@uw.edu', 'travisn@uw.edu', 'gochberg@uw.edu', 'mdubeau@uw.edu', 'soxvlin@uw.edu', 'zhaotan@uw.edu', 'leo4@uw.edu', 'nelam@uw.edu', 'distefan@uw.edu', 'mercer@uw.edu', 'bphuang@uw.edu', 'jgrove91@uw.edu', 'mlevi@uw.edu', 'weitzen@uw.edu', 'kevina4@uw.edu', 'cainsley@uw.edu', 'acancino@uw.edu', 'magistro@uw.edu', 'vmenaldo@uw.edu', 'sechrest@uw.edu', 'azelenz@uw.edu', 'dclucas@uw.edu', 'masmith@uw.edu', 'kylemurphy514@gmail.com', 'jasonm@uw.edu', 'buscherf@uw.edu', 'cpcoll93@uw.edu', 'jtonghan@uw.edu', 

### Challenge: retrieve information and save it locally

In [38]:
import requests 
from bs4 import BeautifulSoup

# collect and parse the page 
r = requests.get("https://www.seattle.gov/elected-officials")
soup = BeautifulSoup(r.text, 'html.parser')
name_list = soup.find_all('div', class_ = 'primaryContent')
name_list

name_item = []
position_item = []

for name in name_list:
    if name.find('h3'):
        name_item.append(name.find('h3').text)
print(name_item)
print(position_item)

# delete the last one "court"
del name_item[-1]
name_item

for position in name_list:
    if position.find('span'):
        position_item.append(position.find('span').text)
print(position_item)
del position_item[-1]
position_item

# for better presentation 
import pandas
pandas.DataFrame({'position':position_item, 'name':name_item}).to_csv('seattle_officials.csv', index = False)

['Jenny A. Durkan', 'M. Lorena González', 'Lisa Herbold', 'Debora Juarez', 'Andrew J. Lewis', 'Tammy J. Morales', 'Teresa Mosqueda', 'Alex Pedersen', 'Kshama Sawant', 'Dan Strauss', 'Pete Holmes', 'Edward McKenna', 'Faye Chess', 'Andrea Chin', 'Anita Crawford-Willis', 'Adam Eisenberg', 'Willie J. Gregory', 'Damon Shadid', 'Court']
[]
[' Mayor', ' Council President', ' Councilmember', ' Councilmember', ' Councilmember', ' Councilmember', ' Councilmember', ' Councilmember', ' Councilmember', ' Councilmember', ' City Attorney', ' Presiding Judge', ' Judge', ' Judge', ' Judge', ' Judge', ' Judge', ' Judge', ' Seattle Municipal']


## Parsing documents

In [None]:
import requests
r = requests.get('https://www.nytimes.com/interactive/2017/06/23/opinion/trumps-lies.html')

from bs4 import BeautifulSoup
soup = BeautifulSoup(r.text, 'html.parser')
results = soup.find_all('span', attrs={'class':'short-desc'})

records = []
for result in results:
    date = result.find('strong').text[0:-1] + ', 2017'
    lie = result.contents[1][1:-2]
    explanation = result.find('a').text[1:-1]
    url = result.find('a')['href']
    records.append((date, lie, explanation, url))

import pandas as pd
df = pd.DataFrame(records, columns=['date', 'lie', 'explanation', 'url'])
df['date'] = pd.to_datetime(df['date'])
df.to_csv('trump_lies.csv', index=False, encoding='utf-8')