In [1]:
import pycurl, json, sys
import re
import pandas as pd
import numpy as np
import datetime as dt
from common_func import check_url
from BeautifulSoup import BeautifulSoup
import urllib2
import unidecode
import tqdm

In [2]:
pkls = !ls pkl/itunes*COMPLETE.pkl
podcastDf = pd.read_pickle(pkls[-1])
podcastDf.shape

(49930, 36)

In [3]:
# filter out podcasts with no itunes collection id
podcastDf = podcastDf[np.isfinite(podcastDf['collectionId'])]
podcastDf.shape

(33227, 36)

In [4]:
# filter out podcasts without recent episodes
days_thresh = 45
thresh_date = dt.datetime.today() - dt.timedelta(days=days_thresh)

podcastDf['releaseDate'] = pd.to_datetime(podcastDf['releaseDate'])
podcastDf = podcastDf[podcastDf.releaseDate > thresh_date]
podcastDf.shape

(6217, 36)

In [5]:
# convert collectionId to int
podcastDf['collectionId'] = [int(x) for x in podcastDf['collectionId']]

In [98]:
# make sample for testing
testDf = podcastDf.sample(n=20)

In [6]:
baseUrl = 'https://itunes.apple.com/us/podcast/id'

In [7]:
# will store pycurl output
class Test:
   def __init__(self):
       self.contents = ''

   def body_callback(self, buf):
       self.contents = self.contents + buf

In [8]:
def run_curl(url):
    t = Test()
    c = pycurl.Curl()
    c.setopt(pycurl.URL, url)
    c.setopt(pycurl.HTTPHEADER, ['X-Apple-Store-Front: 143441-1,12', 'X-Apple-Tz: 3600'])
    c.setopt(pycurl.USERAGENT, 'iTunes/9.2.1 (Macintosh; Intel Mac OS X 10.5.8) AppleWebKit/533.16')
    c.setopt(pycurl.SSL_VERIFYHOST, 0)
    c.setopt(pycurl.SSL_VERIFYPEER, 0)
    c.setopt(pycurl.WRITEFUNCTION, t.body_callback)
    c.perform()
    return t

In [9]:
def clean_description(d):
    d = unidecode.unidecode(d)
    d = d.replace('\n', ' ')
    if re.findall(r'(.*) brought to you by.*', d):
       d = re.sub(r'brought to you by.*', '', d)
    if re.search(r'(.*) sponsored by.*', d):
       d = re.sub(r'sponsored by.*', '', d)
    return d

In [461]:
colNames = ['collectionId', 'podcastSummary', 'episodeNames', 'episodeDescriptions', 'alsoSubscribed']
scrapeResults = pd.DataFrame(columns=colNames)

for ind, row in tqdm.tqdm(podcastDf.iterrows(), total=podcastDf.shape[0]):
    collectionId = row['collectionId']
    scrapeUrl = baseUrl + str(collectionId)
    
    # get podcast summary
    t = run_curl(scrapeUrl)
    soup = BeautifulSoup(t.contents)
    p = soup.p
    if p:
        podcastSummary = soup.p.string
    else: # redirect
        newUrl = soup.findAll(text=re.compile(r'https'))
        newUrl = newUrl[0]
        newUrl = re.sub(r'&amp;', r'&', newUrl)
        try:
            t = run_curl(newUrl)
            soup = BeautifulSoup(t.contents)
            p = soup.p
            if p:
                podcastSummary = soup.p.string
            else:
                podcastSummary = np.nan
                episodeNames = np.nan
                episodeDescriptions = np.nan
                alsoSubscribed = np.nan
                thisResult = pd.DataFrame({'collectionId' : int(collectionId),
                                          'podcastSummary' : [podcastSummary],
                                          'episodeNames' : [episodeNames],
                                          'episodeDescriptions' : [episodeDescriptions],
                                          'alsoSubscribed' : [alsoSubscribed]})
                scrapeResults = pd.concat([scrapeResults, thisResult], axis=0)
                continue
        except:
            podcastSummary = np.nan
            episodeNames = np.nan
            episodeDescriptions = np.nan
            alsoSubscribed = np.nan
            thisResult = pd.DataFrame({'collectionId' : int(collectionId),
                                      'podcastSummary' : [podcastSummary],
                                      'episodeNames' : [episodeNames],
                                      'episodeDescriptions' : [episodeDescriptions],
                                      'alsoSubscribed' : [alsoSubscribed]})
            scrapeResults = pd.concat([scrapeResults, thisResult], axis=0)
            continue
    
    # get episode names
    episodeData = soup.findAll('button', kind='episode')
    try:
        episodeNames = [unidecode.unidecode(e['item-name']) for e in episodeData]
    except: # no name
        episodeNames = np.nan
    
    # get episode descriptions
    try:
        episodeDescriptions = [clean_description(e['description']) for e in episodeData]
    except: # no description
        episodeDescriptions = np.nan
    
    # get also subscribed podcasts
    alsoSubscribed = re.findall(r'adam-id="(\d+)" aria-label=', t.contents)
    try:
        alsoSubscribed = [int(x) for x in alsoSubscribed]
    except:
        alsoSubscribed = np.nan
        
    # append results
    thisResult = pd.DataFrame({'collectionId' : int(collectionId),
                                      'podcastSummary' : [podcastSummary],
                                      'episodeNames' : [episodeNames],
                                      'episodeDescriptions' : [episodeDescriptions],
                                      'alsoSubscribed' : [alsoSubscribed]})
    scrapeResults = pd.concat([scrapeResults, thisResult], axis=0)

 18%|█▊        | 1147/6273 [1:03:01<15:24:29, 10.82s/it]

KeyboardInterrupt: 

error: (23, 'Failed writing body (0 != 16374)')

In [474]:
# pickle the current results
scrapeResults['episodeDescriptions'] = [unicode(x) for x in scrapeResults['episodeDescriptions']]
scrapeResults['episodeNames'] = [unicode(x) for x in scrapeResults['episodeNames']]
scrapeResults['podcastSummary'] = [unicode(x) for x in scrapeResults['podcastSummary']]
scrapeResults.to_pickle('pkl/scraped_podcasts_pt1.pkl')

In [477]:
# remove already-retrieved values from podcastDf
doneIds = scrapeResults['collectionId']

subDf = podcastDf[~podcastDf.collectionId.isin(doneIds)]

In [478]:
subDf.shape

(5126, 36)

In [479]:
for ind, row in tqdm.tqdm(subDf.iterrows(), total=subDf.shape[0]):
    collectionId = row['collectionId']
    scrapeUrl = baseUrl + str(collectionId)
    
    # get podcast summary
    t = run_curl(scrapeUrl)
    soup = BeautifulSoup(t.contents)
    p = soup.p
    if p:
        podcastSummary = soup.p.string
    else: # redirect
        newUrl = soup.findAll(text=re.compile(r'https'))
        newUrl = newUrl[0]
        newUrl = re.sub(r'&amp;', r'&', newUrl)
        try:
            t = run_curl(newUrl)
            soup = BeautifulSoup(t.contents)
            p = soup.p
            if p:
                podcastSummary = soup.p.string
            else:
                podcastSummary = np.nan
                episodeNames = np.nan
                episodeDescriptions = np.nan
                alsoSubscribed = np.nan
                thisResult = pd.DataFrame({'collectionId' : int(collectionId),
                                          'podcastSummary' : [podcastSummary],
                                          'episodeNames' : [episodeNames],
                                          'episodeDescriptions' : [episodeDescriptions],
                                          'alsoSubscribed' : [alsoSubscribed]})
                scrapeResults = pd.concat([scrapeResults, thisResult], axis=0)
                continue
        except:
            podcastSummary = np.nan
            episodeNames = np.nan
            episodeDescriptions = np.nan
            alsoSubscribed = np.nan
            thisResult = pd.DataFrame({'collectionId' : int(collectionId),
                                      'podcastSummary' : [podcastSummary],
                                      'episodeNames' : [episodeNames],
                                      'episodeDescriptions' : [episodeDescriptions],
                                      'alsoSubscribed' : [alsoSubscribed]})
            scrapeResults = pd.concat([scrapeResults, thisResult], axis=0)
            continue
    
    # get episode names
    episodeData = soup.findAll('button', kind='episode')
    try:
        episodeNames = [unidecode.unidecode(e['item-name']) for e in episodeData]
    except: # no name
        episodeNames = np.nan
    
    # get episode descriptions
    try:
        episodeDescriptions = [clean_description(e['description']) for e in episodeData]
    except: # no description
        episodeDescriptions = np.nan
    
    # get also subscribed podcasts
    alsoSubscribed = re.findall(r'adam-id="(\d+)" aria-label=', t.contents)
    try:
        alsoSubscribed = [int(x) for x in alsoSubscribed]
    except:
        alsoSubscribed = np.nan
        
    # append results
    thisResult = pd.DataFrame({'collectionId' : int(collectionId),
                                      'podcastSummary' : [podcastSummary],
                                      'episodeNames' : [episodeNames],
                                      'episodeDescriptions' : [episodeDescriptions],
                                      'alsoSubscribed' : [alsoSubscribed]})
    scrapeResults = pd.concat([scrapeResults, thisResult], axis=0)

  return _unidecode(string)
  return _unidecode(string)
  return _unidecode(string)
  return _unidecode(string)
  return _unidecode(string)
  return _unidecode(string)
  return _unidecode(string)
  return _unidecode(string)
 29%|██▊       | 1466/5126 [1:38:47<3:01:09,  2.97s/it]

IndexError: list index out of range

In [481]:
# pickle the current results
scrapeResults['episodeDescriptions'] = [unicode(x) for x in scrapeResults['episodeDescriptions']]
scrapeResults['episodeNames'] = [unicode(x) for x in scrapeResults['episodeNames']]
scrapeResults['podcastSummary'] = [unicode(x) for x in scrapeResults['podcastSummary']]
scrapeResults.to_pickle('pkl/scraped_podcasts_pt2.pkl')

# remove already-retrieved values from podcastDf
doneIds = scrapeResults['collectionId']

subDf = podcastDf[~podcastDf.collectionId.isin(doneIds)]

In [None]:
for ind, row in tqdm.tqdm(subDf.iterrows(), total=subDf.shape[0]):
    collectionId = row['collectionId']
    scrapeUrl = baseUrl + str(collectionId)
    
    # get podcast summary
    t = run_curl(scrapeUrl)
    soup = BeautifulSoup(t.contents)
    p = soup.p
    if p:
        podcastSummary = soup.p.string
    else: # redirect
        newUrl = soup.findAll(text=re.compile(r'https'))
        try:
            newUrl = newUrl[0]
            newUrl = re.sub(r'&amp;', r'&', newUrl)

            t = run_curl(newUrl)
            soup = BeautifulSoup(t.contents)
            p = soup.p
            if p:
                podcastSummary = soup.p.string
            else:
                podcastSummary = np.nan
                episodeNames = np.nan
                episodeDescriptions = np.nan
                alsoSubscribed = np.nan
                thisResult = pd.DataFrame({'collectionId' : int(collectionId),
                                          'podcastSummary' : [podcastSummary],
                                          'episodeNames' : [episodeNames],
                                          'episodeDescriptions' : [episodeDescriptions],
                                          'alsoSubscribed' : [alsoSubscribed]})
                scrapeResults = pd.concat([scrapeResults, thisResult], axis=0)
                continue
        except:
            podcastSummary = np.nan
            episodeNames = np.nan
            episodeDescriptions = np.nan
            alsoSubscribed = np.nan
            thisResult = pd.DataFrame({'collectionId' : int(collectionId),
                                      'podcastSummary' : [podcastSummary],
                                      'episodeNames' : [episodeNames],
                                      'episodeDescriptions' : [episodeDescriptions],
                                      'alsoSubscribed' : [alsoSubscribed]})
            scrapeResults = pd.concat([scrapeResults, thisResult], axis=0)
            continue
    
    # get episode names
    episodeData = soup.findAll('button', kind='episode')
    try:
        episodeNames = [unidecode.unidecode(e['item-name']) for e in episodeData]
    except: # no name
        episodeNames = np.nan
    
    # get episode descriptions
    try:
        episodeDescriptions = [clean_description(e['description']) for e in episodeData]
    except: # no description
        episodeDescriptions = np.nan
    
    # get also subscribed podcasts
    alsoSubscribed = re.findall(r'adam-id="(\d+)" aria-label=', t.contents)
    try:
        alsoSubscribed = [int(x) for x in alsoSubscribed]
    except:
        alsoSubscribed = np.nan
        
    # append results
    thisResult = pd.DataFrame({'collectionId' : int(collectionId),
                                      'podcastSummary' : [podcastSummary],
                                      'episodeNames' : [episodeNames],
                                      'episodeDescriptions' : [episodeDescriptions],
                                      'alsoSubscribed' : [alsoSubscribed]})
    scrapeResults = pd.concat([scrapeResults, thisResult], axis=0)

 23%|██▎       | 832/3658 [1:01:17<2:41:33,  3.43s/it]

In [10]:
scrapeResults = pd.read_pickle('pkl/scraped_podcasts_pt2.pkl')

In [12]:
# remove already-retrieved values from podcastDf
doneIds = scrapeResults['collectionId']

subDf = podcastDf[~podcastDf.collectionId.isin(doneIds)]

In [None]:
for ind, row in tqdm.tqdm(subDf.iterrows(), total=subDf.shape[0]):
    collectionId = row['collectionId']
    scrapeUrl = baseUrl + str(collectionId)
    
    # get podcast summary
    t = run_curl(scrapeUrl)
    soup = BeautifulSoup(t.contents)
    p = soup.p
    if p:
        podcastSummary = soup.p.string
    else: # redirect
        newUrl = soup.findAll(text=re.compile(r'https'))
        try:
            newUrl = newUrl[0]
            newUrl = re.sub(r'&amp;', r'&', newUrl)

            t = run_curl(newUrl)
            soup = BeautifulSoup(t.contents)
            p = soup.p
            if p:
                podcastSummary = soup.p.string
            else:
                podcastSummary = np.nan
                episodeNames = np.nan
                episodeDescriptions = np.nan
                alsoSubscribed = np.nan
                thisResult = pd.DataFrame({'collectionId' : int(collectionId),
                                          'podcastSummary' : [podcastSummary],
                                          'episodeNames' : [episodeNames],
                                          'episodeDescriptions' : [episodeDescriptions],
                                          'alsoSubscribed' : [alsoSubscribed]})
                scrapeResults = pd.concat([scrapeResults, thisResult], axis=0)
                continue
        except:
            podcastSummary = np.nan
            episodeNames = np.nan
            episodeDescriptions = np.nan
            alsoSubscribed = np.nan
            thisResult = pd.DataFrame({'collectionId' : int(collectionId),
                                      'podcastSummary' : [podcastSummary],
                                      'episodeNames' : [episodeNames],
                                      'episodeDescriptions' : [episodeDescriptions],
                                      'alsoSubscribed' : [alsoSubscribed]})
            scrapeResults = pd.concat([scrapeResults, thisResult], axis=0)
            continue
    
    # get episode names
    episodeData = soup.findAll('button', kind='episode')
    try:
        episodeNames = [unidecode.unidecode(e['item-name']) for e in episodeData]
    except: # no name
        episodeNames = np.nan
    
    # get episode descriptions
    try:
        episodeDescriptions = [clean_description(e['description']) for e in episodeData]
    except: # no description
        episodeDescriptions = np.nan
    
    # get also subscribed podcasts
    alsoSubscribed = re.findall(r'adam-id="(\d+)" aria-label=', t.contents)
    try:
        alsoSubscribed = [int(x) for x in alsoSubscribed]
    except:
        alsoSubscribed = np.nan
        
    # append results
    thisResult = pd.DataFrame({'collectionId' : int(collectionId),
                                      'podcastSummary' : [podcastSummary],
                                      'episodeNames' : [episodeNames],
                                      'episodeDescriptions' : [episodeDescriptions],
                                      'alsoSubscribed' : [alsoSubscribed]})
    scrapeResults = pd.concat([scrapeResults, thisResult], axis=0)

  1%|          | 30/3632 [01:43<4:20:33,  4.34s/it]