In [389]:
import datetime as dt
import glob
import json
import re
import sys
import urllib2

import numpy as np
import pandas as pd
import pycurl
import unidecode
from BeautifulSoup import BeautifulSoup

from common_func import check_url

In [321]:
# Locate the saved iTunes podcast pickles and load the last one in name order
# (presumably the newest snapshot -- confirm against the file naming scheme).
# glob is used instead of the IPython-only `!ls` capture so the cell is plain
# Python and an empty match is detected instead of being passed on as a path.
pkls = sorted(glob.glob('pkl/itunes*COMPLETE.pkl'))
if not pkls:
    raise IOError('no pickle found matching pkl/itunes*COMPLETE.pkl')

# NOTE(review): unpickling can execute arbitrary code; only load files produced
# by this project.
podcastDf = pd.read_pickle(pkls[-1])
podcastDf.shape

(49930, 36)

In [322]:
# keep only the rows whose iTunes collectionId is a finite number
hasCollectionId = np.isfinite(podcastDf['collectionId'])
podcastDf = podcastDf[hasCollectionId]
podcastDf.shape

(33227, 36)

In [323]:
# filter out podcasts without recent episodes
days_thresh = 45  # a podcast counts as "recent" if released within this many days
thresh_date = dt.datetime.today() - dt.timedelta(days=days_thresh)

# .assign builds a new frame instead of writing a column into podcastDf, which
# may itself be a boolean-filtered slice (that pattern triggers pandas'
# SettingWithCopyWarning and can silently write to a temporary copy).
podcastDf = podcastDf.assign(releaseDate=pd.to_datetime(podcastDf['releaseDate']))
podcastDf = podcastDf[podcastDf.releaseDate > thresh_date]
podcastDf.shape

(6273, 36)

In [324]:
# convert collectionId to int
# Vectorised cast (truncates like int()) written through .assign so that no
# column is set on a filtered slice of the original frame.
podcastDf = podcastDf.assign(collectionId=podcastDf['collectionId'].astype('int64'))

In [98]:
# make sample for testing
# random_state pins the draw so a re-run scrapes the same podcasts; min() keeps
# the cell from raising when fewer than 20 podcasts survive the filters.
testDf = podcastDf.sample(n=min(20, len(podcastDf)), random_state=0)

In [205]:
# iTunes podcast page URL prefix; a collectionId is appended to it to form the
# URL that gets scraped
baseUrl = 'https://itunes.apple.com/us/podcast/id'

In [173]:
# will store pycurl output
class Test:
    """Accumulates the chunks handed to a write callback into one string."""

    def __init__(self):
        # everything received so far; starts out empty
        self.contents = ''

    def body_callback(self, buf):
        """Append one received chunk `buf` to `contents`."""
        self.contents += buf

In [313]:
def run_curl(url):
    """Fetch `url` with pycurl and return the object holding the response body.

    The request is sent with an iTunes user agent and Apple store-front
    headers (presumably so the store serves the same page the iTunes client
    would get -- confirm).

    Parameters
    ----------
    url : str
        Address to request.

    Returns
    -------
    Test
        Holder whose `contents` attribute is the accumulated response body.

    Raises
    ------
    pycurl.error
        Propagated from `perform()` when the transfer fails.
    """
    t = Test()
    c = pycurl.Curl()
    try:
        c.setopt(pycurl.URL, url)
        c.setopt(pycurl.HTTPHEADER, ['X-Apple-Store-Front: 143441-1,12', 'X-Apple-Tz: 3600'])
        c.setopt(pycurl.USERAGENT, 'iTunes/9.2.1 (Macintosh; Intel Mac OS X 10.5.8) AppleWebKit/533.16')
        # NOTE(review): TLS host and peer verification are switched off, so the
        # response is not authenticated. Kept as-is to preserve behaviour;
        # re-enable if the certificate chain validates in this environment.
        c.setopt(pycurl.SSL_VERIFYHOST, 0)
        c.setopt(pycurl.SSL_VERIFYPEER, 0)
        c.setopt(pycurl.WRITEFUNCTION, t.body_callback)
        c.perform()
    finally:
        # release the handle (and its connection) even when the transfer fails;
        # previously one handle leaked per request
        c.close()
    return t

In [426]:
def clean_description(d):
    """Flatten an episode description to one transliterated line without sponsor text.

    The text is passed through unidecode, newlines become spaces, and then,
    for each sponsor phrase in turn, everything from the first occurrence of
    the phrase onward is dropped -- but only when the phrase appears somewhere
    in the text directly after a space.
    """
    text = unidecode.unidecode(d).replace('\n', ' ')
    for marker in ('brought to you by', 'sponsored by'):
        # the space-prefixed form is the trigger; the cut itself happens at the
        # first occurrence of the bare phrase
        if (' ' + marker) in text:
            text = text[:text.index(marker)]
    return text

In [430]:
colNames = ['collectionId', 'podcastSummary', 'episodeNames', 'episodeDescriptions', 'alsoSubscribed']


def _fetch_podcast_page(collectionId):
    """Download the iTunes page for `collectionId`; return (raw contents, soup).

    When the first response has no <p> tag it is treated as a redirect: the
    first text node containing 'https' is used as the new URL and fetched.
    If no such text exists the first response is returned unchanged, so the
    caller sees `soup.p is None`.
    """
    t = run_curl(baseUrl + str(collectionId))
    soup = BeautifulSoup(t.contents)
    if soup.p is not None:
        return t.contents, soup

    redirectTexts = soup.findAll(text=re.compile(r'https'))
    if not redirectTexts:
        # nothing to follow (the original code raised IndexError here)
        return t.contents, soup
    # the URL is HTML-escaped inside the page; restore plain ampersands
    newUrl = re.sub(r'&amp;', r'&', redirectTexts[0])
    t = run_curl(newUrl)
    return t.contents, BeautifulSoup(t.contents)


def _scrape_podcast(collectionId):
    """Scrape one podcast page into a dict keyed by `colNames`.

    Every field except collectionId defaults to NaN. If no summary paragraph
    is found even after following the redirect, the record is returned with
    those defaults instead of being overwritten by partial data.
    """
    record = dict.fromkeys(colNames, np.nan)
    record['collectionId'] = int(collectionId)

    contents, soup = _fetch_podcast_page(collectionId)
    if soup.p is None:
        return record

    # podcast summary: text of the first paragraph (None if it has nested tags)
    record['podcastSummary'] = soup.p.string

    episodeData = soup.findAll('button', kind='episode')
    try:
        record['episodeNames'] = [unidecode.unidecode(e['item-name']) for e in episodeData]
    except KeyError:  # some episode button has no item-name attribute
        pass
    try:
        record['episodeDescriptions'] = [clean_description(e['description']) for e in episodeData]
    except KeyError:  # some episode button has no description attribute
        pass

    # ids of the podcasts listed on the page; the pattern only captures digits,
    # so int() cannot fail here
    record['alsoSubscribed'] = [int(x) for x in re.findall(r'adam-id="(\d+)" aria-label=', contents)]
    return record


# Collect one record per podcast and build the frame once at the end; the
# original loop computed the values but never stored them in scrapeResults.
scrapeRecords = [_scrape_podcast(collectionId) for collectionId in testDf['collectionId']]
scrapeResults = pd.DataFrame(scrapeRecords, columns=colNames)

no data


In [449]:
# get political gabfest data
scrapeUrl = baseUrl + '158004641'
soup = BeautifulSoup(run_curl(scrapeUrl).contents)
# the first text node containing 'https' is taken as the real page URL;
# un-escape its ampersands before requesting it
newUrl = re.sub(r'&amp;', r'&', soup.findAll(text=re.compile(r'https'))[0])
t = run_curl(newUrl)
soup = BeautifulSoup(t.contents)

In [445]:
# get savage lovecast data
scrapeUrl = baseUrl + '201376301'
soup = BeautifulSoup(run_curl(scrapeUrl).contents)
# the first text node containing 'https' is taken as the real page URL;
# un-escape its ampersands before requesting it
newUrl = re.sub(r'&amp;', r'&', soup.findAll(text=re.compile(r'https'))[0])
t = run_curl(newUrl)
soup = BeautifulSoup(t.contents)

In [454]:
# show the column labels available in the test sample
testDf.columns

Index([              u'artistId',             u'artistName',
                u'artistViewUrl',          u'artworkUrl100',
                 u'artworkUrl30',           u'artworkUrl60',
                u'artworkUrl600', u'collectionCensoredName',
       u'collectionExplicitness',      u'collectionHdPrice',
                 u'collectionId',         u'collectionName',
              u'collectionPrice',      u'collectionViewUrl',
        u'contentAdvisoryRating',                u'country',
                     u'currency',                u'feedUrl',
                     u'genreIds',                 u'genres',
                           u'id',                   u'kind',
             u'primaryGenreName',        u'radioStationUrl',
                  u'releaseDate',      u'trackCensoredName',
                   u'trackCount',      u'trackExplicitness',
                 u'trackHdPrice',     u'trackHdRentalPrice',
                      u'trackId',              u'trackName',
                   u'tra