In [2]:
import glob
import re
import urllib2

import pandas as pd
import requests
from BeautifulSoup import BeautifulSoup

from common_func import convert_unicode

In [17]:
# load pickled data
# Files are discovered with glob rather than a shell `!ls`, so the cell does not
# depend on IPython shell capture or on a Unix `ls` being available.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle files
# that were produced by a trusted source.
pkls = sorted(glob.glob('pkl/itunes*.pkl'))
if not pkls:
    # fail early with a clear message instead of an opaque error further down
    raise IOError("no pickle files match 'pkl/itunes*.pkl'")
pkl_dfs = [pd.DataFrame(pd.read_pickle(pkl)) for pkl in pkls]
podcastDf = pd.concat(pkl_dfs)
podcastDf.shape

(50959, 16)

In [18]:
# de-duplicate on the podcast name column (pandas keeps the first occurrence by default)
podcastDf = podcastDf.drop_duplicates(subset=['name'])
podcastDf.shape

(49813, 16)

In [19]:
# small sample for testing
# A fixed random_state makes the same rows come back on every run, and the
# size is capped so the cell also works on a frame with fewer than 20 rows.
podcastTest = podcastDf.sample(n=min(20, len(podcastDf)), random_state=42)

In [20]:
def clean_results(results):
    """Flatten, trim and de-duplicate a list of strings.

    Parameters
    ----------
    results : list
        Strings, or lists/tuples of strings (one level of nesting). Bare
        strings and nested lists may be mixed in the same input.

    Returns
    -------
    list
        The whitespace-stripped strings with duplicates removed, in order of
        first appearance.
    """
    # flatten one level of nesting; bare strings are kept whole rather than
    # being iterated character by character
    flat = []
    for entry in results:
        if isinstance(entry, (list, tuple)):
            flat.extend(entry)
        else:
            flat.append(entry)

    # strip whitespace and drop duplicates while preserving first-seen order,
    # so the output is deterministic from run to run
    seen = set()
    cleaned = []
    for text in flat:
        text = text.strip()
        if text not in seen:
            seen.add(text)
            cleaned.append(text)

    return cleaned

In [None]:
# Seconds to wait on a single feed before treating it as unreachable.
FEED_TIMEOUT = 30

# Patterns applied to the raw feed text.
KEYWORDS_RE = re.compile(r'<itunes:keywords>([\w, ]+)</itunes:keywords>')
CATEGORY_RE = re.compile(r'<itunes:category text="([\w "&amp;"]+)"[ />]')
DESCRIPTION_RE = re.compile(r'<description>(<!\[CDATA\[)*(.*?)(\]\]>)*</description>')
SUBTITLE_RE = re.compile(r'<itunes:subtitle>(<!\[CDATA\[)*(.*?)(\]\]>)*</itunes:subtitle')
SUMMARY_RE = re.compile(r'<itunes:summary>(\r\n\s*)*(<!\[CDATA\[)*(.*?)(\]\]>)*(\r\n\s*)*</itunes:summary')
SPONSOR_RE = re.compile(r'\s*(?:brought to you by|sponsored by)')


def _strip_sponsor(summary):
    """Return `summary` cut off at the first sponsor phrase, if it has one.

    Everything from 'brought to you by' / 'sponsored by' onwards (and the
    whitespace just before it) is removed. The result is always a string.
    """
    match = SPONSOR_RE.search(summary)
    if match is None:
        return summary
    return summary[:match.start()]


def _parse_feed(data):
    """Extract the iTunes fields from raw feed text.

    Returns a tuple
    (keywords, categories, description, subtitles, summaries), each a list
    of strings.
    """
    # each <itunes:keywords> tag holds a comma-separated list
    keywords = clean_results([kwlist.split(',') for kwlist in KEYWORDS_RE.findall(data)])
    categories = clean_results(CATEGORY_RE.findall(data))
    # findall returns one tuple per match; index 1 / 2 is the text between
    # the optional CDATA wrappers
    description = [m[1] for m in DESCRIPTION_RE.findall(data)]
    subtitles = [m[1] for m in SUBTITLE_RE.findall(data)]
    summaries = [_strip_sponsor(m[2]) for m in SUMMARY_RE.findall(data)]
    return keywords, categories, description, subtitles, summaries


feed_url_valid = []
itunes_keywords = []
itunes_categories = []
itunes_description = []
itunes_subtitles = []
itunes_summaries = []

for feed_url in podcastDf['feed_url']:
    # Fetch each feed once. A timeout keeps one dead server from stalling the
    # whole run, and raise_for_status() treats HTTP error codes as failures.
    # Catching Exception (not a bare except) lets Ctrl-C still interrupt the loop.
    try:
        r = requests.get(feed_url, timeout=FEED_TIMEOUT)
        r.raise_for_status()
        data = r.text
    except Exception:
        # unreachable / invalid feed: record False in every column
        feed_url_valid.append(False)
        itunes_keywords.append(False)
        itunes_categories.append(False)
        itunes_description.append(False)
        itunes_subtitles.append(False)
        itunes_summaries.append(False)
        continue

    data = convert_unicode(data)
    keywords, categories, description, subtitles, summaries = _parse_feed(data)

    # append to all lists together, only after parsing succeeded, so the
    # lists always stay the same length
    feed_url_valid.append(True)
    itunes_keywords.append(keywords)
    itunes_categories.append(categories)
    itunes_description.append(description)
    itunes_subtitles.append(subtitles)
    itunes_summaries.append(summaries)

In [None]:
# attach the scraped per-feed results to the frame as new columns
scraped_columns = {
    'feed_url_valid': feed_url_valid,
    'itunes_keywords': itunes_keywords,
    'itunes_categories': itunes_categories,
    'itunes_description': itunes_description,
    'itunes_subtitles': itunes_subtitles,
    'itunes_summaries': itunes_summaries,
}
podcastDf = podcastDf.assign(**scraped_columns)