In [85]:
import pandas as pd
import re
import psycopg2
from tqdm import tqdm

In [47]:
# set up database connection
# Opens a psycopg2 connection to the `podcast` database and creates the one
# cursor that the UPDATE loops later in this notebook execute on. `con` is
# also the connection handed to pd.read_sql_query for the SELECTs.
user = 'lindsay'          # NOTE(review): hard-coded role name; consider reading it from the environment
host = 'localhost'        # NOTE(review): assigned but never passed to psycopg2.connect() below -- confirm whether that is intended
dbname = 'podcast'
con = None                # reset the name before connecting; overwritten on the next line when connect() succeeds
con = psycopg2.connect(database = dbname, user = user)
cursor = con.cursor()

In [3]:
# load data
# Read the two pickled DataFrames that this notebook joins on collection_id:
# `scrapeDf` and `apiDf` (presumably the scraped podcast pages and the iTunes
# API results, going by the file names -- confirm).
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# that were produced by this project.
scrapeDf = pd.read_pickle('pkl/scraped_podcasts_COMPLETE.pkl')
apiDf = pd.read_pickle('pkl/itunes_podcasts_v3_COMPLETE.pkl')

In [4]:
# clean column names because SQL doesn't like camelCase
def convert(name):
    """Return ``name`` rewritten from camelCase to lower-case snake_case.

    The conversion runs in two regex passes followed by lower-casing:

    1. an underscore is inserted in front of every capitalised word (an
       upper-case letter followed by one or more lower-case letters) that has
       some character before it;
    2. an underscore is inserted at every remaining boundary where a
       lower-case letter or digit is directly followed by an upper-case
       letter.

    Examples: ``'collectionHdPrice'`` -> ``'collection_hd_price'``,
    ``'HTTPResponseCode'`` -> ``'http_response_code'``. Names that contain no
    upper-case letters are returned unchanged.
    """
    # Pass 1: split off capitalised words.
    with_word_breaks = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
    # Pass 2: split the lower/digit -> upper boundaries pass 1 could not reach.
    with_all_breaks = re.sub('([a-z0-9])([A-Z])', r'\1_\2', with_word_breaks)
    return with_all_breaks.lower()

# Rewrite the column labels of both source frames with convert() so the two
# frames (and the SQL table they are matched against) use the same naming.
for frame in (scrapeDf, apiDf):
    frame.columns = [convert(label) for label in frame.columns]

In [5]:
# join df
# Inner join of the API frame (left) with the scraped frame (right) on their
# shared collection_id column; rows present in only one frame are dropped.
podcastDf = apiDf.merge(scrapeDf, on='collection_id', how='inner')

In [6]:
# convert to int
# Apply int() to every collection_id value so the column holds integers.
podcastDf['collection_id'] = podcastDf['collection_id'].map(int)

In [48]:
# query db for urls
# Pull the artwork URL, iTunes page URL, name and primary key (`id`) of every
# row in the `podcast` table into a DataFrame. view_url and name are the keys
# used further down to match these rows against podcastDf.
query = 'SELECT artwork_url30, view_url, name, id FROM podcast;'
query_results = pd.read_sql_query(query, con)

In [49]:
query_results.head()

Unnamed: 0,artwork_url30,view_url,name,id
0,http://is5.mzstatic.com/image/thumb/Music2/v4/...,https://itunes.apple.com/us/podcast/your-voice...,Your Voice For Hope,18489
1,http://is5.mzstatic.com/image/thumb/Music/v4/1...,https://itunes.apple.com/us/podcast/a.m-best-r...,A.M Best Radio Podcast,12350
2,http://is2.mzstatic.com/image/thumb/Music/v4/2...,https://itunes.apple.com/us/podcast/a-move-rad...,A-move Radio,12351
3,http://is5.mzstatic.com/image/thumb/Music3/v4/...,https://itunes.apple.com/us/podcast/new-beginn...,A New Beginning with Greg Laurie,12352
4,http://is5.mzstatic.com/image/thumb/Music/v4/1...,https://itunes.apple.com/us/podcast/a.t.tipsca...,A.T.TIPScast,12355


In [51]:
query_results[query_results['id']==16392]

Unnamed: 0,artwork_url30,view_url,name,id
5803,http://is4.mzstatic.com/image/thumb/Music2/v4/...,https://itunes.apple.com/us/podcast/no-title/i...,No Title,16392


In [57]:
# Attach each database row's primary key (`id`) to the merged iTunes/scrape
# frame by matching on both the podcast page URL and the podcast name.
# Inner join: rows without a match on either side are dropped.
podcast_db = podcastDf.merge(
    query_results,
    how='inner',
    left_on=['collection_view_url', 'collection_censored_name'],
    right_on=['view_url', 'name']
)

In [58]:
podcastDf.shape

(6284, 39)

In [59]:
query_results.shape

(5804, 4)

In [60]:
podcast_db.shape

(5494, 43)

In [82]:
# Parameterised UPDATE that writes a summary and a collection id onto one
# `podcast` row, selected by primary key. The statement is authored across
# several lines for readability and then flattened to a single line by
# replacing every newline with a space.
query = ' '.join("""
UPDATE podcast
SET raw_summary = (%s),
collection_id = (%s)
WHERE id = (%s);
""".split('\n'))
query

' UPDATE podcast SET raw_summary = (%s), collection_id = (%s) WHERE id = (%s); '

In [90]:
# Write each matched podcast's scraped summary and collection id back to its
# row in the `podcast` table, one UPDATE per DataFrame row, then commit once.
#
# The parameters are passed as a flat tuple of scalars, one value per %s
# placeholder in `query` (raw_summary, collection_id, id). Wrapping each value
# in its own 1-tuple makes psycopg2 render it as a parenthesised SQL list,
# which only happens to work because the statement already parenthesises each
# placeholder.
#
# If any UPDATE fails, the transaction is rolled back before the error is
# re-raised, so the shared connection `con` stays usable for later cells
# instead of being left in an aborted transaction.
try:
    for ind, row in tqdm(podcast_db.iterrows(), total=podcast_db.shape[0]):
        data = (row['podcast_summary'], row['collection_id'], row['id'])
        cursor.execute(query, data)
except Exception:
    con.rollback()
    raise
else:
    con.commit()



In [91]:
podcast_db.dtypes

artist_id                    object
artist_name                  object
artist_view_url              object
artwork_url100               object
artwork_url30_x              object
artwork_url60                object
artwork_url600               object
collection_censored_name     object
collection_explicitness      object
collection_hd_price         float64
collection_id                 int64
collection_name              object
collection_price            float64
collection_view_url          object
content_advisory_rating      object
country                      object
currency                     object
feed_url                     object
genre_ids                    object
genres                       object
kind                         object
primary_genre_name           object
radio_station_url            object
release_date                 object
track_censored_name          object
track_count                 float64
track_explicitness           object
track_hd_price              

In [92]:
test = podcast_db['podcast_summary']

In [97]:
type(test[0])

unicode

In [98]:
str(test[0])

'A quasi-weekly discussion of the plots, themes and highlights of the HBO series "Game of Thrones."  Your hosts Dave Chen (The /Filmcast and The Tobolowsky Files) and Joanna Robinson (contributor for VanityFair.com) will dish up both praise and criticism and will try not to sound too pompous if the book was better. Questions, comments, oaths of fealty?  Email acastofkings@gmail.com'

In [99]:
str(test[0]).replace('"', "'")

"A quasi-weekly discussion of the plots, themes and highlights of the HBO series 'Game of Thrones.'  Your hosts Dave Chen (The /Filmcast and The Tobolowsky Files) and Joanna Robinson (contributor for VanityFair.com) will dish up both praise and criticism and will try not to sound too pompous if the book was better. Questions, comments, oaths of fealty?  Email acastofkings@gmail.com"

In [123]:
# Get from db the podcasts whose raw_summary is still NULL (note: the filter
# is on raw_summary, not on collection_id). These are presumably the rows the
# earlier UPDATE loop did not match -- confirm.
query = "SELECT * FROM podcast WHERE raw_summary ISNULL;"
query_null = pd.read_sql_query(query, con)

In [125]:
query_null.shape

(361, 14)

In [121]:
#query_null['collection_id'] = [int(re.findall(r'/id(\d+)', x)[0]) for x in query_null['view_url']]

In [126]:
query_null.columns

Index([u'id', u'artwork_url30', u'artwork_url60', u'artwork_url100',
       u'artwork_url600', u'explicit', u'name', u'view_url', u'summary',
       u'artist_id', u'episode_descriptions', u'episode_names', u'raw_summary',
       u'collection_id'],
      dtype='object')

In [158]:
# Match the database rows that still lack a summary to the iTunes/scrape
# frame. Both frames carry the join columns under the same names, so a single
# key list is used for both sides. Inner join: unmatched rows are dropped.
shared_keys = ['collection_id', 'artwork_url30']
query_null_merge = pd.merge(query_null, podcastDf, how='inner', on=shared_keys)

In [161]:
query_null_merge[query_null_merge.duplicated('collection_id')]

Unnamed: 0,id,artwork_url30,artwork_url60_x,artwork_url100_x,artwork_url600_x,explicit,name,view_url,summary,artist_id_x,...,track_id,track_name,track_price,track_rental_price,track_view_url,wrapper_type,also_subscribed,episode_descriptions_y,episode_names_y,podcast_summary
74,13105,http://is1.mzstatic.com/image/thumb/Music4/v4/...,http://is1.mzstatic.com/image/thumb/Music4/v4/...,http://is1.mzstatic.com/image/thumb/Music4/v4/...,http://is1.mzstatic.com/image/thumb/Music4/v4/...,Clean,Closet Radio – BlasphuphmusRadio.com,https://itunes.apple.com/us/podcast/closet-rad...,Miss Rikki presents lost music live bands spec...,10346,...,666137437,Closet Radio – BlasphuphmusRadio.com,0,0,https://itunes.apple.com/us/podcast/closet-rad...,track,[],['Closet Radio Episode 204: The Left One Is Fa...,['Closet Radio Episode 204: The Left One Is Fa...,"Miss Rikki presents lost music, live bands, sp..."
75,13105,http://is1.mzstatic.com/image/thumb/Music4/v4/...,http://is1.mzstatic.com/image/thumb/Music4/v4/...,http://is1.mzstatic.com/image/thumb/Music4/v4/...,http://is1.mzstatic.com/image/thumb/Music4/v4/...,Clean,Closet Radio – BlasphuphmusRadio.com,https://itunes.apple.com/us/podcast/closet-rad...,Miss Rikki presents lost music live bands spec...,10346,...,666137437,Closet Radio – BlasphuphmusRadio.com,0,0,https://itunes.apple.com/us/podcast/closet-rad...,track,[],['Closet Radio Episode 204: The Left One Is Fa...,['Closet Radio Episode 204: The Left One Is Fa...,"Miss Rikki presents lost music, live bands, sp..."
76,13105,http://is1.mzstatic.com/image/thumb/Music4/v4/...,http://is1.mzstatic.com/image/thumb/Music4/v4/...,http://is1.mzstatic.com/image/thumb/Music4/v4/...,http://is1.mzstatic.com/image/thumb/Music4/v4/...,Clean,Closet Radio – BlasphuphmusRadio.com,https://itunes.apple.com/us/podcast/closet-rad...,Miss Rikki presents lost music live bands spec...,10346,...,666137437,Closet Radio – BlasphuphmusRadio.com,0,0,https://itunes.apple.com/us/podcast/closet-rad...,track,[],['Closet Radio Episode 204: The Left One Is Fa...,['Closet Radio Episode 204: The Left One Is Fa...,"Miss Rikki presents lost music, live bands, sp..."
201,15375,http://is4.mzstatic.com/image/thumb/Music4/v4/...,http://is4.mzstatic.com/image/thumb/Music4/v4/...,http://is4.mzstatic.com/image/thumb/Music4/v4/...,http://is4.mzstatic.com/image/thumb/Music4/v4/...,Clean,Podcast – Kitchen Sink WordPress,https://itunes.apple.com/us/podcast/podcast-ki...,Listen to Adam share his knowledge and passion...,12194,...,862210489,Podcast – Kitchen Sink WordPress,0,0,https://itunes.apple.com/us/podcast/podcast-ki...,track,"[885696994, 858903353, 882349295, 566009415, 6...",['This week the show is hijacked by Dustin Har...,"['Podcast E100 - The KSWP Show is Hijacked!', ...",Listen to Adam share his knowledge and passion...
202,15375,http://is4.mzstatic.com/image/thumb/Music4/v4/...,http://is4.mzstatic.com/image/thumb/Music4/v4/...,http://is4.mzstatic.com/image/thumb/Music4/v4/...,http://is4.mzstatic.com/image/thumb/Music4/v4/...,Clean,Podcast – Kitchen Sink WordPress,https://itunes.apple.com/us/podcast/podcast-ki...,Listen to Adam share his knowledge and passion...,12194,...,862210489,Podcast – Kitchen Sink WordPress,0,0,https://itunes.apple.com/us/podcast/podcast-ki...,track,"[885696994, 858903353, 882349295, 566009415, 6...",['This week the show is hijacked by Dustin Har...,"['Podcast E100 - The KSWP Show is Hijacked!', ...",Listen to Adam share his knowledge and passion...
203,15375,http://is4.mzstatic.com/image/thumb/Music4/v4/...,http://is4.mzstatic.com/image/thumb/Music4/v4/...,http://is4.mzstatic.com/image/thumb/Music4/v4/...,http://is4.mzstatic.com/image/thumb/Music4/v4/...,Clean,Podcast – Kitchen Sink WordPress,https://itunes.apple.com/us/podcast/podcast-ki...,Listen to Adam share his knowledge and passion...,12194,...,862210489,Podcast – Kitchen Sink WordPress,0,0,https://itunes.apple.com/us/podcast/podcast-ki...,track,"[885696994, 858903353, 882349295, 566009415, 6...",['This week the show is hijacked by Dustin Har...,"['Podcast E100 - The KSWP Show is Hijacked!', ...",Listen to Adam share his knowledge and passion...


In [162]:
# Remove the surplus copies at index labels 75, 76, 202 and 203 -- rows the
# duplicate check on collection_id displayed for ids 13105 and 15375.
#
# The rows are addressed by index *label*, not by position: looking them up
# through query_null_merge.index[[...]] is positional, so re-running the cell
# after the frame has already shrunk would silently delete four unrelated
# rows. errors='ignore' makes a re-run a no-op once the labels are gone.
query_null_merge.drop([75, 76, 202, 203], inplace=True, errors='ignore')

AttributeError: 'NoneType' object has no attribute 'shape'

In [163]:
query_null_merge.shape

(363, 51)

In [167]:
query_null_merge[query_null_merge.duplicated(['artwork_url30', 'collection_id']) == True]

Unnamed: 0,id,artwork_url30,artwork_url60_x,artwork_url100_x,artwork_url600_x,explicit,name,view_url,summary,artist_id_x,...,track_id,track_name,track_price,track_rental_price,track_view_url,wrapper_type,also_subscribed,episode_descriptions_y,episode_names_y,podcast_summary
74,13105,http://is1.mzstatic.com/image/thumb/Music4/v4/...,http://is1.mzstatic.com/image/thumb/Music4/v4/...,http://is1.mzstatic.com/image/thumb/Music4/v4/...,http://is1.mzstatic.com/image/thumb/Music4/v4/...,Clean,Closet Radio – BlasphuphmusRadio.com,https://itunes.apple.com/us/podcast/closet-rad...,Miss Rikki presents lost music live bands spec...,10346,...,666137437,Closet Radio – BlasphuphmusRadio.com,0,0,https://itunes.apple.com/us/podcast/closet-rad...,track,[],['Closet Radio Episode 204: The Left One Is Fa...,['Closet Radio Episode 204: The Left One Is Fa...,"Miss Rikki presents lost music, live bands, sp..."
201,15375,http://is4.mzstatic.com/image/thumb/Music4/v4/...,http://is4.mzstatic.com/image/thumb/Music4/v4/...,http://is4.mzstatic.com/image/thumb/Music4/v4/...,http://is4.mzstatic.com/image/thumb/Music4/v4/...,Clean,Podcast – Kitchen Sink WordPress,https://itunes.apple.com/us/podcast/podcast-ki...,Listen to Adam share his knowledge and passion...,12194,...,862210489,Podcast – Kitchen Sink WordPress,0,0,https://itunes.apple.com/us/podcast/podcast-ki...,track,"[885696994, 858903353, 882349295, 566009415, 6...",['This week the show is hijacked by Dustin Har...,"['Podcast E100 - The KSWP Show is Hijacked!', ...",Listen to Adam share his knowledge and passion...


In [168]:
query_null_merge[query_null_merge['id']==13105]

Unnamed: 0,id,artwork_url30,artwork_url60_x,artwork_url100_x,artwork_url600_x,explicit,name,view_url,summary,artist_id_x,...,track_id,track_name,track_price,track_rental_price,track_view_url,wrapper_type,also_subscribed,episode_descriptions_y,episode_names_y,podcast_summary
73,13105,http://is1.mzstatic.com/image/thumb/Music4/v4/...,http://is1.mzstatic.com/image/thumb/Music4/v4/...,http://is1.mzstatic.com/image/thumb/Music4/v4/...,http://is1.mzstatic.com/image/thumb/Music4/v4/...,Clean,Closet Radio – BlasphuphmusRadio.com,https://itunes.apple.com/us/podcast/closet-rad...,Miss Rikki presents lost music live bands spec...,10346,...,666137437,Closet Radio – BlasphuphmusRadio.com,0,0,https://itunes.apple.com/us/podcast/closet-rad...,track,[],['Closet Radio Episode 204: The Left One Is Fa...,['Closet Radio Episode 204: The Left One Is Fa...,"Miss Rikki presents lost music, live bands, sp..."
74,13105,http://is1.mzstatic.com/image/thumb/Music4/v4/...,http://is1.mzstatic.com/image/thumb/Music4/v4/...,http://is1.mzstatic.com/image/thumb/Music4/v4/...,http://is1.mzstatic.com/image/thumb/Music4/v4/...,Clean,Closet Radio – BlasphuphmusRadio.com,https://itunes.apple.com/us/podcast/closet-rad...,Miss Rikki presents lost music live bands spec...,10346,...,666137437,Closet Radio – BlasphuphmusRadio.com,0,0,https://itunes.apple.com/us/podcast/closet-rad...,track,[],['Closet Radio Episode 204: The Left One Is Fa...,['Closet Radio Episode 204: The Left One Is Fa...,"Miss Rikki presents lost music, live bands, sp..."


In [169]:
# Remove the remaining duplicate rows: those that repeat an earlier row's
# (artwork_url30, collection_id) pair, i.e. exactly the rows the duplicated()
# check on those two columns reports. The first occurrence of each pair is
# kept.
#
# This is done by value rather than with query_null_merge.index[[74, 201]]:
# that lookup is positional, and once rows have been dropped from the frame
# its positions no longer line up with its index labels. After the earlier
# removal of labels 75, 76, 202 and 203, position 201 is a later, unrelated
# row (label 205), so the positional form deletes a valid podcast and leaves
# the duplicate at label 201 in place.
query_null_merge = query_null_merge.drop_duplicates(
    subset=['artwork_url30', 'collection_id']
)

In [170]:
# update db with the summaries for the rows whose raw_summary was NULL
# (this statement only sets raw_summary; collection_id is not written here)
query = """
UPDATE podcast
SET raw_summary = (%s)
WHERE id = (%s);
"""
# Flatten the statement to one line by turning each newline into a space.
query = query.replace('\n', ' ')

# One UPDATE per DataFrame row, committed once at the end. Parameters are a
# flat tuple of scalars, one per %s placeholder (raw_summary, id); wrapping
# each value in its own 1-tuple only works because the SQL already
# parenthesises each placeholder. On any failure the transaction is rolled
# back before re-raising so the shared connection `con` is not left in an
# aborted transaction.
try:
    for ind, row in tqdm(query_null_merge.iterrows(), total=query_null_merge.shape[0]):
        data = (row['podcast_summary'], row['id'])
        cursor.execute(query, data)
except Exception:
    con.rollback()
    raise
else:
    con.commit()

