In [23]:
import psycopg2
import pandas as pd

In [24]:
# Connection settings for the PostgreSQL database this notebook loads into.
username = 'lindsay'
dbname = 'podcast'

In [25]:
# Load the podcast DataFrame that the rest of the notebook inserts into the
# database (presumably the output of an earlier cleaning step — confirm).
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# that were produced by a trusted source.
podcastDf = pd.read_pickle('pkl/clean_podcast_data.pkl')

In [26]:
# Open the database connection using the settings defined above.
con = psycopg2.connect(database=dbname, user=username)

In [27]:
# Single cursor reused by every INSERT/UPDATE cell below.
cursor = con.cursor()

In [28]:
# Remove the id columns carried in the pickle; they are deleted in place,
# one column at a time.
for unused_column in ('id', 'artist_id', 'genre_ids'):
    del podcastDf[unused_column]

In [30]:
# Load every podcast row into the `podcast` table and remember the id the
# database returns for each inserted row.
query = (
    "INSERT INTO podcast (artwork_url30, artwork_url60, artwork_url100, artwork_url600, "
    "explicit, name, view_url, summary, episode_descriptions, episode_names) "
    "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s) RETURNING id;"
)
# DataFrame columns, listed in the same order as the table columns in the query.
podcast_source_columns = [
    'artwork_url30', 'artwork_url60', 'artwork_url100', 'artwork_url600',
    'content_advisory_rating', 'collection_censored_name', 'collection_view_url',
    'podcast_summary', 'clean_episode_description', 'clean_episode_name',
]
podcast_id = []
for _, pod_row in podcastDf.iterrows():
    cursor.execute(query, tuple(pod_row[col] for col in podcast_source_columns))
    podcast_id.append(cursor.fetchone()[0])

# One commit for the whole batch, then attach the returned ids to the frame
# (same row order as the inserts).
con.commit()
podcastDf['podcast_id'] = podcast_id

In [31]:
# Distinct (artist_name, artist_view_url) pairs found in the podcast data.
artistDf = podcastDf[['artist_name', 'artist_view_url']].drop_duplicates()
artistDf.shape

(4863, 2)

In [32]:
# Store each distinct artist in the `artist` table and collect the ids the
# database returns, in row order.
query = "INSERT INTO artist (view_url, name) VALUES (%s, %s) RETURNING id;"
artist_id = []
for _, artist_row in artistDf.iterrows():
    cursor.execute(query, (artist_row['artist_view_url'], artist_row['artist_name']))
    artist_id.append(cursor.fetchone()[0])
con.commit()
artistDf['artist_id'] = artist_id

In [33]:
# Flatten the per-podcast genre collections into a de-duplicated list.
genreDf = podcastDf['genres']
genreList = list({genre for row in genreDf for genre in row})

In [34]:
# Store each distinct genre name in the `genre` table, then build a
# name -> genre_id lookup frame from the returned ids.
query = "INSERT INTO genre (name) VALUES (%s) RETURNING id;"
genre_id = []
for genre_name in genreList:
    cursor.execute(query, (genre_name,))
    genre_id.append(cursor.fetchone()[0])
con.commit()
genreDf = pd.DataFrame({'name': genreList, 'genre_id': genre_id})

In [35]:
# Attach each podcast's artist_id.
# artistDf is unique per (artist_name, artist_view_url) pair, so the join must
# use both columns: joining on artist_name alone fans a podcast out to every
# artist row sharing that name and can pair it with the wrong artist_id.
podcast_artist = pd.merge(podcastDf, artistDf, how='inner',
                          on=['artist_name', 'artist_view_url'])

In [36]:
# Write each podcast's artist_id back onto its row in the `podcast` table.
query = "UPDATE podcast SET artist_id=(%s) WHERE id = (%s);"
for ind, row in podcast_artist.iterrows():
    data = (row['artist_id'], row['podcast_id'])
    cursor.execute(query, data)
# Commit once after all updates (as the other load cells do) instead of
# committing a separate transaction for every row.
con.commit()

In [37]:
# Keep only the columns needed to build the podcast -> genre mapping:
# the database id and each podcast's collection of genres.
test = podcastDf[['podcast_id', 'genres']]

In [38]:
# Expand each podcast's genre collection into one (podcast_id, genre_name)
# row per genre.
# The rows are gathered first and the frame is built once: growing a frame
# with DataFrame.append inside a loop copies it on every iteration, and
# DataFrame.append no longer exists in pandas 2.x.
podcast_genre_records = [
    (row['podcast_id'], genre)
    for _, row in test.iterrows()
    for genre in row['genres']
]
# dtype=object keeps the ids as plain Python objects, as when appending onto
# an empty (object-dtype) frame.
podcast_genre = pd.DataFrame(podcast_genre_records,
                             columns=['podcast_id', 'genre_name'],
                             dtype=object)

In [39]:
# Look up the genre_id for every (podcast_id, genre_name) pair by name.
podcast_genre = podcast_genre.merge(
    genreDf,
    how='inner',
    left_on='genre_name',
    right_on='name',
)

In [40]:
# Write one podcast_has_genre row per (podcast, genre) pair, committing once
# at the end.
query = "INSERT INTO podcast_has_genre (podcast_id, genre_id) VALUES (%s, %s);"
for _, mapping in podcast_genre.iterrows():
    cursor.execute(query, (mapping['podcast_id'], mapping['genre_id']))
con.commit()

In [41]:
# Expand each podcast's also_subscribed collection into one
# (podcast_id, also_subscribed) row per entry.
tmp_also_subscribed = podcastDf[['podcast_id', 'also_subscribed']]
# Build the frame once from a list of rows: DataFrame.append inside a loop
# copies the frame on every iteration and no longer exists in pandas 2.x.
# dtype=object keeps the values as plain Python objects, as when appending
# onto an empty (object-dtype) frame.
also_subscribed = pd.DataFrame(
    [
        (row['podcast_id'], sub)
        for _, row in tmp_also_subscribed.iterrows()
        for sub in row['also_subscribed']
    ],
    columns=['podcast_id', 'also_subscribed'],
    dtype=object,
)

In [42]:
# podcast_id here is the subscribing side of the pair; rename it so it does
# not clash with the podcast_id column merged in afterwards.
also_subscribed = also_subscribed.rename(columns={'podcast_id': 'initial_podcast_id'})

In [43]:
# Lookup table pairing each database podcast_id with its collection_id.
podcastId = podcastDf.loc[:, ['podcast_id', 'collection_id']]
podcastId.head()

Unnamed: 0,podcast_id,collection_id
0,12349,515836681
1,12350,126723118
2,12351,699748055
3,12352,80693391
4,12353,579605641


In [44]:
# Resolve each also_subscribed value to a database podcast_id by matching it
# against collection_id; the inner join drops values with no matching podcast.
also_subscribed = also_subscribed.merge(
    podcastId,
    how='inner',
    left_on='also_subscribed',
    right_on='collection_id',
)

In [45]:
# Write one also_subscribed row per (initial podcast, subscribed podcast)
# pair, committing once at the end.
query = "INSERT INTO also_subscribed (initial_podcast, subscribed_podcast) VALUES (%s, %s);"
for _, pair in also_subscribed.iterrows():
    cursor.execute(query, (pair['initial_podcast_id'], pair['podcast_id']))
con.commit()