In [58]:
import psycopg2
import pandas as pd
from gensim import corpora, models, similarities
import preprocess_text
import get_similarity

In [2]:
dbname = 'podcast'
username = 'lindsay'

con = psycopg2.connect(database = dbname, user = username)
cursor = con.cursor()

In [28]:
id_mapping = pd.read_pickle('flask_app/flask_podcast/static/data/podcast_id_to_gensim_id.pkl')

In [62]:
# load gensim data
index = similarities.MatrixSimilarity.load('gensim/tfidf_lsi_similarities.index')
corpus = corpora.MmCorpus('gensim/corpus_tfidf.mm')
lsi = models.LsiModel.load('gensim/model.lsi')

In [8]:
query = """
SELECT name, podcast.id, podcast_has_genre.genre_id
FROM podcast
INNER JOIN podcast_has_genre
ON podcast.id = podcast_has_genre.podcast_id;
"""
query = query.replace('\n', ' ')
cursor.execute(query)

podcast_df = cursor.fetchall()

In [17]:
query = "SELECT name, id FROM genre;"
cursor.execute(query)

genre_df = cursor.fetchall()

In [11]:
podcast_df = pd.DataFrame({'podcast_name' : [x[0] for x in podcast_df],
                           'podcast_id' : [x[1] for x in podcast_df],
                           'genre_id' : [x[2] for x in podcast_df]})

In [19]:
genre_df = pd.DataFrame({'genre_name' : [x[0] for x in genre_df],
                         'genre_id' : [x[1] for x in genre_df]})

In [38]:
podcast_genre = pd.merge(podcast_df, genre_df, how = 'inner', on='genre_id')

In [42]:
del(podcast_df['genre_id'])

In [30]:
# run query for subscribed data
subscribed_query = "SELECT initial_podcast, subscribed_podcast FROM also_subscribed;"
cursor.execute(subscribed_query)
subscribed_df = cursor.fetchall()

In [32]:
subscribed_df = pd.DataFrame({'podcast_id' : [x[0] for x in subscribed_df],
                              'subscribed_id' : [x[1] for x in subscribed_df]})

In [33]:
subscribed_df.head()

Unnamed: 0,podcast_id,subscribed_id
0,12351,12824
1,15884,12824
2,16597,12824
3,17423,12824
4,12351,13129


In [45]:
podcast_subscribed = pd.merge(podcast_df, subscribed_df, how = 'inner', on='podcast_id')

In [46]:
podcast_subscribed.head()

Unnamed: 0,podcast_id,podcast_name,subscribed_id
0,12382,Aca-Media Podcast - Aca-Media,16087
1,12382,Aca-Media Podcast - Aca-Media,16082
2,12382,Aca-Media Podcast - Aca-Media,16087
3,12382,Aca-Media Podcast - Aca-Media,16082
4,12404,Acid Pop Cult,16620


In [47]:
get_similarity.map_id(podcast_subscribed.iloc[0]['podcast_id'], id_mapping)

32

In [49]:
# add in gensim IDs
podcast_subscribed['podcast_gensim_id'] = [get_similarity.map_id(x, id_mapping) for x in podcast_subscribed['podcast_id']]
podcast_subscribed['subscribed_gensim_id'] = [get_similarity.map_id(x, id_mapping) for x in podcast_subscribed['subscribed_id']]
podcast_genre['podcast_gensim_id'] = [get_similarity.map_id(x, id_mapping) for x in podcast_genre['podcast_id']]

In [54]:
podcast_genre.head()

Unnamed: 0,genre_id,podcast_id,podcast_name,genre_name,podcast_gensim_id
0,214,12349,A Cast of Kings - A Game of Thrones Podcast,TV & Film,4221
1,214,12382,Aca-Media Podcast - Aca-Media,TV & Film,32
2,214,12404,Acid Pop Cult,TV & Film,39
3,214,12406,Across the Universe: The Chicks With Accents P...,TV & Film,41
4,214,12413,Actors Anonymous Podcast,TV & Film,49


In [64]:
# podcast_genre similarity analysis
test_sim = get_similarity.sim(podcast_genre.iloc[0]['podcast_id'], id_mapping, corpus, lsi, index)

In [78]:
# sample random podcasts
frac_podcasts = 10
sub_sample = id_mapping.sample(frac = frac_podcasts)

In [79]:
# get similarity scores for sub_sample
sim_scores = pd.DataFrame(columns = ['search_podcast_id', 'result_podcast_id', 'similarity'])
for ind, row in sub_sample.iterrows():
    sim_result = get_similarity.sim(row['podcast_id'], id_mapping, corpus, lsi, index)
    this_result = pd.DataFrame({'search_podcast_id' : [int(row['podcast_id'])] * sim_result.shape[0],
                                'result_podcast_id' : sim_result['podcast_id'],
                                'similarity' : sim_result['similarity']})
    sim_scores = sim_scores.append(this_result)

In [85]:
# drop rows where result_podcast_id == search_podcast_id
sim_scores = sim_scores[sim_scores['result_podcast_id'] != sim_scores['search_podcast_id']]

In [86]:
podcast_subscribed_sim = pd.merge(podcast_subscribed, sim_scores, how = 'inner', left_on=['podcast_id', 'subscribed_id'], right_on=['search_podcast_id', 'result_podcast_id'])

In [93]:
# for some reason duplicates rows, so remove now
podcast_subscribed_sim = podcast_subscribed_sim.drop_duplicates()
podcast_subscribed_sim.shape

(686, 8)

In [95]:
# remove duplicate columns
del(podcast_subscribed_sim['podcast_gensim_id'])
del(podcast_subscribed_sim['subscribed_gensim_id'])
del(podcast_subscribed_sim['result_podcast_id'])
del(podcast_subscribed_sim['search_podcast_id'])

In [96]:
podcast_subscribed_sim.head()

Unnamed: 0,podcast_id,podcast_name,subscribed_id,similarity
0,12382,Aca-Media Podcast - Aca-Media,16087,0.763451
2,12382,Aca-Media Podcast - Aca-Media,16082,0.622512
4,12505,All My Heroes Wear Masks,14949,0.599157
6,12505,All My Heroes Wear Masks,16319,0.452908
8,13098,BLACK TRIBBLES,16054,0.708524


In [101]:
podcast_subscribed_sim.loc[:, 'subscribe_status'] = True

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [103]:
# get similarity for not subscribed
podcast_unsubscribed_sim = pd.merge(sim_scores, podcast_subscribed, how = 'left', right_on=['podcast_id', 'subscribed_id'], left_on=['search_podcast_id', 'result_podcast_id'])

In [106]:
# remove unneeded rows
del(podcast_unsubscribed_sim['podcast_id'])
del(podcast_unsubscribed_sim['podcast_name'])
del(podcast_unsubscribed_sim['subscribed_id'])
del(podcast_unsubscribed_sim['podcast_gensim_id'])
del(podcast_unsubscribed_sim['subscribed_gensim_id'])

In [107]:
podcast_unsubscribed_sim.loc[:, 'subscribe_status'] = False

In [108]:
podcast_unsubscribed_sim.head()

Unnamed: 0,result_podcast_id,search_podcast_id,similarity,subscribe_status
0,13596,18463,0.695872,False
1,16583,18463,0.655369,False
2,14124,18463,0.622745,False
3,14484,18463,0.620272,False
4,16251,18463,0.596339,False


In [109]:
del(podcast_subscribed_sim['podcast_name'])

In [111]:
podcast_sim_by_subscribe.shape

(3367306, 8)

In [112]:
podcast_unsubscribed_sim.shape[0] + podcast_subscribed_sim.shape[0]

3367992

In [121]:
podcast_subscribed_sim.columns=['search_podcast_id', 'result_podcast_id', 'similarity', 'subscribe_status']

In [115]:
podcast_unsubscribed_sim.head()

Unnamed: 0,result_podcast_id,search_podcast_id,similarity,subscribe_status
0,13596,18463,0.695872,False
1,16583,18463,0.655369,False
2,14124,18463,0.622745,False
3,14484,18463,0.620272,False
4,16251,18463,0.596339,False


In [117]:
podcast_subscribed_sim.head()

Unnamed: 0,search_podcast_id,result_pocast_id,similarity,subscribe_status
0,12382,16087,0.763451,True
2,12382,16082,0.622512,True
4,12505,14949,0.599157,True
6,12505,16319,0.452908,True
8,13098,16054,0.708524,True


In [122]:
podcast_sim_by_subscribe = pd.concat([podcast_subscribed_sim, podcast_unsubscribed_sim], axis=0, ignore_index=True)
podcast_sim_by_subscribe.shape

(3367992, 4)

In [123]:
podcast_sim_by_subscribe.head()

Unnamed: 0,result_podcast_id,search_podcast_id,similarity,subscribe_status
0,16087,12382,0.763451,True
1,16082,12382,0.622512,True
2,14949,12505,0.599157,True
3,16319,12505,0.452908,True
4,16054,13098,0.708524,True


In [124]:
# save to csv
podcast_sim_by_subscribe.to_csv('csv/podcast_sim_by_subscribe.csv')

In [125]:
sim_scores.head()

Unnamed: 0,result_podcast_id,search_podcast_id,similarity
1171,13596,18463,0.695872
3914,16583,18463,0.655369
1663,14124,18463,0.622745
2031,14484,18463,0.620272
3666,16251,18463,0.596339


In [126]:
podcast_genre.head()

Unnamed: 0,genre_id,podcast_id,podcast_name,genre_name,podcast_gensim_id
0,214,12349,A Cast of Kings - A Game of Thrones Podcast,TV & Film,4221
1,214,12382,Aca-Media Podcast - Aca-Media,TV & Film,32
2,214,12404,Acid Pop Cult,TV & Film,39
3,214,12406,Across the Universe: The Chicks With Accents P...,TV & Film,41
4,214,12413,Actors Anonymous Podcast,TV & Film,49


In [128]:
podcast_same_genre = pd.merge(podcast_genre, podcast_genre, how='inner', on='genre_id')

In [130]:
del(podcast_same_genre['podcast_name_x'])
del(podcast_same_genre['genre_name_x'])
del(podcast_same_genre['podcast_gensim_id_x'])
del(podcast_same_genre['podcast_name_y'])
del(podcast_same_genre['genre_name_y'])
del(podcast_same_genre['podcast_gensim_id_y'])

In [132]:
sim_scores.head()

Unnamed: 0,result_podcast_id,search_podcast_id,similarity
1171,13596,18463,0.695872
3914,16583,18463,0.655369
1663,14124,18463,0.622745
2031,14484,18463,0.620272
3666,16251,18463,0.596339


In [131]:
podcast_same_genre.head()

Unnamed: 0,genre_id,podcast_id_x,podcast_id_y
0,214,12349,12349
1,214,12349,12382
2,214,12349,12404
3,214,12349,12406
4,214,12349,12413


In [133]:
podcast_sim_same_genre = pd.merge(podcast_same_genre, sim_scores, how='inner', left_on=['podcast_id_x', 'podcast_id_y'], right_on=['search_podcast_id', 'result_podcast_id'])

In [135]:
del(podcast_sim_same_genre['podcast_id_x'])
del(podcast_sim_same_genre['podcast_id_y'])

In [136]:
podcast_sim_same_genre.loc[:, 'genre_status'] = True

In [138]:
podcast_sim_diff_genre = pd.merge(sim_scores, podcast_same_genre, how='left', right_on=['podcast_id_x', 'podcast_id_y'], left_on=['search_podcast_id', 'result_podcast_id'])

In [139]:
podcast_sim_same_genre.shape

(4149385, 5)

In [140]:
podcast_sim_diff_genre.shape

(4149385, 6)

In [141]:
podcast_sim_diff_genre.head()

Unnamed: 0,result_podcast_id,search_podcast_id,similarity,genre_id,podcast_id_x,podcast_id_y
0,13596,18463,0.695872,267,18463,13596
1,13596,18463,0.695872,234,18463,13596
2,13596,18463,0.695872,210,18463,13596
3,16583,18463,0.655369,267,18463,16583
4,16583,18463,0.655369,234,18463,16583
