In [1]:
from gensim import corpora, models, similarities
import pandas as pd
import tqdm
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [71]:
# load gensim output
# Artifacts stored under gensim/: the token dictionary, the Matrix Market
# corpus, the LSI model, and the similarity index that is queried below.
dictionary = corpora.Dictionary.load('gensim/dictionary.dict')
corpus = corpora.MmCorpus('gensim/corpus_tfidf.mm')  # presumably TF-IDF weighted, going by the file name -- confirm
lsi = models.LsiModel.load('gensim/model.lsi')
index = similarities.MatrixSimilarity.load('gensim/tfidf_lsi_similarities.index')

In [15]:
# load podcast data
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files that were produced by this project.
podcast_data = pd.read_pickle('gensim/preprocessed_text_v2.pkl')
podcast_data.shape

(5804, 3)

In [16]:
# Give the frame a fresh 0..n-1 index. drop=True discards the old index
# directly instead of materialising it as an 'index' column and deleting it
# afterwards; the two-step form raises KeyError when the old index is named
# and would delete a genuine 'index' column if one existed.
podcast_data = podcast_data.reset_index(drop=True)
podcast_data.head()

Unnamed: 0,id,language,text
0,16392,English,"[host, review, favorit, favorit, classic, pro,..."
1,18489,English,"[everyon, need, hope, everyon, need, encourag,..."
2,12350,English,"[insur, news, interview, rate, announc, insur,..."
3,12351,English,"[amovetv, crew, talk, video, game, esport, lot..."
4,12352,English,"[pastor, greg, lauri, thirtyminut, daili, radi..."


In [34]:
podcast_data.to_pickle('gensim/preprocessed_text_v3.pkl')

In [39]:
# Independent frame holding everything except the language and text columns,
# so each remaining row pairs a podcast id with its row label.
podcast_ids = podcast_data.drop(['language', 'text'], axis=1)


In [50]:
podcast_ids.to_pickle('gensim/podcast_id_to_gensim_id.pkl')

In [52]:
podcast_ids = pd.read_pickle('gensim/podcast_id_to_gensim_id.pkl')

In [53]:
podcast_ids.head()

Unnamed: 0,podcast_id
0,16392
1,18489
2,12350
3,12351
4,12352


In [72]:
podcast_id = 17233
# Rows whose id equals the chosen podcast; the index label of the first one
# is used as the row/document number (IndexError if there is no such row).
matching_rows = podcast_data.loc[podcast_data['id'] == podcast_id]
row_id = matching_rows.index[0]

In [73]:
row_id

1634

In [74]:
# Pass the selected podcast's corpus vector through the LSI model, then look
# the resulting vector up in the similarity index.
search_vec = lsi[corpus[row_id]]
sims = index[search_vec]  # array of float32 scores; presumably one per indexed document -- confirm
# The scores are left unsorted here; ordering is done on a DataFrame further down.

In [75]:
sims

array([ 0.02857188,  0.09426528,  0.12806995, ...,  0.06971769,
        0.16650036,  0.05022383], dtype=float32)

In [68]:
#match_ids = [x[0] for x in sims]

IndexError: invalid index to scalar variable.

In [77]:
sim_df = pd.DataFrame({"similarity" : sims})
sim_df.head()

Unnamed: 0,similarity
0,0.028572
1,0.094265
2,0.12807
3,0.064543
4,0.056373


In [78]:
podcast_ids.columns = ['podcast_id']

In [81]:
# Put the podcast ids next to the scores. concat with axis=1 aligns on index
# labels, so this assumes sim_df and podcast_ids share the same row labels.
# NOTE(review): re-running this cell appends another podcast_id column.
sim_df = pd.concat([sim_df, podcast_ids], axis = 1)

In [85]:
sim_df = sim_df.sort_values('similarity', ascending=False)

In [86]:
sim_df[sim_df['podcast_id'] != podcast_id]

Unnamed: 0,similarity,podcast_id
2904,0.820921,15427
3862,0.809803,16512
1804,0.803452,16522
4191,0.800901,18296
1036,0.796088,16962
2278,0.795028,17540
4507,0.788274,16617
7,0.782280,16399
3581,0.780547,16139
3533,0.769447,18251


In [87]:
# Every row except the searched podcast itself.
is_other_podcast = sim_df['podcast_id'] != podcast_id
sim_df_no_self = sim_df.loc[is_other_podcast]

In [91]:
# take top ids
# Slice to the first 100 rows *before* converting, so only the ids that are
# actually used get turned into integer strings (previously every row was
# converted and a bad value anywhere in the column could fail the cell).
ids = [str(int(x)) for x in sim_df_no_self['podcast_id'].values[:100]]

In [95]:
import psycopg2
# Connection settings for the podcast database.
# NOTE(review): host is assigned but never passed to connect().
user, host, dbname = 'lindsay', 'localhost', 'podcast'

con = None
con = psycopg2.connect(user=user, database=dbname)
cursor = con.cursor()

def pg_int_array(the_list):
    """Render an iterable of ids as a parenthesised, comma-separated list.

    Elements are passed through ``str`` first, so real integers are accepted
    as well as strings (a bare ``','.join`` raises TypeError on integers).

    Example: ``pg_int_array([1, 2, 3])`` returns ``'(1,2,3)'``; an empty
    input returns ``'()'``.

    NOTE(review): the elements are not validated or escaped. Prefer bound
    query parameters over splicing this text into SQL.
    """
    return '(' + ','.join(str(item) for item in the_list) + ')'

# Fetch display metadata for the selected podcasts.
# The ids travel as a bound parameter instead of being spliced into the SQL
# text: psycopg2 adapts a Python list to a PostgreSQL array, and
# `= ANY(array)` also accepts an empty list, where `IN ()` is a syntax error.
query = """
SELECT name, view_url, artwork_url100, id
FROM podcast
WHERE id = ANY(%s);
"""
query = query.replace('\n', ' ')

# ids holds integer strings; convert them back so the array is sent as integers.
# NOTE(review): assumes podcast.id is an integer column -- confirm against the schema.
cursor.execute(query, ([int(x) for x in ids],))
query_results = cursor.fetchall()

In [104]:
columnNames = ['name', 'view_url', 'artwork_url100', 'id']
podcast_results = pd.DataFrame(columns=columnNames)

In [106]:
# Build the results frame in one step. Each fetched row is consumed
# positionally as (name, view_url, artwork_url100, id), the same order the
# columns were previously picked out one list comprehension at a time. This
# also no longer depends on an empty podcast_results frame already existing.
podcast_results = pd.DataFrame(query_results,
                               columns=['name', 'view_url', 'artwork_url100', 'id'])
podcast_results.head()

Unnamed: 0,name,view_url,artwork_url100,id
0,AACONS,https://itunes.apple.com/us/podcast/aacons/id5...,http://is3.mzstatic.com/image/thumb/Music5/v4/...,12357
1,Aaron Rand Show,https://itunes.apple.com/us/podcast/aaron-rand...,http://is1.mzstatic.com/image/thumb/Music/v4/0...,12359
2,Adam Thompson Show,https://itunes.apple.com/us/podcast/adam-thomp...,http://is4.mzstatic.com/image/thumb/Music5/v4/...,12416
3,AirTalk,https://itunes.apple.com/us/podcast/airtalk/id...,http://is3.mzstatic.com/image/thumb/Music69/v4...,12458
4,Alonzo Bodden: Who's Paying Attention,https://itunes.apple.com/us/podcast/alonzo-bod...,http://is1.mzstatic.com/image/thumb/Music3/v4/...,12532


In [108]:
# Inner join: keep only the scored podcasts for which metadata was fetched,
# matching the score frame's podcast_id against the metadata frame's id.
merge_results = sim_df_no_self.merge(
    podcast_results,
    how='inner',
    left_on='podcast_id',
    right_on='id',
)

In [112]:
merge_results.head()

Unnamed: 0,similarity,podcast_id,name,view_url,artwork_url100,id
0,0.820921,15427,Kudzu Vine,https://itunes.apple.com/us/podcast/kudzu-vine...,http://is4.mzstatic.com/image/thumb/Music3/v4/...,15427
1,0.809803,16512,PBS NewsHour » Shields and Brooks,https://itunes.apple.com/us/podcast/pbs-newsho...,http://is1.mzstatic.com/image/thumb/Music6/v4/...,16512
2,0.803452,16522,PennLive News Podcasts,https://itunes.apple.com/us/podcast/pennlive-n...,http://is1.mzstatic.com/image/thumb/Music/v4/e...,16522
3,0.800901,18296,WGN - The Rick Pearson Podcast,https://itunes.apple.com/us/podcast/wgn-the-ri...,http://is3.mzstatic.com/image/thumb/Music5/v4/...,18296
4,0.796088,16962,Rock Splitting Politics – Doc Holliday,https://itunes.apple.com/us/podcast/rock-split...,http://is1.mzstatic.com/image/thumb/Music4/v4/...,16962


In [35]:
similarity_df = pd.DataFrame(columns=['search_id', 'match_id', 'similarity'])

In [39]:
# For every podcast, score it against the whole index and record the other
# podcasts ordered from most to least similar.

# Hoisted out of the loop: podcast ids as a plain array, looked up by row
# position, instead of building a row object via .iloc for every match.
all_ids = podcast_data['id'].values

# One single-row frame per search podcast, concatenated once after the loop.
# Appending to similarity_df on every iteration re-copies everything
# accumulated so far, and DataFrame.append was removed in pandas 2.0.
per_podcast_frames = []
for row_id in tqdm.tqdm(range(podcast_data.shape[0])):
    search_id = int(all_ids[row_id])
    search_vec = lsi[corpus[row_id]]
    sims = index[search_vec]
    # (document number, score) pairs, highest score first
    sims = sorted(enumerate(sims), key=lambda item: -item[1])

    # remove self match (should be first); a comprehension avoids both
    # deleting from the list while iterating and shadowing the builtin `id`
    sims = [(doc_id, sim) for doc_id, sim in sims if doc_id != row_id]

    # get match ids and similarities
    match_id = [all_ids[doc_id] for doc_id, _ in sims]
    similarity = [sim for _, sim in sims]

    # NOTE(review): every value is wrapped in an outer list, so this is ONE
    # row whose cells each hold a whole list (search_id repeated, all match
    # ids, all scores) rather than one row per match. The shape is kept as it
    # was because readers of similarity_df.pkl may rely on it -- confirm that
    # this is intended.
    per_podcast_frames.append(pd.DataFrame({'search_id' : [[search_id] * len(match_id)],
                                            'match_id' : [match_id],
                                            'similarity' : [similarity]}))

# Same rows, in the same order, as appending one frame at a time.
similarity_df = pd.concat([similarity_df] + per_podcast_frames)



In [40]:
similarity_df.to_pickle('gensim/similarity_df.pkl')

In [38]:
# Function-call form prints the same text on Python 2 and also runs on
# Python 3, where the bare print statement is a SyntaxError.
print(index)

MatrixSimilarity<5804 docs, 100 features>
