In [1]:
from gensim import corpora, models, similarities
import pandas as pd
import tqdm
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [51]:
# load gensim output
dictionary = corpora.Dictionary.load('gensim/dictionary.dict')
corpus = corpora.MmCorpus('gensim/corpus_tfidf.mm')
lsi = models.LsiModel.load('gensim/model.lsi')
index = similarities.MatrixSimilarity.load('gensim/tfidf_lsi_similarities.index')

In [15]:
# load podcast data
podcast_data = pd.read_pickle('gensim/preprocessed_text_v2.pkl')
podcast_data.shape

(5804, 3)

In [16]:
podcast_data = podcast_data.reset_index()
del(podcast_data['index'])
podcast_data.head()

Unnamed: 0,id,language,text
0,16392,English,"[host, review, favorit, favorit, classic, pro,..."
1,18489,English,"[everyon, need, hope, everyon, need, encourag,..."
2,12350,English,"[insur, news, interview, rate, announc, insur,..."
3,12351,English,"[amovetv, crew, talk, video, game, esport, lot..."
4,12352,English,"[pastor, greg, lauri, thirtyminut, daili, radi..."


In [34]:
podcast_data.to_pickle('gensim/preprocessed_text_v3.pkl')

In [39]:
podcast_ids = podcast_data.copy(deep=True)
del(podcast_ids['language'])
del(podcast_ids['text'])


In [50]:
podcast_ids.to_pickle('gensim/podcast_id_to_gensim_id.pkl')

In [52]:
podcast_ids = pd.read_pickle('gensim/podcast_id_to_gensim_id.pkl')

In [53]:
podcast_ids.head()

Unnamed: 0,podcast_id
0,16392
1,18489
2,12350
3,12351
4,12352


In [23]:
podcast_id = 17233
row_id = podcast_data[podcast_data['id'] == podcast_id].index
row_id = row_id[0]

In [24]:
search_vec = lsi[corpus[row_id]]
sims = index[search_vec]
sims = sorted(enumerate(sims), key=lambda item: -item[1])

In [26]:
match_ids = [x[0] for x in sims]

In [42]:
sim_df = pd.DataFrame({"id" : [x[0] for x in sims],
                      "similarity" : [x[1] for x in sims]})
sim_df.head()

Unnamed: 0,id,similarity
0,1634,1.0
1,2904,0.820921
2,3862,0.809803
3,1804,0.803452
4,4191,0.800901


In [61]:
ids = list(sim_df['id'].values)
len(ids)

5804

5804

In [60]:
test = [str(x) for x in ids]
test

['1634',
 '2904',
 '3862',
 '1804',
 '4191',
 '1036',
 '2278',
 '4507',
 '7',
 '3581',
 '3533',
 '5425',
 '3364',
 '5347',
 '5130',
 '891',
 '2310',
 '2498',
 '475',
 '1052',
 '4285',
 '4081',
 '1172',
 '1971',
 '3518',
 '1518',
 '5636',
 '1628',
 '5311',
 '916',
 '1590',
 '4786',
 '2663',
 '5411',
 '1629',
 '4258',
 '3313',
 '3772',
 '3019',
 '2619',
 '1595',
 '660',
 '5419',
 '3937',
 '2006',
 '628',
 '45',
 '2075',
 '5485',
 '752',
 '1621',
 '3049',
 '5569',
 '1898',
 '2510',
 '4720',
 '5427',
 '91',
 '1088',
 '4259',
 '735',
 '1311',
 '2579',
 '165',
 '3441',
 '3178',
 '141',
 '3860',
 '2899',
 '3991',
 '1359',
 '5355',
 '4647',
 '3580',
 '5707',
 '3338',
 '190',
 '1627',
 '689',
 '2729',
 '3467',
 '8',
 '3863',
 '765',
 '5457',
 '623',
 '4138',
 '4210',
 '4733',
 '9',
 '661',
 '626',
 '2071',
 '3707',
 '4057',
 '4566',
 '2933',
 '3578',
 '3560',
 '3047',
 '1636',
 '4580',
 '1984',
 '5372',
 '3541',
 '2523',
 '5127',
 '4444',
 '482',
 '926',
 '244',
 '913',
 '1635',
 '4646',
 '3442

In [46]:
podcast_ids.columns = ['podcast_id']

In [48]:
pd.concat([sim_df, podcast_ids], axis = 1)

Unnamed: 0,id,similarity,podcast_id
0,1634,1.000000,16392
1,2904,0.820921,18489
2,3862,0.809803,12350
3,1804,0.803452,12351
4,4191,0.800901,12352
5,1036,0.796088,12355
6,2278,0.795028,16396
7,4507,0.788274,16399
8,7,0.782280,12357
9,3581,0.780547,12359


In [44]:
pd.concat([sim_df, podcast_ids])

Unnamed: 0,id,similarity
0,1634,1.000000
1,2904,0.820921
2,3862,0.809803
3,1804,0.803452
4,4191,0.800901
5,1036,0.796088
6,2278,0.795028
7,4507,0.788274
8,7,0.782280
9,3581,0.780547


In [32]:
podcast_data.iloc[match_ids]

Unnamed: 0,id,language,text
1634,17233,English,"[vote, favorit, polit, itun, listen, stephen, ..."
2904,15427,English,"[cover, georgia, amp, southern, polit, like, k..."
3862,16512,English,"[recent, appear, newshour, polit, analyst, col..."
1804,16522,English,"[pennsylvania, news, opinion, pennliv, patriot..."
4191,18296,English,"[spin, polit, rick, pearson, wgn, anoth, big, ..."
1036,16962,English,"[doc, holliday, rock, split, polit, inspir, fr..."
2278,17540,English,"[join, host, lynn, sebourn, discuss, grassroot..."
4507,16617,English,"[weekli, convers, polit, ken, rudin, mpr, news..."
7,16399,English,"[iowa, polit, weekli, news, analysi, aim, recr..."
3581,16139,English,"[weekli, read, magazin, essay, radic, measur, ..."


In [35]:
similarity_df = pd.DataFrame(columns=['search_id', 'match_id', 'similarity'])

In [39]:
for row_id in tqdm.tqdm(range(podcast_data.shape[0])):
    search_id = int(podcast_data.iloc[row_id]['id'])
    search_vec = lsi[corpus[row_id]]
    sims = index[search_vec]
    sims = sorted(enumerate(sims), key=lambda item: -item[1])
    
    # remove self match (should be first)
    for i, (id, sim) in enumerate(sims):
        if id == row_id:
            del sims[i]
            break
    
    # get match ids and similarities
    match_id = [podcast_data.iloc[match[0]]['id'] for match in sims]
    similarity = [match[1] for match in sims]
    
    similarity_df = similarity_df.append(pd.DataFrame({'search_id' : [[search_id] * len(match_id)],
                                                      'match_id' : [match_id],
                                                      'similarity' : [similarity]}))



In [40]:
similarity_df.to_pickle('gensim/similarity_df.pkl')

In [38]:
print index

MatrixSimilarity<5804 docs, 100 features>
