In [34]:
from gensim import corpora, models, similarities
import pandas as pd
import tqdm
import logging

# Configure the root logger: records at INFO level and above, rendered as
# "<timestamp> : <level name> : <message>".
LOG_FORMAT = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)

In [2]:
# Paths of the saved gensim outputs used by this notebook.
DICTIONARY_PATH = 'gensim/dictionary.dict'
TFIDF_CORPUS_PATH = 'gensim/corpus_tfidf.mm'
LSI_MODEL_PATH = 'gensim/model.lsi'
SIMILARITY_INDEX_PATH = 'gensim/tfidf_lsi_similarities.index'

# Restore the dictionary, the Matrix Market corpus, the LSI model and the
# similarity index; later cells refer to them by these exact names.
dictionary = corpora.Dictionary.load(DICTIONARY_PATH)
corpus = corpora.MmCorpus(TFIDF_CORPUS_PATH)
lsi = models.LsiModel.load(LSI_MODEL_PATH)
index = similarities.MatrixSimilarity.load(SIMILARITY_INDEX_PATH)

In [7]:
# Read the pickled podcast table, then display its (rows, columns) shape.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it comes from a trusted source.
PODCAST_DATA_PATH = 'gensim/preprocessed_text_v2.pkl'
podcast_data = pd.read_pickle(PODCAST_DATA_PATH)
podcast_data.shape

(5804, 3)

In [35]:
# Start from an empty result table with the three output columns.
SIMILARITY_COLUMNS = ['search_id', 'match_id', 'similarity']
similarity_df = pd.DataFrame(columns=SIMILARITY_COLUMNS)

In [39]:
# Rank every podcast against all the others by similarity in LSI space.
#
# For each corpus position, the stored document is projected through `lsi`,
# queried against `index`, the document's own entry is dropped, and the
# remaining matches are stored best-first as one row of `similarity_df`.
#
# Assumes row i of `podcast_data` corresponds to document i of `corpus` and of
# `index` -- TODO confirm against the notebook that built these artifacts.
# NOTE(review): each output row holds three equal-length *lists* (one element
# per match) rather than one row per (search, match) pair. That layout is
# kept unchanged here; confirm it is what consumers of the pickle expect.

# Hoisted out of the loop: podcast id for each positional row, so we do not
# materialise a full DataFrame row for every single match.
podcast_ids = podcast_data['id'].to_numpy()

# One single-row frame per search document, concatenated once after the loop.
# (DataFrame.append was removed in pandas 2.0, and appending inside the loop
# re-copied the whole accumulated frame on every iteration.)
per_search_frames = []

for row_id in tqdm.tqdm(range(podcast_data.shape[0])):
    search_id = int(podcast_ids[row_id])
    search_vec = lsi[corpus[row_id]]
    sims = index[search_vec]
    # sorted() is stable, so equal scores stay in corpus order
    sims = sorted(enumerate(sims), key=lambda item: -item[1])

    # remove self match (should be first); positions are unique, so this
    # drops exactly one entry
    sims = [(doc_pos, sim) for doc_pos, sim in sims if doc_pos != row_id]

    # get match ids and similarities
    match_id = [podcast_ids[doc_pos] for doc_pos, _ in sims]
    similarity = [sim for _, sim in sims]

    per_search_frames.append(pd.DataFrame({'search_id' : [[search_id] * len(match_id)],
                                           'match_id' : [match_id],
                                           'similarity' : [similarity]}))

# Build the result from scratch so that re-running this cell does not stack
# duplicate rows on top of a previous run. No ignore_index: every row keeps
# index label 0, as with the previous append-based construction.
if per_search_frames:
    similarity_df = pd.concat(per_search_frames)
else:
    similarity_df = pd.DataFrame(columns=['search_id', 'match_id', 'similarity'])



In [40]:
# Persist the similarity table as a pickle file.
SIMILARITY_DF_PATH = 'gensim/similarity_df.pkl'
similarity_df.to_pickle(SIMILARITY_DF_PATH)