In [1]:
from gensim import utils, corpora, models, similarities
from simserver import SessionServer
import simserver
import pandas as pd
import numpy as np
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
import psycopg2
from tqdm import tqdm
import re
import preprocess_text

In [2]:
# Open a connection to the 'podcast' PostgreSQL database as user 'lindsay'
# and create the cursor that later cells use to run queries.
dbname = 'podcast'
username = 'lindsay'
connection_args = {'database': dbname, 'user': username}
con = psycopg2.connect(**connection_args)
cursor = con.cursor()

In [3]:
# Download the id, collection id and every descriptive text field for each
# row of the podcast table.
query = "SELECT id, collection_id, summary, episode_descriptions, episode_names FROM podcast"

# The second positional argument of cursor.execute() is the query-parameter
# sequence/mapping, not the connection. This query has no placeholders, so
# no parameters are passed at all.
cursor.execute(query)

# Each result row is a tuple in the same column order as the SELECT list.
query_results = cursor.fetchall()

In [4]:
# Put the query results into a dataframe, then build one cleaned text blob per
# podcast by concatenating its text fields and stripping punctuation.
query_df = pd.DataFrame({'id' : [x[0] for x in query_results],
                         'collection_id' : [x[1] for x in query_results],
                         'summary' : [x[2] for x in query_results],
                         'episode_descriptions' : [x[3] for x in query_results],
                         'episode_names' : [x[4] for x in query_results]})

# Matches runs of characters that are neither whitespace nor word characters,
# plus underscores (which \w alone would keep).
non_alnum_pattern = re.compile(r'([^\s\w]|_)+')

# Collect one record per podcast and build the dataframe once at the end.
# Appending a one-row frame on every iteration copied the whole frame each
# time (quadratic), left every row with the index label 0, and relied on
# DataFrame.append, which recent pandas releases no longer provide.
podcast_records = []
for ind, row in tqdm(query_df.iterrows(), total = query_df.shape[0]):
    # concatenate; a missing (None/NaN) field would make str.join raise, so
    # it is treated as empty text instead
    fields = [row['summary'], row['episode_descriptions'], row['episode_names']]
    text = ' '.join([field if pd.notnull(field) else '' for field in fields])

    # remove non-alphanumeric, non-space
    text = non_alnum_pattern.sub('', text)

    podcast_records.append({'id' : row['id'],
                            'collection_id' : row['collection_id'],
                            'text' : text})

# Passing the columns explicitly keeps the expected schema (and column order)
# even when the query returned no rows.
podcast_text = pd.DataFrame(podcast_records, columns=['id', 'collection_id', 'text'])



In [5]:
# Create the similarity server session, passing the directory path it is
# given to work with.
simserver_dir = '/tmp/simserver/'
server = SessionServer(simserver_dir)

In [6]:
# The collection id is not used by the following cells; remove the column
# in place.
del podcast_text['collection_id']

In [None]:
# Run each podcast's cleaned text through the project's preprocessing helper
# and store the resulting token list next to the text.
podcast_text['tokens'] = list(map(preprocess_text.preprocess_text, podcast_text['text']))

In [32]:
# Preview the first five rows to sanity-check the text and token columns.
podcast_text.head(n=5)

Unnamed: 0,id,text,tokens
0,18521,A weekly conversation about whats new in The N...,"[weekli, convers, new, new, yorker, orson, wel..."
0,18522,Welcome to Superhero News Your source for the ...,"[welcom, superhero, news, sourc, latest, movi,..."
0,12350,Insurance news interviews rating announcements...,"[insur, news, interview, rate, announc, insur,..."
0,12351,The Amovetv crew talk video games eSports a lo...,"[amovetv, crew, talk, video, game, esport, lot..."
0,18373,Assorted stories from WMEHFM A look at whats b...,"[assort, stori, wmehfm, look, done, larg, smal..."


In [33]:
# The raw text is no longer needed once the tokens exist; remove the column
# in place.
del podcast_text['text']

In [36]:
# Build the list of documents handed to the similarity server: one dict per
# podcast holding its integer id and its token list.
corpus = []
for podcast_id, podcast_tokens in zip(podcast_text['id'], podcast_text['tokens']):
    corpus.append({'id' : int(podcast_id),
                   'tokens' : podcast_tokens})

In [39]:
# Send the corpus to the server in batches of 1000 documents.
upload_chunk_size = 1000
utils.upload_chunked(server, corpus, chunksize=upload_chunk_size)

In [40]:
# Train the similarity model on the corpus with the 'lsi' method.
# NOTE(review): the original note describes this as LSI with 100 topics over
# TF-IDF -- confirm the topic count against simserver's defaults.
training_method = 'lsi'
server.train(corpus, method=training_method)

ERROR:sqlitedict:failed to delete /var/folders/hg/kw83c_j57xqd055hr739mhvm0000gn/T/sqldict713110
Traceback (most recent call last):
  File "//anaconda/lib/python2.7/site-packages/sqlitedict.py", line 283, in terminate
    os.remove(self.filename)
OSError: [Errno 2] No such file or directory: '/var/folders/hg/kw83c_j57xqd055hr739mhvm0000gn/T/sqldict713110'
ERROR:sqlitedict:failed to delete /var/folders/hg/kw83c_j57xqd055hr739mhvm0000gn/T/sqldict628ff3
Traceback (most recent call last):
  File "//anaconda/lib/python2.7/site-packages/sqlitedict.py", line 283, in terminate
    os.remove(self.filename)
OSError: [Errno 2] No such file or directory: '/var/folders/hg/kw83c_j57xqd055hr739mhvm0000gn/T/sqldict628ff3'
ERROR:sqlitedict:failed to delete /var/folders/hg/kw83c_j57xqd055hr739mhvm0000gn/T/sqldict13c128
Traceback (most recent call last):
  File "//anaconda/lib/python2.7/site-packages/sqlitedict.py", line 283, in terminate
    os.remove(self.filename)
OSError: [Errno 2] No such file or di

In [41]:
# Index the same corpus that the model was trained on; the similarity
# queries in the following cells are run against this server.
server.index(corpus)

ERROR:sqlitedict:failed to delete /var/folders/hg/kw83c_j57xqd055hr739mhvm0000gn/T/sqldict65a1d0
Traceback (most recent call last):
  File "//anaconda/lib/python2.7/site-packages/sqlitedict.py", line 283, in terminate
    os.remove(self.filename)
OSError: [Errno 2] No such file or directory: '/var/folders/hg/kw83c_j57xqd055hr739mhvm0000gn/T/sqldict65a1d0'
ERROR:sqlitedict:failed to delete /var/folders/hg/kw83c_j57xqd055hr739mhvm0000gn/T/sqldictb903d2
Traceback (most recent call last):
  File "//anaconda/lib/python2.7/site-packages/sqlitedict.py", line 283, in terminate
    os.remove(self.filename)
OSError: [Errno 2] No such file or directory: '/var/folders/hg/kw83c_j57xqd055hr739mhvm0000gn/T/sqldictb903d2'
ERROR:sqlitedict:failed to delete /var/folders/hg/kw83c_j57xqd055hr739mhvm0000gn/T/sqldicta90943
Traceback (most recent call last):
  File "//anaconda/lib/python2.7/site-packages/sqlitedict.py", line 283, in terminate
    os.remove(self.filename)
OSError: [Errno 2] No such file or di

In [43]:
# Look up the documents most similar to an already-indexed podcast,
# identified by its id passed as a string.
query_podcast_id = '12352'
sim_result = server.find_similar(query_podcast_id)

In [44]:
# Look up the documents most similar to a free-text keyword: preprocess the
# keyword the same way as the podcast text, wrap the tokens in a document
# dict and query the server with it.
keyword_tokens = preprocess_text.preprocess_text('cats')
doc = {'tokens' : keyword_tokens}
key_result = server.find_similar(doc)