In [1]:
from gensim import utils, corpora, models, similarities
from simserver import SessionServer
import simserver
import pandas as pd
import numpy as np
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
import psycopg2
from tqdm import tqdm
import re
import preprocess_text

In [3]:
# open a connection to the podcast database and get a cursor for queries
dbname, username = 'podcast', 'lindsay'
con = psycopg2.connect(user=username, database=dbname)
cursor = con.cursor()

In [4]:
# download data from database
# Each fetched row is a tuple in SELECT order:
# (id, summary, episode_descriptions, episode_names).
query = "SELECT id, summary, episode_descriptions, episode_names FROM podcast"
# The second argument of cursor.execute() is the query's bind parameters, not
# the connection (that is the pd.read_sql(query, con) calling convention).
# This query has no placeholders, so nothing is passed.
cursor.execute(query)
query_results = cursor.fetchall()

In [6]:
# put into dataframe, concatenate text and remove non-alphanumeric characters for each podcast
# Each fetched row is indexed positionally: 0=id, 1=summary,
# 2=episode_descriptions, 3=episode_names.
query_df = pd.DataFrame({'id' : [x[0] for x in query_results],
                         'summary' : [x[1] for x in query_results],
                         'episode_descriptions' : [x[2] for x in query_results],
                         'episode_names' : [x[3] for x in query_results]})

# Matches every run of characters that are neither whitespace nor word
# characters; underscore counts as \w, so it is listed explicitly.
# Compiled once instead of once per row.
non_alnum = re.compile(r'([^\s\w]|_)+')

# Collect one record per podcast and build the frame in a single call.
# Growing a DataFrame with .append() inside the loop copies the whole frame on
# every iteration (quadratic), relies on an API removed in pandas 2.0, and left
# every row with index label 0. Building from records gives a unique 0..n-1 index.
records = []
text_columns = zip(query_df['id'], query_df['summary'],
                   query_df['episode_descriptions'], query_df['episode_names'])
for podcast_id, summary, descriptions, names in tqdm(text_columns, total = query_df.shape[0]):
    # concatenate
    text = ' '.join([summary, descriptions, names])

    # remove non-alphanumeric, non-space
    records.append({'id' : podcast_id, 'text' : non_alnum.sub('', text)})

# columns= keeps the ['id', 'text'] layout even when there are no rows
podcast_text = pd.DataFrame(records, columns=['id', 'text'])



In [7]:
# run the project's preprocess_text over every podcast's text, one result per row
podcast_text['tokens'] = list(map(preprocess_text.preprocess_text, podcast_text['text']))

# persist the frame, including the new tokens column, as a pickle
podcast_text.to_pickle('gensim/tokenized_preprocessed_Feb_10_2016.pkl')

In [9]:
# build the corpus: one {'id', 'tokens'} dict per podcast, with the id cast to int
corpus = []
for podcast_id, tokens in zip(podcast_text['id'], podcast_text['tokens']):
    corpus.append({'id' : int(podcast_id), 'tokens' : tokens})

In [8]:
# create one session server per model type, each with its own directory under /tmp
server_lsi, server_lda, server_logentropy = (
    SessionServer('/tmp/simserver_lsi/'),
    SessionServer('/tmp/simserver_lda/'),
    SessionServer('/tmp/simserver_logentropy/'),
)

In [10]:
# send the same corpus to each server in chunks of 1000 documents
for server in (server_lsi, server_lda, server_logentropy):
    utils.upload_chunked(server, corpus, chunksize=1000)

In [11]:
# train each server on the corpus with the method its name refers to
training_plan = (
    (server_lsi, 'lsi'),
    (server_lda, 'lda'),
    (server_logentropy, 'logentropy'),
)
for server, method in training_plan:
    server.train(corpus, method=method)

ERROR:sqlitedict:failed to delete /var/folders/hg/kw83c_j57xqd055hr739mhvm0000gn/T/sqldicta5f2d7
Traceback (most recent call last):
  File "//anaconda/lib/python2.7/site-packages/sqlitedict.py", line 283, in terminate
    os.remove(self.filename)
OSError: [Errno 2] No such file or directory: '/var/folders/hg/kw83c_j57xqd055hr739mhvm0000gn/T/sqldicta5f2d7'
ERROR:sqlitedict:failed to delete /var/folders/hg/kw83c_j57xqd055hr739mhvm0000gn/T/sqldictdf3400
Traceback (most recent call last):
  File "//anaconda/lib/python2.7/site-packages/sqlitedict.py", line 283, in terminate
    os.remove(self.filename)
OSError: [Errno 2] No such file or directory: '/var/folders/hg/kw83c_j57xqd055hr739mhvm0000gn/T/sqldictdf3400'
ERROR:sqlitedict:failed to delete /var/folders/hg/kw83c_j57xqd055hr739mhvm0000gn/T/sqldict7a1e3b
Traceback (most recent call last):
  File "//anaconda/lib/python2.7/site-packages/sqlitedict.py", line 283, in terminate
    os.remove(self.filename)
OSError: [Errno 2] No such file or di

In [17]:
# index the corpus on each of the three servers, in the same order as training
for server in [server_lsi, server_lda, server_logentropy]:
    server.index(corpus)

ERROR:sqlitedict:failed to delete /var/folders/hg/kw83c_j57xqd055hr739mhvm0000gn/T/sqldict6f029b
Traceback (most recent call last):
  File "//anaconda/lib/python2.7/site-packages/sqlitedict.py", line 283, in terminate
    os.remove(self.filename)
OSError: [Errno 2] No such file or directory: '/var/folders/hg/kw83c_j57xqd055hr739mhvm0000gn/T/sqldict6f029b'
ERROR:sqlitedict:failed to delete /var/folders/hg/kw83c_j57xqd055hr739mhvm0000gn/T/sqldict295a51
Traceback (most recent call last):
  File "//anaconda/lib/python2.7/site-packages/sqlitedict.py", line 283, in terminate
    os.remove(self.filename)
OSError: [Errno 2] No such file or directory: '/var/folders/hg/kw83c_j57xqd055hr739mhvm0000gn/T/sqldict295a51'
ERROR:sqlitedict:failed to delete /var/folders/hg/kw83c_j57xqd055hr739mhvm0000gn/T/sqldictcd2161
Traceback (most recent call last):
  File "//anaconda/lib/python2.7/site-packages/sqlitedict.py", line 283, in terminate
    os.remove(self.filename)
OSError: [Errno 2] No such file or di