Periodic update
cedricsam committed Oct 18, 2011
1 parent 5c7aea1 commit e70be46
Showing 7 changed files with 1,068 additions and 27 deletions.
73 changes: 73 additions & 0 deletions blogs.parse.py
@@ -0,0 +1,73 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys
import pg
import mypass
import datetime
import time
import rfc822
import urllib2
import httplib
from xml.dom import minidom

try:
    blogid = int(sys.argv[1])
except (IndexError, ValueError):
    print "Missing or invalid blog ID"
    sys.exit(1)

try:
    url = sys.argv[2]
except IndexError:
    print "Missing URL"
    sys.exit(1)

p = urllib2.urlopen(url, timeout=30)
txt = p.read()

try:
    dom = minidom.parseString(txt)
except Exception as e:
    print e
    print "Invalid URL: " + url
    sys.exit(1)

pgconn = mypass.getConn()

for item in dom.getElementsByTagName('item'):
r = dict()
r["blogid"] = blogid
for a in ["title", "link", "guid", "description", "author", "comments", "category"]:
att = None
try:
att = item.getElementsByTagName(a)[0].firstChild.data
r[a] = att.encode("utf8")
        except (IndexError, AttributeError):
            # Element missing or empty in this item; store None
            r[a] = att
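    # pubDate is parsed as an RFC 822 date; feeds that instead give a plain
    # "YYYY/MM/DD"-style date are assumed to be in +0800 (Hong Kong) time.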
try:
pubDate = item.getElementsByTagName("pubDate")[0].firstChild.data
#pubDate_dt = datetime.datetime.strptime(pubDate, '%a, %d %b %Y %H:%M:%S %z')
#print pubDate
try:
pubDate_dt = rfc822.parsedate_tz(pubDate)
pubDate_str = time.strftime("%Y-%m-%d %H:%M:%S", pubDate_dt[0:9])
tz = pubDate.split()
tz_str = tz[len(tz)-1]
r["pubdate"] = pubDate_str + " " + tz_str
        except (TypeError, ValueError):
try:
r["pubdate"] = pubDate.replace("/","-") + " +0800"
except:
r["pubdate"] = pubDate + " +0800"
#print r
except Exception as e:
print e
continue
try:
pgconn.insert("blogs_entries", r)
except Exception as e:
print e
continue

pgconn.close()
183 changes: 183 additions & 0 deletions hkforums.search.py
@@ -0,0 +1,183 @@
#!/usr/bin/env python

import sys, os
import time, datetime
import csv
import pg
import re
import lucene
import mypass, sinaweibooauth

class SearchForums(object):
"""Usage: hkforums.search.py [-ds|-de DATE] terms <forum name>"""

pgconn = None
STORE_BASE_DIR = "/var/data/lucene/"
STORE_DIR = ""
supported_forums = ["uwants", "discuss", "hkreporter"]
    analyzers = dict()
searcher = None
MAX_ITEMS = 1000
forum = ""
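    # Example invocation (hypothetical values):
    #   hkforums.search.py -ds 2011-09-01 -de 2011-10-18 libya discuss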

def __init__(self, forumname):
        if forumname not in self.supported_forums:
            sys.exit("Unsupported forum: " + forumname)
else:
self.forum = forumname
self.STORE_DIR = self.STORE_BASE_DIR + forumname
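        # Open the per-forum index (STORE_BASE_DIR/<forum>) read-only, querying
        # with the SmartChineseAnalyzer for Chinese-language content.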
smartcn = lucene.SmartChineseAnalyzer(lucene.Version.LUCENE_33)
self.analyzers = { "smartcn": smartcn }
directory = lucene.SimpleFSDirectory(lucene.File(self.STORE_DIR))
self.searcher = lucene.IndexSearcher(directory, True)
self.pgconn = mypass.getConn()

def prepareDates(self, datestring):
if datestring is None:
return None
try:
mydate = time.strptime(datestring, "%Y-%m-%d")
except ValueError:
try:
mydate = time.strptime(datestring, "%Y-%m-%d %H:%M")
            except (ValueError, TypeError):
return None
return int(time.mktime(mydate))

def searchForums(self, q, time_start_secs, time_end_secs, uids=list(), offset=None, floor=None):
        if offset is not None:
            try:
                offset = int(offset)
                if offset > self.MAX_ITEMS:
                    self.MAX_ITEMS = offset + 100
            except (TypeError, ValueError):
                pass
page_start = page_end = None
        if floor is not None and len(floor) > 0:
            m = re.match(r"(\d+)-?(\d*)", floor)
            if m is not None:
                page_start = int(m.group(1))
                try:
                    page_end = int(m.group(2))
                except ValueError:
                    page_end = page_start
startexec = datetime.datetime.now()
first = True
query = lucene.BooleanQuery()
query.setMaxClauseCount(2097152)
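        # Hits are sorted on the integer "time" field, newest first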
sorter = lucene.Sort(lucene.SortField("time", lucene.SortField.INT, True))
pageFilter = None
if len(q) > 0:
query.add(lucene.QueryParser(lucene.Version.LUCENE_33, "content", self.analyzers["smartcn"]).parse(q), lucene.BooleanClause.Occur.MUST)
dateFilter = lucene.NumericRangeFilter.newIntRange("time", time_start_secs, time_end_secs, True, True)
else:
query.add(lucene.NumericRangeQuery.newIntRange("time", time_start_secs, time_end_secs, True, True), lucene.BooleanClause.Occur.MUST)
        if page_start is not None and page_end is not None:
pageFilter = lucene.NumericRangeFilter.newIntRange("floor", page_start, page_end, True, True)
topScoreCollector = lucene.TopScoreDocCollector
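        # When user IDs are given, build one single-value range filter per uid,
        # OR-chain them and cache the chained filter; with a non-empty text query
        # it is further AND-ed with the date and floor filters.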
if len(uids) > 0:
uids_str = list()
numfilters = list()
count = 0
for x in uids:
count += 1
uids_str.append(str(x))
numfilter = lucene.NumericRangeFilter.newIntRange("uid", x, x, True, True)
numfilters.append(numfilter)
#if count > 1000:
# break
chainedNumFilters = lucene.ChainedFilter(numfilters, lucene.ChainedFilter.OR)
cachingChainedNumFilters = lucene.CachingWrapperFilter(chainedNumFilters)
if len(q) > 0:
chain = lucene.ChainedFilter([cachingChainedNumFilters,dateFilter, pageFilter], lucene.ChainedFilter.AND)
else:
chain = cachingChainedNumFilters
topDocs = self.searcher.search(query, chain, sorter)
else:
if len(q) > 0 and time_start_secs is not None and time_end_secs is not None:
if pageFilter is not None:
filters = [dateFilter, pageFilter]
chainedFilters = lucene.ChainedFilter(filters, lucene.ChainedFilter.AND)
topDocs = self.searcher.search(query, chainedFilters, self.MAX_ITEMS, sorter)
else:
topDocs = self.searcher.search(query, dateFilter, self.MAX_ITEMS, sorter)
else:
if pageFilter is not None:
topDocs = self.searcher.search(query, pageFilter, self.MAX_ITEMS, sorter)
else:
topDocs = self.searcher.search(query, self.MAX_ITEMS, sorter)
#return "%(nb)d results found in %(secs)f seconds" %
ids = list()
ids_str = list()
hits = list()
count = 0
for scoreDoc in topDocs.scoreDocs:
count += 1
doc = self.searcher.doc(scoreDoc.doc)
id = doc.get("pid")
uid = doc.get("uid")
tid = doc.get("tid")
#ids.append(id)
hit = { "pid": id, "uid": uid, "tid": tid }
hits.append(hit)
#ids_str.append(str(id))
#if count > self.MAX_ITEMS:
#break
out = { "totalhits": topDocs.totalHits, "nb_users": len(uids), "ids": ids, "q": q, "hits": hits }
out["lucene_query_finished"] = long(time.mktime(datetime.datetime.now().timetuple())) * 1000
if len(uids) > 0:
out["user_ids"] = uids_str
# Logging
f = open("/var/data/hkforums/searchlog/%(forum)s.log" % {"forum": self.forum},"a")
f.write(datetime.datetime.strftime(datetime.datetime.now(),"%Y-%m-%d %H:%M:%S") + "\t" + q + "\n")
f.close()
endexec = datetime.datetime.now()
td = endexec - startexec
microtime = td.microseconds + (td.seconds + td.days * 86400) * 1000000
secondstime = microtime / 1000000.0
out["secs"] = secondstime
print out
return out

if __name__ == '__main__':
if len(sys.argv) <= 1:
        print SearchForums.__doc__
sys.exit(1)
inargs = False
datestart_str = None
dateend_str = None
for i in range(1, len(sys.argv)):
if sys.argv[i].find("-") != 0 and not inargs:
i -= 1
break
else:
inargs = False
if sys.argv[i] == "-ds":
if len(sys.argv) > i + 1:
inargs = True
datestart_str = sys.argv[i+1]
elif sys.argv[i] == "-de":
if len(sys.argv) > i + 1:
inargs = True
dateend_str = sys.argv[i+1]
terms = sys.argv[i+1:len(sys.argv)+1]
if inargs or len(terms) == 0:# or datestart_str is None:
        print SearchForums.__doc__
sys.exit(1)
if dateend_str is None:
dateend_str = datetime.datetime.strftime(datetime.datetime.now(),"%Y-%m-%d %H:%M")
print terms
print "date start: " + str(datestart_str)
print "date end: " + str(dateend_str)
# Start Lucene
lucene.initVM(lucene.CLASSPATH)
print 'lucene', lucene.VERSION
    # The usage string puts the forum name last: treat the final positional
    # argument as the forum and join the rest into the query string.
    forum = terms.pop()
    q = " ".join(terms)
    search = SearchForums(forum)
    # prepareDates() returns None for a missing or unparseable date, which
    # leaves that end of the time range open in the Lucene range filter.
    search.searchForums(q, search.prepareDates(datestart_str), search.prepareDates(dateend_str))
36 changes: 32 additions & 4 deletions sinatrace.py
@@ -22,6 +22,7 @@
pgconn = mypass.getConn()

def sinatrace(tid, minimal=False, extra_fields=False, get_users=False, outformat="json"):
# For RP: Should try to find the created_at if it's not known or given as argument...
sw = sinaweibooauth.SinaWeiboOauth()
sw.setToken(sw.sinaweiboOauth["oauth_token"], sw.sinaweiboOauth["oauth_token_secret"])
try:
@@ -35,9 +36,32 @@ def sinatrace(tid, minimal=False, extra_fields=False, get_users=False, outformat
u.followers_count user_followers_count, u.friends_count user_friends_count, u.retrieved user_retrieved "
else:
extra_fields = ""
'''
rps = sw.getRangePartitionByIds([tid])
for rp in rps:
x = rp.split(",")
year = int(x[0])
week = int(x[1])
break
isocal = datetime.datetime.now().isocalendar()
year_now = isocal[0]
week_now = isocal[1]
sw_tables_arr = list()
for x in range(year,year_now+1):
if year == year_now:
myrange = range(week,week_now+1)
elif x == year:
myrange = range(week,54)
elif x == year_now:
myrange = range(1,week)
for y in myrange:
sw_tables_arr.append("SELECT * FROM rp_sinaweibo_y%(year)dw%(week)d" % { "year": x, "week": y })
sw_tables = " UNION ".join(sw_tables_arr)
'''
sql = "SELECT s.id, s.created_at, s.user_id, s.screen_name, s.text, u.id AS user_id_ref %(extra_fields)s \
FROM sinaweibo s LEFT JOIN sinaweibo_users u ON s.user_id = u.id \
WHERE retweeted_status = %(tid)d ORDER BY s.id " % {"tid": tid, "extra_fields": extra_fields}
FROM rp_sinaweibo s LEFT JOIN sinaweibo_users u ON s.user_id = u.id \
WHERE retweeted_status = %(tid)d ORDER BY s.id " % {"tid": tid, "extra_fields": extra_fields }#, "sw_tables": sw_tables}
#print sql
rows = pgconn.query(sql).dictresult()
out = dict()
rts = list()
@@ -161,9 +185,13 @@ def gviz_trends(tid, req_id=0, interval="", period="", province=0, listid=0, out
basetime = None
if basetime is None:
sql_period = ""
sw_tables = "sinaweibo"
else:
basetime = datetime.datetime.combine(basetime, datetime.time())
sql_period = " AND s.created_at >= '%s' " % basetime.strftime("%Y-%m-%d")
import sinaweibooauth
sw = sinaweibooauth.SinaWeiboOauth()
sw_tables = "(%s)" % sw.getRangePartitionSQL(basetime)
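        # Assumption: getRangePartitionSQL() returns a SELECT/UNION over the weekly
        # rp_sinaweibo_y*w* partition tables from basetime onward, analogous to the
        # commented-out range-partition block in sinatrace() above.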
sql_location = ""
sql_listidjoin = ""
sql_listid = ""
Expand All @@ -173,8 +201,8 @@ def gviz_trends(tid, req_id=0, interval="", period="", province=0, listid=0, out
if int(province) > 0:
sql_location = " AND u.province = %d " % int(province)
sql = "SELECT %(interval)s AS time, COUNT(*) AS count, COUNT(DISTINCT s.user_id) AS users \
FROM sinaweibo s LEFT JOIN sinaweibo_users u ON s.user_id = u.id %(sql_listidjoin)s WHERE retweeted_status = %(tid)d %(sql_period)s %(sql_location)s %(sql_listid)s GROUP BY time ORDER BY time " \
% {"tid": tid, "interval": sql_interval, "sql_period": sql_period, "sql_location": sql_location, "sql_listidjoin": sql_listidjoin, "sql_listid": sql_listid}
FROM %(sw_tables)s s LEFT JOIN sinaweibo_users u ON s.user_id = u.id %(sql_listidjoin)s WHERE retweeted_status = %(tid)d %(sql_period)s %(sql_location)s %(sql_listid)s GROUP BY time ORDER BY time " \
% {"tid": tid, "interval": sql_interval, "sql_period": sql_period, "sql_location": sql_location, "sql_listidjoin": sql_listidjoin, "sql_listid": sql_listid, "sw_tables": sw_tables }
rows = pgconn.query(sql).dictresult()
description = {"time": ("string", "Time"),
"count": ("number", "statuses"),
