Browse files

support for listid-based reports

  • Loading branch information...
1 parent 3fe83fe commit 5720ef56bae90f9e8867587ecba25f7df3fde802 @cedricsam cedricsam committed Mar 1, 2011
Showing with 56 additions and 13 deletions.
  1. +40 −3 sinamostretweeted_firstpass.py
  2. +3 −3 sinareposts.sh
  3. +13 −7 sinatrace.py
View
43 sinamostretweeted_firstpass.py
@@ -18,9 +18,12 @@
# default values
fformat = "json"
provid = 81
+listid = 0
outfile = ""
nouser = False
counting = False
+maxfollowers_default = 100000
+maxfollowers = 0
if len(sys.argv) > 2 and not sys.argv[2].startswith("-"):
dateend = sys.argv[2]
@@ -71,19 +74,53 @@
provid = int(sys.argv[i+1])
except:
continue
+ if sys.argv[i] == "-maxfol" or sys.argv[i] == "--max-followers":
+ if i + 1 < len(sys.argv):
+ try:
+ maxfollowers = int(sys.argv[i+1])
+ except:
+ maxfollowers = maxfollowers_default
+ continue
+ else:
+ maxfollowers = maxfollowers_default
if sys.argv[i] == "-no" or sys.argv[i] == "--no-userinfo":
nouser = True
+ if sys.argv[i] == "-l" or sys.argv[i] == "--listid":
+ if i + 1 < len(sys.argv):
+ try:
+ listid = int(sys.argv[i+1])
+ except:
+ continue
+if maxfollowers > 0:
+ maxfollowers_sql = "AND ru.followers_count < " + str(maxfollowers)
+else:
+ maxfollowers_sql = ""
+
sql = "SELECT s.retweeted_status, COUNT(s.retweeted_status) AS retweeted_count, \
ARRAY_AGG(s.user_id), ARRAY_AGG(s.created_at), ARRAY_AGG(u.gender), ARRAY_AGG(u.followers_count), \
MAX(rs.id) AS id \
FROM sinaweibo_users AS u RIGHT JOIN sinaweibo s on u.id = s.user_id \
-LEFT JOIN sinaweibo as rs ON s.retweeted_status = rs.id \
-WHERE u.province = %(provid)d AND s.created_at >= '%(date)s' %(dateend)s \
+LEFT JOIN sinaweibo AS rs ON s.retweeted_status = rs.id \
+LEFT JOIN sinaweibo_users AS ru ON rs.user_id = ru.id "
+
+if listid <= 0:
+ sql_where = "WHERE u.province = %(provid)d %(maxfollowers)s \
+AND s.created_at >= '%(date)s' %(dateend)s \
AND s.retweeted_status IS NOT NULL \
GROUP BY s.retweeted_status ORDER BY retweeted_count DESC "\
-% { 'provid': provid, 'date': datetime.datetime.strftime(datestart, '%Y-%m-%d'), 'dateend': dateend_sql }
+% { 'provid': provid, 'date': datetime.datetime.strftime(datestart, '%Y-%m-%d'), \
+'dateend': dateend_sql, 'maxfollowers': maxfollowers_sql }
+else:
+ sql_where = "LEFT JOIN sinaweibo_userlist ul ON u.id = ul.user_id \
+WHERE ul.list_id = %(listid)d AND s.created_at >= '%(date)s' %(dateend)s \
+AND s.retweeted_status IS NOT NULL \
+GROUP BY s.retweeted_status ORDER BY retweeted_count DESC "\
+% { 'listid': listid, 'date': datetime.datetime.strftime(datestart, '%Y-%m-%d'), \
+'dateend': dateend_sql, 'maxfollowers': maxfollowers_sql }
+
+sql += sql_where
if counting:
print "Counting rows..."
View
6 sinareposts.sh
@@ -21,7 +21,7 @@ if [ $# -gt 1 ]
then
MAXP=$2
else
- MAXP=200
+ MAXP=1000
fi
if [ $# -gt 2 ]
@@ -43,7 +43,7 @@ ${HOME}/bin/sinastorage.py 1 ${FI}
# go through multiple pages
for i in `seq ${STAP} ${MAXP}`
do
- echo ${POSTID}
+ #echo ${POSTID}
CHECK1=999
CHECK2=2
COUNT=0
@@ -66,7 +66,6 @@ do
break
fi
# count consecutive "[]" results, and stop when there are too many of them
- echo ${CHECK1} ${CHECK2}
if [ ${CHECK1} -eq 0 ] && [ ${CHECK2} -gt 2 ]
then
CONSEQ_BLANKS=0
@@ -80,5 +79,6 @@ do
continue
fi
${HOME}/bin/sinastorage.py 1 ${FI}
+ echo ${i} ${CHECK1} ${CHECK2}
rm ${FI}
done
View
20 sinatrace.py
@@ -43,7 +43,7 @@ def sinatrace(tid, minimal=False, extra_fields=False, get_users=False, outformat
missing_users = list()
missing_users_ids = list()
for r in rows:
- m = re.findall("//@([^: ]*)", r["text"])
+ m = re.findall("//@([^:/@ ]*)", r["text"])
refs = list()
for refname in m:
ref = dict()
@@ -63,7 +63,7 @@ def sinatrace(tid, minimal=False, extra_fields=False, get_users=False, outformat
refs.append(ref)
count += 1
r["references"] = refs
- if get_users and r["user_id_ref"] is None:
+ if get_users and r["user_id_ref"] is None: # users who reposted, but not in our DB yet
missing_users_ids.append(r["user_id"])
rts.append(r)
out["missing_users"] = missing_users
@@ -88,7 +88,7 @@ def sinatrace(tid, minimal=False, extra_fields=False, get_users=False, outformat
else:
return out
-def gviz_trends(tid, req_id=0, interval="", period="", province=0, outformat="json"):
+def gviz_trends(tid, req_id=0, interval="", period="", province=0, listid=0, outformat="json"):
try:
tid = long(tid)
except ValueError:
@@ -137,10 +137,16 @@ def gviz_trends(tid, req_id=0, interval="", period="", province=0, outformat="js
basetime = datetime.datetime.combine(basetime, datetime.time())
sql_period = " AND s.created_at >= '%s' " % basetime.strftime("%Y-%m-%d")
sql_location = ""
- if province > 0:
- sql_location = " AND u.province = %d " % province
- sql = "SELECT %(interval)s AS time, COUNT(*) AS count, COUNT(DISTINCT user_id) AS users \
-FROM sinaweibo s LEFT JOIN sinaweibo_users u ON s.user_id = u.id WHERE retweeted_status = %(tid)d %(sql_period)s %(sql_location)s GROUP BY time ORDER BY time " % {"tid": tid, "interval": sql_interval, "sql_period": sql_period, "sql_location": sql_location}
+ sql_listidjoin = ""
+ sql_listid = ""
+ if int(listid) > 0:
+ sql_listidjoin = "LEFT JOIN sinaweibo_userlist ul ON u.id = ul.user_id "
+ sql_listid = " AND ul.list_id = %d " % int(listid)
+ if int(province) > 0:
+ sql_location = " AND u.province = %d " % int(province)
+ sql = "SELECT %(interval)s AS time, COUNT(*) AS count, COUNT(DISTINCT s.user_id) AS users \
+FROM sinaweibo s LEFT JOIN sinaweibo_users u ON s.user_id = u.id %(sql_listidjoin)s WHERE retweeted_status = %(tid)d %(sql_period)s %(sql_location)s %(sql_listid)s GROUP BY time ORDER BY time " \
+% {"tid": tid, "interval": sql_interval, "sql_period": sql_period, "sql_location": sql_location, "sql_listidjoin": sql_listidjoin, "sql_listid": sql_listid}
rows = pgconn.query(sql).dictresult()
description = {"time": ("string", "Time"),
"count": ("number", "statuses"),

0 comments on commit 5720ef5

Please sign in to comment.