Commit 6d1dfa9
Map with location of teachers

Added a lot of map-specific stuff. Biggest issue is handling lots of markers, which can potentially overlap. There are two external Google Maps libraries in use to solve this. I hope I have documented everything properly. I wil…

Summary: Necessary changes to map-reduce scripts, Hive queries, and the frontend to include a map of teachers

Test Plan: teacher_country should get populated weekly, and the teacher-students page should display a map at the bottom. There are numerous additions here, so the possible bugs are varied.

Reviewers: chris, jace

Reviewed By: jace

Differential Revision: http://phabricator.khanacademy.org/D2474
robert3005 committed Jun 6, 2013
1 parent 470265f commit 6d1dfa9
Showing 17 changed files with 2,369 additions and 556 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -43,3 +43,6 @@ auth_whitelist.py
# Common developer tools
.project
.pydevproject

# GeoIP Database
GeoLiteCity.dat
3 changes: 3 additions & 0 deletions .gitmodules
@@ -0,0 +1,3 @@
[submodule "third_party/pygeoip"]
path = third_party/pygeoip
url = git@github.com:Khan/pygeoip.git
10 changes: 8 additions & 2 deletions cfg/regular_report.json
@@ -1,6 +1,6 @@
{
"name": "<duration>ly report generation (start_dt=<start_dt>)",
"wait_for": [
"wait_for": [
{ "table_name": "user_video_summary",
"partitions": ["dt=<end_dt_inclusive>"]
}
@@ -12,7 +12,13 @@
"hive_table": "video_stats",
"importer_args": "duration=<duration> dt=<start_dt>",
"mongo_collection": "video_stats"
}
},
{ "hive_script": "s3://ka-mapreduce/code/hive/teacher_geo.q",
"hive_args": {"end_dt": "<end_dt>"},
"hive_table": "teacher_country",
"mongo_collection": "teacher_country",
"drop": true
}

]

13 changes: 12 additions & 1 deletion map_reduce/Makefile
@@ -9,7 +9,18 @@ upload-hive:
scp -i $(PRIVATE_KEY) hive/* $(HIVE_USER)@$(HIVE_MASTER):~/

upload-s3:
python deploy.py -v
python deploy.py -v code

geo-ip:
# Fetches a fresh GeoIP database and uploads it, together with pygeoip,
# to the S3 datastore for use with Hive queries.
# You should only need to run this target if you are using the pygeoip
# geolocation library and there has been an update to it or to the
# GeoIP database.
# If the following command does not work, refer to
# http://dev.maxmind.com/geoip/legacy/install/city
# MaxMind recently announced v2 of their services; however, they do not
# yet offer a new offline database for it.
wget -N -q http://geolite.maxmind.com/download/geoip/database/GeoLiteCity.dat.gz
gunzip --stdout GeoLiteCity.dat.gz > GeoLiteCity.dat
python deploy.py geoip
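
For reference, a minimal sketch of the pygeoip lookup this target provisions (the IP address and the printed fields are illustrative only; GeoIP and record_by_addr are pygeoip's standard API):

import pygeoip

gi = pygeoip.GeoIP('GeoLiteCity.dat')
record = gi.record_by_addr('8.8.8.8')
# record is a dict with keys such as 'city', 'region_name',
# 'country_code', 'country_name', 'latitude' and 'longitude'
print record['country_name'], record['latitude'], record['longitude']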

hadoop-ui:
ssh -f -i $(PRIVATE_KEY) -l $(HIVE_USER) -N -L2001:localhost:9100 \
38 changes: 31 additions & 7 deletions map_reduce/deploy.py
@@ -2,6 +2,12 @@
"""Deploy utilities for map reduce code.
This is a simple wrapper around copying files to S3 for the most part.
Usually you will want to run it like this:
python deploy.py -v code
In the rare case when you're updating the GeoIP database on S3, run
"python deploy.py geoip".
For more information about GeoIP, refer to the Makefile in this directory.
"""

import optparse
@@ -96,7 +102,7 @@ def files_in_prod(branch=""):
return files


def copy_files_to_prod(filenames, branch=None):
def copy_files_to_prod(filenames, branch=None, flatten=False):
"""Copies all given files to ka-mapreduce S3 code bucket"""

dirname = _get_branch_dirname(branch)
@@ -105,6 +111,8 @@ def copy_files_to_prod(filenames, branch=None):
bucket = s3conn.get_bucket('ka-mapreduce')
for filename in filenames:
with open(filename, 'r') as filestream:
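            # With flatten, strip the directory part of the path so the
            # file is uploaded at the top level of the destination dir.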
if flatten:
filename = os.path.basename(filename)
filepath = "%s%s" % (dirname, filename)
key = bucket.get_key(filepath)
if not key:
@@ -172,7 +180,7 @@ def send_hipchat_deploy_message(


# TODO(benkomalo): wire up options for subdirectory to deploy to (for testing)
def do_deploy(verbose, branch=""):
def do_deploy(verbose, branch="", flatten=False):
in_tree = set(files_in_tree())
in_prod = set(files_in_prod(branch))
new_files = in_tree - in_prod
@@ -194,24 +202,40 @@
if raw_input("Proceed? [y/N]: ").lower() not in ['y', 'yes']:
return

copy_files_to_prod(files_to_push, branch)
copy_files_to_prod(files_to_push, branch, flatten)
print "Done!"
if not branch.startswith("branch-"):
# we only notify if not pushing to a personal branch.
# "branch-" is the conventional prefix for a personal branch.
# "branch-" is the conventional prefix for a personal branch.
send_hipchat_deploy_message(
replaced_files, new_files, spurious_files, dest_path)


if __name__ == '__main__':
parser = optparse.OptionParser()
usage = "Usage: %prog [options] {code|geoip}"
parser = optparse.OptionParser(usage=usage)
parser.add_option('-v', '--verbose',
action="store_true", dest="verbose",
help="Print more information during the deploy process")
parser.add_option('-b', '--branch',
default="",
help=("The branch to deploy to. By default, no branch is specified "
"implying that the default production branch is used"))
parser.add_option('-f', '--flatten',
action="store_true", dest="flatten",
help=("Discard the directory structure from the source."
"Uploads files directly to branch speficied"))

options, args = parser.parse_args()
do_deploy(options.verbose, options.branch)
    if len(args) < 1:
        # Catch invalid usage; parser.error() prints usage and exits
        parser.error("expected one mode argument: code or geoip")

    if args[0] == "code":
        do_deploy(options.verbose, options.branch, options.flatten)
    elif args[0] == "geoip":
        geoip_files = ["const.py", "util.py", "timezone.py", "__init__.py"]
        full_path_files = [
            "../third_party/pygeoip/pygeoip/" + f for f in geoip_files]
        copy_files_to_prod(["GeoLiteCity.dat"] +
                           full_path_files, "geo/", options.flatten)
17 changes: 17 additions & 0 deletions map_reduce/hive/ka_hive_init.q
@@ -382,6 +382,23 @@ CREATE EXTERNAL TABLE IF NOT EXISTS student_teacher_count (
dt STRING
) LOCATION 's3://ka-mapreduce/summary_tables/student_teacher_count';

-- Holds a geolocated summary of teachers
DROP TABLE IF EXISTS teacher_country;
CREATE EXTERNAL TABLE IF NOT EXISTS teacher_country (
teacher STRING,
user_id STRING,
user_email STRING,
user_nickname STRING,
joined DOUBLE,
ip STRING,
city STRING,
region STRING,
country_code STRING,
country STRING,
latitude FLOAT,
longitude FLOAT
) LOCATION 's3://ka-mapreduce/summary_tables/teacher_country';
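
The query that populates this table, teacher_geo.q, is referenced in cfg/regular_report.json above but is not among the files shown in this diff. A hedged sketch of how a Hive TRANSFORM step could fill the geo columns from the ip column with pygeoip (column positions follow the DDL above; the exact script and its error handling are assumptions):

import sys
import pygeoip

gi = pygeoip.GeoIP('GeoLiteCity.dat')

for line in sys.stdin:
    # Input rows carry the first six columns of teacher_country:
    # teacher, user_id, user_email, user_nickname, joined, ip
    fields = line.rstrip('\n').split('\t')
    record = gi.record_by_addr(fields[5]) or {}
    fields += [record.get('city', ''), record.get('region_name', ''),
               record.get('country_code', ''), record.get('country_name', ''),
               str(record.get('latitude', '')), str(record.get('longitude', ''))]
    print '\t'.join(fields)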

CREATE EXTERNAL TABLE IF NOT EXISTS video_topic(
vid_key STRING, vid_title STRING, topic_key STRING,
topic_title STRING, topic_desc STRING)
64 changes: 2 additions & 62 deletions map_reduce/hive/student_teacher_count.q
@@ -1,6 +1,6 @@
-- Query to produce a time series of students and teachers using the site.
-- TODO(robert): Results produced by these queries are fundamentally flawed.
-- We would have to record the date of each coach change to be able to
-- produce an accurate report.

-- Arguments:
@@ -12,81 +12,21 @@
-- Student: User who has a teacher.
-- Active Student: Student who completed an activity in the last 28 days.

DROP TABLE IF EXISTS user_coach_date;
CREATE EXTERNAL TABLE IF NOT EXISTS user_coach_date(
user STRING,
coach STRING,
joined_on STRING,
self_coach BOOLEAN
) LOCATION 's3://ka-mapreduce/tmp/user_coach_date';

DROP TABLE IF EXISTS teacher_on_date;
CREATE EXTERNAL TABLE IF NOT EXISTS teacher_on_date(
teacher STRING,
dt STRING
) LOCATION 's3://ka-mapreduce/tmp/teacher_on_date';
SOURCE FILE s3://ka-mapreduce/code/hive/student_teacher_current.q;

DROP TABLE IF EXISTS active_teacher_on_date;
CREATE EXTERNAL TABLE IF NOT EXISTS active_teacher_on_date(
teacher STRING,
dt STRING
) LOCATION 's3://ka-mapreduce/tmp/active_teacher_on_date';

DROP TABLE IF EXISTS student_on_date;
CREATE EXTERNAL TABLE IF NOT EXISTS student_on_date(
student STRING,
teacher STRING,
dt STRING
) LOCATION 's3://ka-mapreduce/tmp/student_on_date';

DROP TABLE IF EXISTS active_student_on_date;
CREATE EXTERNAL TABLE IF NOT EXISTS active_student_on_date(
student STRING,
teacher STRING,
dt STRING
) LOCATION 's3://ka-mapreduce/tmp/active_student_on_date';

ADD FILE s3://ka-mapreduce/code/py/coach_reduce.py;
ADD FILE s3://ka-mapreduce/code/py/ka_udf.py;

-- Extract relevant information from UserData table
INSERT OVERWRITE TABLE user_coach_date
SELECT a.user, a.coach,
from_unixtime(cast(cast(a.joined AS FLOAT) AS INT), 'yyyy-MM-dd')
AS joined_on,
(a.coach = a.user or a.coach = a.user_email or
a.coach = a.user_id) as self_coach
FROM (
SELECT TRANSFORM(UserData.json)
USING 'ka_udf.py explode user,user_id,user_email,joined coaches'
AS user, user_id, user_email, joined, coach
FROM UserData
) a;

-- Custom map reduce routine - compute first day when
-- given coach became a teacher
-- refer to map_reduce/py/coach_reduce.py for reduce function
FROM (
SELECT user, coach, joined_on
FROM user_coach_date
WHERE NOT self_coach
DISTRIBUTE BY coach
SORT BY coach DESC, joined_on ASC
) st_date
INSERT OVERWRITE TABLE teacher_on_date
REDUCE st_date.user, st_date.coach, st_date.joined_on
USING 'coach_reduce.py teacher' AS teacher, dt;

-- Find first date when each user, coach pair became student, teacher pair
INSERT OVERWRITE TABLE student_on_date
SELECT u_dt.user, t_dt.teacher, IF(MIN(t_dt.dt) > MIN(u_dt.joined_on),
MIN(t_dt.dt), MIN(u_dt.joined_on)) AS dt
FROM user_coach_date u_dt
JOIN teacher_on_date t_dt
ON u_dt.coach = t_dt.teacher
WHERE NOT u_dt.self_coach
GROUP BY u_dt.user, t_dt.teacher;

-- Find all active students
-- Active student is a user who performed an action,
-- as defined by user_daily_activity, in the last 28 days
64 changes: 64 additions & 0 deletions map_reduce/hive/student_teacher_current.q
@@ -0,0 +1,64 @@
-- Common part of the teacher and student metrics.
-- Finds, for each coach/user, the first day on which they became a
-- teacher/student.

DROP TABLE IF EXISTS user_coach_date;
CREATE EXTERNAL TABLE IF NOT EXISTS user_coach_date(
user STRING,
coach STRING,
joined_on STRING,
self_coach BOOLEAN
) LOCATION 's3://ka-mapreduce/tmp/user_coach_date';

DROP TABLE IF EXISTS teacher_on_date;
CREATE EXTERNAL TABLE IF NOT EXISTS teacher_on_date(
teacher STRING,
dt STRING
) LOCATION 's3://ka-mapreduce/tmp/teacher_on_date';

DROP TABLE IF EXISTS student_on_date;
CREATE EXTERNAL TABLE IF NOT EXISTS student_on_date(
student STRING,
teacher STRING,
dt STRING
) LOCATION 's3://ka-mapreduce/tmp/student_on_date';

ADD FILE s3://ka-mapreduce/code/py/coach_reduce.py;
ADD FILE s3://ka-mapreduce/code/py/ka_udf.py;

-- Extract relevant information from UserData table
INSERT OVERWRITE TABLE user_coach_date
SELECT a.user, a.coach,
from_unixtime(cast(cast(a.joined AS FLOAT) AS INT), 'yyyy-MM-dd')
AS joined_on,
(a.coach = a.user or a.coach = a.user_email or
a.coach = a.user_id) as self_coach
FROM (
SELECT TRANSFORM(UserData.json)
USING 'ka_udf.py explode user,user_id,user_email,joined coaches'
AS user, user_id, user_email, joined, coach
FROM UserData
) a;
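
ka_udf.py is not part of this diff, but the 'explode' mode it implements can be read off the TRANSFORM clause above: emit one row per element of the coaches list, alongside the scalar fields. A hedged sketch (the JSON layout and field handling are assumptions):

import json
import sys

SCALARS = ['user', 'user_id', 'user_email', 'joined']

for line in sys.stdin:
    data = json.loads(line)
    prefix = [str(data.get(key, '')) for key in SCALARS]
    # One output row per coach, yielding the user, user_id, user_email,
    # joined, coach columns consumed by the query above.
    for coach in data.get('coaches', []):
        print '\t'.join(prefix + [coach])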

-- Custom map-reduce routine: compute the first day on which a given
-- coach became a teacher.
-- Refer to map_reduce/py/coach_reduce.py for the reduce function.
FROM (
SELECT user, coach, joined_on
FROM user_coach_date
WHERE NOT self_coach
DISTRIBUTE BY coach
SORT BY coach DESC, joined_on ASC
) st_date
INSERT OVERWRITE TABLE teacher_on_date
REDUCE st_date.user, st_date.coach, st_date.joined_on
USING 'coach_reduce.py teacher' AS teacher, dt;
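
coach_reduce.py is likewise not in this diff. A hedged sketch of its reduce side: because rows are distributed by coach and sorted by joined_on, the script can count a coach's students in order and emit the date the count crosses a teacher threshold (the threshold of 10 is an assumption, not a value confirmed by this commit):

import sys

TEACHER_THRESHOLD = 10  # assumed; see map_reduce/py/coach_reduce.py

prev_coach = None
students = 0
for line in sys.stdin:
    user, coach, joined_on = line.rstrip('\n').split('\t')
    if coach != prev_coach:
        prev_coach, students = coach, 0
    students += 1
    if students == TEACHER_THRESHOLD:
        # First date on which this coach qualifies as a teacher.
        print '%s\t%s' % (coach, joined_on)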

-- Find first date when each user, coach pair became student, teacher pair
INSERT OVERWRITE TABLE student_on_date
SELECT u_dt.user, t_dt.teacher, IF(MIN(t_dt.dt) > MIN(u_dt.joined_on),
MIN(t_dt.dt), MIN(u_dt.joined_on)) AS dt
FROM user_coach_date u_dt
JOIN teacher_on_date t_dt
ON u_dt.coach = t_dt.teacher
WHERE NOT u_dt.self_coach
GROUP BY u_dt.user, t_dt.teacher;
