Commit 6d1dfa9
Map with location of teachers

Added a lot of map-specific stuff. Biggest issue is handling lots of markers, which can potentially overlap. There are two external Google Maps libraries in use to solve this. I hope I have documented everything properly. I wil…

Summary: Necessary changes to map-reduce scripts, Hive queries, and the frontend to include a map of teachers

Test Plan: teacher_country should get populated weekly, and the teacher-students page should display a map at the bottom. There are numerous additions here, so the possible bugs are varied.

Reviewers: chris, jace

Reviewed By: jace

Differential Revision: http://phabricator.khanacademy.org/D2474
robert3005 committed Jun 6, 2013
1 parent 470265f commit 6d1dfa9
Showing 17 changed files with 2,369 additions and 556 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -43,3 +43,6 @@ auth_whitelist.py
# Common developer tools
.project
.pydevproject

# GeoIP Database
GeoLiteCity.dat
3 changes: 3 additions & 0 deletions .gitmodules
@@ -0,0 +1,3 @@
[submodule "third_party/pygeoip"]
path = third_party/pygeoip
url = git@github.com:Khan/pygeoip.git
10 changes: 8 additions & 2 deletions cfg/regular_report.json
@@ -1,6 +1,6 @@
{
"name": "<duration>ly report generation (start_dt=<start_dt>)",
"wait_for": [
"wait_for": [
{ "table_name": "user_video_summary",
"partitions": ["dt=<end_dt_inclusive>"]
}
@@ -12,7 +12,13 @@
"hive_table": "video_stats",
"importer_args": "duration=<duration> dt=<start_dt>",
"mongo_collection": "video_stats"
}
},
{ "hive_script": "s3://ka-mapreduce/code/hive/teacher_geo.q",
"hive_args": {"end_dt": "<end_dt>"},
"hive_table": "teacher_country",
"mongo_collection": "teacher_country",
"drop": true
}

]

13 changes: 12 additions & 1 deletion map_reduce/Makefile
@@ -9,7 +9,18 @@ upload-hive:
scp -i $(PRIVATE_KEY) hive/* $(HIVE_USER)@$(HIVE_MASTER):~/

upload-s3:
python deploy.py -v
python deploy.py -v code

geo-ip:
# Fetches a fresh GeoIP database and uploads it, together with pygeoip,
# to the S3 datastore for use with Hive queries.
# You should only need to run this target if you are using the pygeoip
# geolocation library and there has been an update to it or to the
# GeoIP database.
# If the following command does not work, refer to
# http://dev.maxmind.com/geoip/legacy/install/city
# MaxMind recently announced v2 of their services; however, they do not
# yet offer a new offline database for it.
wget -N -q http://geolite.maxmind.com/download/geoip/database/GeoLiteCity.dat.gz
gunzip --stdout GeoLiteCity.dat.gz > GeoLiteCity.dat
python deploy.py geoip
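
For reference, a minimal sketch of the pygeoip lookup this target provisions (the IP address and the printed fields are illustrative only; GeoIP and record_by_addr are pygeoip's standard API):

import pygeoip

gi = pygeoip.GeoIP('GeoLiteCity.dat')
record = gi.record_by_addr('8.8.8.8')
# record is a dict with keys such as 'city', 'region_name',
# 'country_code', 'country_name', 'latitude' and 'longitude'
print record['country_name'], record['latitude'], record['longitude']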

hadoop-ui:
ssh -f -i $(PRIVATE_KEY) -l $(HIVE_USER) -N -L2001:localhost:9100 \
38 changes: 31 additions & 7 deletions map_reduce/deploy.py
@@ -2,6 +2,12 @@
"""Deploy utilities for map reduce code.
This is a simple wrapper around copying files to S3 for the most part.
Usually you will want to run it like this:
python deploy.py -v code
In the rare case when you're updating the GeoIP database on S3, run
"python deploy.py geoip".
For more information about GeoIP, refer to the Makefile in this directory.
"""

import optparse
@@ -96,7 +102,7 @@ def files_in_prod(branch=""):
return files


def copy_files_to_prod(filenames, branch=None):
def copy_files_to_prod(filenames, branch=None, flatten=False):
"""Copies all given files to ka-mapreduce S3 code bucket"""

dirname = _get_branch_dirname(branch)
@@ -105,6 +111,8 @@ def copy_files_to_prod(filenames, branch=None):
bucket = s3conn.get_bucket('ka-mapreduce')
for filename in filenames:
with open(filename, 'r') as filestream:
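            # With flatten, strip the directory part of the path so the
            # file is uploaded at the top level of the destination dir.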
if flatten:
filename = os.path.basename(filename)
filepath = "%s%s" % (dirname, filename)
key = bucket.get_key(filepath)
if not key:
@@ -172,7 +180,7 @@ def send_hipchat_deploy_message(


# TODO(benkomalo): wire up options for subdirectory to deploy to (for testing)
def do_deploy(verbose, branch=""):
def do_deploy(verbose, branch="", flatten=False):
in_tree = set(files_in_tree())
in_prod = set(files_in_prod(branch))
new_files = in_tree - in_prod
@@ -194,24 +202,40 @@
if raw_input("Proceed? [y/N]: ").lower() not in ['y', 'yes']:
return

copy_files_to_prod(files_to_push, branch)
copy_files_to_prod(files_to_push, branch, flatten)
print "Done!"
if not branch.startswith("branch-"):
# we only notify if not pushing to a personal branch.
# "branch-" is the conventional prefix for a personal branch.
# "branch-" is the conventional prefix for a personal branch.
send_hipchat_deploy_message(
replaced_files, new_files, spurious_files, dest_path)


if __name__ == '__main__':
parser = optparse.OptionParser()
usage = "Usage: %prog [options] {code|geoip}"
parser = optparse.OptionParser(usage=usage)
parser.add_option('-v', '--verbose',
action="store_true", dest="verbose",
help="Print more information during the deploy process")
parser.add_option('-b', '--branch',
default="",
help=("The branch to deploy to. By default, no branch is specified "
"implying that the default production branch is used"))
parser.add_option('-f', '--flatten',
action="store_true", dest="flatten",
help=("Discard the directory structure from the source."
"Uploads files directly to branch speficied"))

options, args = parser.parse_args()
do_deploy(options.verbose, options.branch)
    if len(args) < 1:
        # Catch invalid usage; parser.error() prints usage and exits
        parser.error("expected one mode argument: code or geoip")

    if args[0] == "code":
        do_deploy(options.verbose, options.branch, options.flatten)
    elif args[0] == "geoip":
        geoip_files = ["const.py", "util.py", "timezone.py", "__init__.py"]
        full_path_files = [
            "../third_party/pygeoip/pygeoip/" + f for f in geoip_files]
        copy_files_to_prod(["GeoLiteCity.dat"] +
                           full_path_files, "geo/", options.flatten)
17 changes: 17 additions & 0 deletions map_reduce/hive/ka_hive_init.q
@@ -382,6 +382,23 @@ CREATE EXTERNAL TABLE IF NOT EXISTS student_teacher_count (
dt STRING
) LOCATION 's3://ka-mapreduce/summary_tables/student_teacher_count';

-- Holds a geolocated summary of teachers
DROP TABLE IF EXISTS teacher_country;
CREATE EXTERNAL TABLE IF NOT EXISTS teacher_country (
teacher STRING,
user_id STRING,
user_email STRING,
user_nickname STRING,
joined DOUBLE,
ip STRING,
city STRING,
region STRING,
country_code STRING,
country STRING,
latitude FLOAT,
longitude FLOAT
) LOCATION 's3://ka-mapreduce/summary_tables/teacher_country';
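
The query that populates this table, teacher_geo.q, is referenced in cfg/regular_report.json above but is not among the files shown in this diff. A hedged sketch of how a Hive TRANSFORM step could fill the geo columns from the ip column with pygeoip (column positions follow the DDL above; the exact script and its error handling are assumptions):

import sys
import pygeoip

gi = pygeoip.GeoIP('GeoLiteCity.dat')

for line in sys.stdin:
    # Input rows carry the first six columns of teacher_country:
    # teacher, user_id, user_email, user_nickname, joined, ip
    fields = line.rstrip('\n').split('\t')
    record = gi.record_by_addr(fields[5]) or {}
    fields += [record.get('city', ''), record.get('region_name', ''),
               record.get('country_code', ''), record.get('country_name', ''),
               str(record.get('latitude', '')), str(record.get('longitude', ''))]
    print '\t'.join(fields)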

CREATE EXTERNAL TABLE IF NOT EXISTS video_topic(
vid_key STRING, vid_title STRING, topic_key STRING,
topic_title STRING, topic_desc STRING)
64 changes: 2 additions & 62 deletions map_reduce/hive/student_teacher_count.q
@@ -1,6 +1,6 @@
-- Query to produce a time series of students and teachers using the site.
-- TODO(robert): Results produced by these queries are fundamentally flawed.
-- We would have to record the date of each coach change to be able to
-- produce an accurate report.

-- Arguments:
@@ -12,81 +12,21 @@
-- Student: User who has a teacher.
-- Active Student: Student who completed an activity in the last 28 days.

DROP TABLE IF EXISTS user_coach_date;
CREATE EXTERNAL TABLE IF NOT EXISTS user_coach_date(
user STRING,
coach STRING,
joined_on STRING,
self_coach BOOLEAN
) LOCATION 's3://ka-mapreduce/tmp/user_coach_date';

DROP TABLE IF EXISTS teacher_on_date;
CREATE EXTERNAL TABLE IF NOT EXISTS teacher_on_date(
teacher STRING,
dt STRING
) LOCATION 's3://ka-mapreduce/tmp/teacher_on_date';
SOURCE FILE s3://ka-mapreduce/code/hive/student_teacher_current.q;

DROP TABLE IF EXISTS active_teacher_on_date;
CREATE EXTERNAL TABLE IF NOT EXISTS active_teacher_on_date(
teacher STRING,
dt STRING
) LOCATION 's3://ka-mapreduce/tmp/active_teacher_on_date';

DROP TABLE IF EXISTS student_on_date;
CREATE EXTERNAL TABLE IF NOT EXISTS student_on_date(
student STRING,
teacher STRING,
dt STRING
) LOCATION 's3://ka-mapreduce/tmp/student_on_date';

DROP TABLE IF EXISTS active_student_on_date;
CREATE EXTERNAL TABLE IF NOT EXISTS active_student_on_date(
student STRING,
teacher STRING,
dt STRING
) LOCATION 's3://ka-mapreduce/tmp/active_student_on_date';

ADD FILE s3://ka-mapreduce/code/py/coach_reduce.py;
ADD FILE s3://ka-mapreduce/code/py/ka_udf.py;

-- Extract relevant information from UserData table
INSERT OVERWRITE TABLE user_coach_date
SELECT a.user, a.coach,
from_unixtime(cast(cast(a.joined AS FLOAT) AS INT), 'yyyy-MM-dd')
AS joined_on,
(a.coach = a.user or a.coach = a.user_email or
a.coach = a.user_id) as self_coach
FROM (
SELECT TRANSFORM(UserData.json)
USING 'ka_udf.py explode user,user_id,user_email,joined coaches'
AS user, user_id, user_email, joined, coach
FROM UserData
) a;

-- Custom map reduce routine - compute first day when
-- given coach became a teacher
-- refer to map_reduce/py/coach_reduce.py for reduce function
FROM (
SELECT user, coach, joined_on
FROM user_coach_date
WHERE NOT self_coach
DISTRIBUTE BY coach
SORT BY coach DESC, joined_on ASC
) st_date
INSERT OVERWRITE TABLE teacher_on_date
REDUCE st_date.user, st_date.coach, st_date.joined_on
USING 'coach_reduce.py teacher' AS teacher, dt;

-- Find first date when each user, coach pair became student, teacher pair
INSERT OVERWRITE TABLE student_on_date
SELECT u_dt.user, t_dt.teacher, IF(MIN(t_dt.dt) > MIN(u_dt.joined_on),
MIN(t_dt.dt), MIN(u_dt.joined_on)) AS dt
FROM user_coach_date u_dt
JOIN teacher_on_date t_dt
ON u_dt.coach = t_dt.teacher
WHERE NOT u_dt.self_coach
GROUP BY u_dt.user, t_dt.teacher;

-- Find all active students
-- Active student is a user who performed an action,
-- as defined by user_daily_activity, in the last 28 days
64 changes: 64 additions & 0 deletions map_reduce/hive/student_teacher_current.q
@@ -0,0 +1,64 @@
-- Common part of the teacher and student metrics.
-- Finds, for each coach/user, the first day on which they became a
-- teacher/student.

DROP TABLE IF EXISTS user_coach_date;
CREATE EXTERNAL TABLE IF NOT EXISTS user_coach_date(
user STRING,
coach STRING,
joined_on STRING,
self_coach BOOLEAN
) LOCATION 's3://ka-mapreduce/tmp/user_coach_date';

DROP TABLE IF EXISTS teacher_on_date;
CREATE EXTERNAL TABLE IF NOT EXISTS teacher_on_date(
teacher STRING,
dt STRING
) LOCATION 's3://ka-mapreduce/tmp/teacher_on_date';

DROP TABLE IF EXISTS student_on_date;
CREATE EXTERNAL TABLE IF NOT EXISTS student_on_date(
student STRING,
teacher STRING,
dt STRING
) LOCATION 's3://ka-mapreduce/tmp/student_on_date';

ADD FILE s3://ka-mapreduce/code/py/coach_reduce.py;
ADD FILE s3://ka-mapreduce/code/py/ka_udf.py;

-- Extract relevant information from UserData table
INSERT OVERWRITE TABLE user_coach_date
SELECT a.user, a.coach,
from_unixtime(cast(cast(a.joined AS FLOAT) AS INT), 'yyyy-MM-dd')
AS joined_on,
(a.coach = a.user or a.coach = a.user_email or
a.coach = a.user_id) as self_coach
FROM (
SELECT TRANSFORM(UserData.json)
USING 'ka_udf.py explode user,user_id,user_email,joined coaches'
AS user, user_id, user_email, joined, coach
FROM UserData
) a;
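
ka_udf.py is not part of this diff, but the 'explode' mode it implements can be read off the TRANSFORM clause above: emit one row per element of the coaches list, alongside the scalar fields. A hedged sketch (the JSON layout and field handling are assumptions):

import json
import sys

SCALARS = ['user', 'user_id', 'user_email', 'joined']

for line in sys.stdin:
    data = json.loads(line)
    prefix = [str(data.get(key, '')) for key in SCALARS]
    # One output row per coach, yielding the user, user_id, user_email,
    # joined, coach columns consumed by the query above.
    for coach in data.get('coaches', []):
        print '\t'.join(prefix + [coach])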

-- Custom map-reduce routine: compute the first day on which a given
-- coach became a teacher.
-- Refer to map_reduce/py/coach_reduce.py for the reduce function.
FROM (
SELECT user, coach, joined_on
FROM user_coach_date
WHERE NOT self_coach
DISTRIBUTE BY coach
SORT BY coach DESC, joined_on ASC
) st_date
INSERT OVERWRITE TABLE teacher_on_date
REDUCE st_date.user, st_date.coach, st_date.joined_on
USING 'coach_reduce.py teacher' AS teacher, dt;
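
coach_reduce.py is likewise not in this diff. A hedged sketch of its reduce side: because rows are distributed by coach and sorted by joined_on, the script can count a coach's students in order and emit the date the count crosses a teacher threshold (the threshold of 10 is an assumption, not a value confirmed by this commit):

import sys

TEACHER_THRESHOLD = 10  # assumed; see map_reduce/py/coach_reduce.py

prev_coach = None
students = 0
for line in sys.stdin:
    user, coach, joined_on = line.rstrip('\n').split('\t')
    if coach != prev_coach:
        prev_coach, students = coach, 0
    students += 1
    if students == TEACHER_THRESHOLD:
        # First date on which this coach qualifies as a teacher.
        print '%s\t%s' % (coach, joined_on)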

-- Find first date when each user, coach pair became student, teacher pair
INSERT OVERWRITE TABLE student_on_date
SELECT u_dt.user, t_dt.teacher, IF(MIN(t_dt.dt) > MIN(u_dt.joined_on),
MIN(t_dt.dt), MIN(u_dt.joined_on)) AS dt
FROM user_coach_date u_dt
JOIN teacher_on_date t_dt
ON u_dt.coach = t_dt.teacher
WHERE NOT u_dt.self_coach
GROUP BY u_dt.user, t_dt.teacher;
