In [3]:
# Add an event_classification field to the click stream logs

from pymongo import MongoClient
from pymongo.errors import BulkWriteError
import json

# Name of the database to work on; `db` is the handle for it, obtained from a
# MongoClient created with its default connection settings.
dbname = 'crime101x2014'
client = MongoClient()
db = client[dbname]

def event_classification(event):
    """Map a raw click stream event type to a coarse classification label.

    Each known label is tied to a marker substring; the label of the first
    marker (in table order) that occurs in ``event`` is returned.

    Args:
        event: Event type string of a log record, e.g. an event name such as
            ``play_video`` or a path-like string containing
            ``/discussion/forum/``.

    Returns:
        The matching label (e.g. ``"videoplay"``), or the string ``"None"``
        when ``event`` is empty/``None`` or contains no known marker.
    """
    # Checked top to bottom.  'discussion/forum/search' must precede
    # '/discussion/forum/': a forum search path contains both markers, and
    # with a plain dict the winner depended on dict iteration order.
    marker_labels = (
        ('discussion/forum/search', 'forumsearch'),
        ('/discussion/forum/', 'forumread'),
        ('/discussion/comments/', 'forumcommentread'),
        ('play_video', 'videoplay'),
        ('pause_video', 'videopause'),
        ('stop_video', 'videostop'),
        ('seek_video', 'videoseek'),
        ('speed_change_video', 'videospeedchange'),
        ('progress', 'checkprogress'),
        ('load_video', 'videoload'),
    )

    # Nothing to match against (also avoids a TypeError on None).
    if not event:
        return "None"

    for marker, label in marker_labels:
        if marker in event:
            return label

    return "None"

def process_clickstream(db):
    """Add an ``event_classification`` field to every clickstream document.

    Each document in ``db.clickstream`` is classified from its ``event_type``
    via ``event_classification`` and updated through one unordered bulk
    operation:

    * Forum search requests (``discussion/forum/search`` in the event type)
      are labelled ``forumsort`` when the request's ``GET`` parameters hold
      ``sort_key``, else ``forumsearch`` when they hold ``text``; otherwise
      the generic classification is kept.  These documents also get a
      ``searchtext`` field (the ``text`` parameter, or an empty string).
    * ``play_video`` / ``load_video`` events also get an ``event_id`` field
      copied from the ``id`` entry of the JSON-decoded event.
    * All other documents only get ``event_classification``.

    Args:
        db: PyMongo database handle exposing a ``clickstream`` collection.

    Bulk write errors are printed rather than raised.
    """
    # Fetch only the fields read below (``_id`` is returned by default).
    cursor = db.clickstream.find({}, {'event_type': 1, 'event': 1})

    bulk_op = db.clickstream.initialize_unordered_bulk_op()
    queued = 0
    for document in cursor:
        mongo_id = document['_id']
        event_type = document['event_type']
        event = document['event']  # JSON-encoded string
        classification = event_classification(event_type)

        if 'discussion/forum/search' in event_type:
            # A missing or null GET entry is treated as "no parameters".
            get_params = json.loads(event).get('GET') or {}
            searchtext = ""
            if "sort_key" in get_params:
                classification = "forumsort"
            elif "text" in get_params:
                classification = "forumsearch"
                searchtext = get_params['text']
            new_fields = {'event_classification': classification,
                          'searchtext': searchtext}
        elif event_type in ('play_video', 'load_video'):
            new_fields = {'event_classification': classification,
                          'event_id': json.loads(event)['id']}
        else:
            new_fields = {'event_classification': classification}

        bulk_op.find({'_id': mongo_id}).update({'$set': new_fields})
        queued += 1

    # Executing a bulk operation with nothing queued raises InvalidOperation,
    # so an empty collection is simply a no-op.
    if not queued:
        return

    try:
        bulk_op.execute()
    except BulkWriteError as bwe:
        print(bwe.details)
        
        
def test():
    """Print two sanity-check counts for one user's ``videoplay`` events.

    Reads the module-level ``db`` handle and, for the hard-coded user id
    2660628, prints:

    1. the number of distinct ``event_id`` values among that user's
       ``videoplay`` events, and
    2. the total number of that user's ``videoplay`` events.
    """
    # Single filter shared by both queries so they cannot drift apart.
    criteria = {"event_classification": "videoplay", "context.user_id": 2660628}

    # The former aggregate() call was dropped: its result was overwritten by
    # this distinct() query before ever being read.
    distinct_ids = db.clickstream.distinct("event_id", criteria)
    print(len(distinct_ids))

    results_count = db.clickstream.find(criteria).count()
    print(results_count)

In [4]:
# Run the classification pass over the clickstream collection of `db`.
process_clickstream(db)
#test()

[u'statistical']
[u'quiz']
[u'parkinson2012']
[u'planed and organised']
[u'']
[u'Nancy']
[u'show answer']
[u'brett']
[u'tunic']
[u'When is next lesson?']
[u'brett']
[u'tunic']
[u'verified']
[u'photo discrepancies']
[u'Hilary']
[u'clues']
[u'']
[u'ltc73']
[u'Nancy']
[u'']
[u'did we ever get a gender on the clothing']
[u'mark moore']
[u'translate']
[u'planed and organised']
[u'iadnanllb']
[u'planed and ']
[u'certificate']
[u'episode 1 - profiling']
[u'certificate']
[u'grading']
[u'progress']
[u'andyduggan']
[u'backpack']
[u'progress']
[u'progress']
[u'music']
[u'beggar']
[u'Early questions... risky answers']
[u'progress']
[u'progress']
[u'progress bar']
[u'expressive']
[u'celey']
[u'pkchabot']
[u'edit a post']
[u'profesional']
[u'bag']
[u'tunic']
[u'brett']
[u'by anonymous']
[u'anonymous']
[u'ildin']
[u'celestelawson']
[u'progress']
[u'interviewing']
[u'episode 2 topic']
[u'']
[u'cross cultural studies']
[u'video']
[u'transcript']
[u'The right technique']
[u'woman']
[u'Man']
[u'BRiKie']
