__Import all packages needed__

In [1]:
from pymongo import MongoClient
from pprint import pprint
import json
from bson.code import Code

__1. Creates a MongoDB database called "amazon".__

In [2]:
client = MongoClient('localhost', 27017)
db = client.amazon

__2. Reads "reviews_electronics.16.json" and uploads each review as a separate document to the collection "reviews" in the database "amazon".__

In [3]:
reviews = db.reviews
review_json = open('reviews_electronics.16.json', 'r')
review_info = [json.loads(line) for line in review_json]
reviews.insert_many(review_info)

<pymongo.results.InsertManyResult at 0x13331d460>

__3. Uses MongoDB's map reduce function to build a new collection "avg_scores" that averages review scores by product ("asin"). Print the first 100 entries of "avg_scores" to screen.__

In [81]:
mapf = Code('''
function() { emit(this.asin, this.overall) }
''')

reducef = Code('''
function(key, value) { return Array.avg(value) }
''')

avg_score = reviews.map_reduce(mapf, reducef, 'avg_score', limit = 100)

for score in avg_score.find():
    print (score)

{'_id': '0132793040', 'value': 5.0}
{'_id': 'B00E4KP4W6', 'value': 4.545454545454546}
{'_id': 'B00E4KP8VI', 'value': 5.0}
{'_id': 'B00E4KPMC8', 'value': 2.0}
{'_id': 'B00E4KQ5C4', 'value': 5.0}
{'_id': 'B00E4KQ9GG', 'value': 3.2857142857142856}
{'_id': 'B00E4KQ9K2', 'value': 5.0}
{'_id': 'B00E4KQD4E', 'value': 4.0}
{'_id': 'B00E4KZBX8', 'value': 4.0}
{'_id': 'B00E4KZDJ0', 'value': 5.0}
{'_id': 'B00E4L35DA', 'value': 4.0}
{'_id': 'B00E4L3N9Q', 'value': 4.0}
{'_id': 'B00E4L48EA', 'value': 5.0}
{'_id': 'B00E4L7FLI', 'value': 1.0}
{'_id': 'B00E4L7TS2', 'value': 4.0}
{'_id': 'B00E4LAL82', 'value': 3.0}
{'_id': 'B00E4LBZZK', 'value': 5.0}
{'_id': 'B00E4LF2Z4', 'value': 4.333333333333333}
{'_id': 'B00E4LFP0G', 'value': 4.444444444444445}
{'_id': 'B00E4LFWWW', 'value': 4.4}
{'_id': 'B00E4LGTVU', 'value': 3.9310344827586206}


__4. Uses MongoDB's map reduce function to build a new collection "weighted_avg_scores" that averages review scores by product ("asin"), weighted by the number of helpful votes (The base weight is 1 and for every additional helpful vote add 1 to weight. e.g. a "[3, 5]" value on "helpful" column should use 3 + 1 = 4 as weight, 3 being the additional votes and 1 being the base weight). Print the first 100 entries of "weighted_avg_scores" to screen.__

In [171]:
mapf = Code('''
function() { 
    key = this.asin,
    value = {
        weight_score: ( this.helpful[0] + 1 ) * this.overall,
        total_weight: this.helpful[0] + 1
        },
    emit( key, value )
    }
''')

reducef = Code('''
function(key, countObjVals) { 
    reducedVal = { total_weight: 0, weight_score: 0}
    for ( idx = 0; idx < countObjVals.length; idx++){
        reducedVal.total_weight += countObjVals[idx].total_weight;
        reducedVal.weight_score += countObjVals[idx].weight_score;
    }
    return reducedVal
    }
''')

finalizef = Code('''
function (key, reducedVal) {
reducedVal.weighted_avg = reducedVal.weight_score/reducedVal.total_weight
return reducedVal['weighted_avg']
}
''')

avg_score = reviews.map_reduce(mapf, reducef, 'weight_avg_score', finalize = finalizef, limit = 100)

for score in avg_score.find():
    print (score)

{'_id': '0132793040', 'value': 5.0}
{'_id': 'B00E4KP4W6', 'value': 4.684210526315789}
{'_id': 'B00E4KP8VI', 'value': 5.0}
{'_id': 'B00E4KPMC8', 'value': 2.0}
{'_id': 'B00E4KQ5C4', 'value': 5.0}
{'_id': 'B00E4KQ9GG', 'value': 3.6875}
{'_id': 'B00E4KQ9K2', 'value': 5.0}
{'_id': 'B00E4KQD4E', 'value': 4.0}
{'_id': 'B00E4KZBX8', 'value': 4.0}
{'_id': 'B00E4KZDJ0', 'value': 5.0}
{'_id': 'B00E4L35DA', 'value': 3.0}
{'_id': 'B00E4L3N9Q', 'value': 4.0}
{'_id': 'B00E4L48EA', 'value': 5.0}
{'_id': 'B00E4L7FLI', 'value': 1.0}
{'_id': 'B00E4L7TS2', 'value': 4.0}
{'_id': 'B00E4LAL82', 'value': 3.0}
{'_id': 'B00E4LBZZK', 'value': 5.0}
{'_id': 'B00E4LF2Z4', 'value': 4.153846153846154}
{'_id': 'B00E4LFP0G', 'value': 4.434782608695652}
{'_id': 'B00E4LFWWW', 'value': 4.4}
{'_id': 'B00E4LGTVU', 'value': 3.8857142857142857}
