# Hidden Factor Hidden Topics
## Amazon Kindle Store Reviews Dataset
Dataset Link: http://jmcauley.ucsd.edu/data/amazon/
##### Sample Review

In [41]:

import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

import json

In [56]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
DATASET_SIZE = 30000
DATASET_FILE = 'AppReviews30k.json'
TRAIN_SET_FILE = 'train_15000.json'
TEST_SET_FILE = 'test_5000.json'





In [57]:
dataset_file = open(TRAIN_SET_FILE)
train_set = dataset_file.readlines()

dataset_file = open(TEST_SET_FILE)
test_set = dataset_file.readlines()

print len(train_set)
print len(test_set)

15000
5000


In [58]:
data = []
for line in train_set:
    data.append(json.loads(line)["reviewText"])
print data[0]

My son plays this game all the time. He loves Umizoomi. He has all the badges and trophies for the first two levels on here, but the others will not unlock. I don't know how many times you have to replay the same two games, but now he's had it for months and is sick of it because it's just to long to wait for something else to happen.


In [59]:
tf_vectorizer = CountVectorizer(strip_accents='unicode',
                                stop_words='english',
                                token_pattern=r'\b[a-zA-Z]{3,}\b',
                                max_df=0.5,
                                min_df=5
                               )
dtm_tf = tf_vectorizer.fit_transform(data)

In [60]:
print (tf_vectorizer.get_feature_names()[:10])
print (dtm_tf.shape)

[u'abc', u'abilities', u'ability', u'able', u'absolute', u'absolutely', u'accept', u'acceptable', u'access', u'accessed']
(15000, 4154)


In [61]:
NUMBER_OF_TOPICS = 5


lda_tf = LatentDirichletAllocation(n_components=NUMBER_OF_TOPICS, random_state=18)
lda_tf.fit(dtm_tf)



LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_components=5, n_jobs=1,
             n_topics=None, perp_tol=0.1, random_state=18,
             topic_word_prior=None, total_samples=1000000.0, verbose=0)

In [62]:


ReviewsByUser = {}
ItemReviews = {}
AverageRatingsByUsers = {}
AverageRatingsByItems = {}
UserHistory = {}
GlobalAverage = 0.0
for line in train_set:
    d = json.loads(line)
    user = d["reviewerID"]
    item = d["asin"]
    try: 
        ReviewsByUser[user] += ' ' + d["reviewText"]
    except:
        ReviewsByUser[user] = d["reviewText"]
    try: 
        ItemReviews[item] += ' ' + d["reviewText"]
    except:
        ItemReviews[item] = d["reviewText"]
    try:
        AverageRatingsByUsers[user] = (AverageRatingsByUsers[user][0]+float(d["overall"]), AverageRatingsByUsers[user][1]+1)
    except:
        AverageRatingsByUsers[user] = (float(d["overall"]), 1)
    try:
        AverageRatingsByItems[item] = (AverageRatingsByItems[item][0]+float(d["overall"]), AverageRatingsByItems[item][1]+1)
    except:
        AverageRatingsByItems[item] = (float(d["overall"]), 1)
    try:
        UserHistory[user].append(item) 
    except:
        UserHistory[user] = [item]
    
    GlobalAverage += float(d["overall"])

GlobalAverage /= len(train_set)

for k in AverageRatingsByUsers.keys():
    AverageRatingsByUsers[k] = AverageRatingsByUsers[k][0]*1.0/AverageRatingsByUsers[k][1]        
        
for k in AverageRatingsByItems.keys():
    AverageRatingsByItems[k] = AverageRatingsByItems[k][0]*1.0/AverageRatingsByItems[k][1]

# print AverageRatingsByItems
# print AverageRatingsByUsers
print GlobalAverage
print "done"

4.05813333333
done


In [63]:
import numpy as np

WordToTopicMapping = {}

tf_feature_names = tf_vectorizer.get_feature_names()

rows = NUMBER_OF_TOPICS # topics count
columns = len(tf_feature_names) # Total number of words
    
M = np.zeros((rows, columns))
total_of_columns = np.sum(lda_tf.components_, axis=0)
print len(total_of_columns)
for i in range(columns):
    column_sum = total_of_columns[i]
    for j in range(rows):
        M[j][i] = lda_tf.components_[j][i]*1.0/column_sum

M = np.transpose(M)
DocVecRep = CountVectorizer(vocabulary=tf_feature_names) # document vector representation

4154


In [64]:
# print M

def MakeProbabilityDistribution(Gamma):
    column_sums = np.sum(Gamma, axis=0)
    return np.divide(Gamma, 1.0*column_sums)

Gamma_i = np.zeros((len(ItemReviews.keys()), NUMBER_OF_TOPICS)) #   #items   X  topics

j = 0
# Calculating latent factors for items
ItemToIndexMapping = {}
for item in ItemReviews.keys():
    ItemToIndexMapping[item] = j
    review = ItemReviews[item]
    review_vector = DocVecRep.fit_transform([review]).toarray()
    Gamma_i[j] = np.matmul(review_vector, M)
    j += 1


Gamma_i = np.transpose(Gamma_i)
Gamma_i = MakeProbabilityDistribution(Gamma_i)
print Gamma_i.shape

Gamma_u = np.zeros((len(ReviewsByUser.keys()), NUMBER_OF_TOPICS))
j = 0
# Calculating latent factors for users
UserToIndexMapping = {}
for user in ReviewsByUser.keys():
    UserToIndexMapping[user] = j
    review = ReviewsByUser[user]
    review_vector = DocVecRep.fit_transform([review]).toarray()
    Gamma_u[j] = np.matmul(review_vector, M)
    j += 1

Gamma_u = np.transpose(Gamma_u)
Gamma_u = MakeProbabilityDistribution(Gamma_u)
Gamma_u = np.transpose(Gamma_u)
print Gamma_u.shape

# print UserToIndexMapping
# print ItemToIndexMapping

    

(5, 1842)
(3139, 5)


In [65]:

total_users = len(ReviewsByUser.keys())
total_items = len(ItemReviews.keys())

ActualRatingAvailability = np.zeros((total_users, total_items))

for line in train_set:
    d = json.loads(line)
    userId = d["reviewerID"]
    itemId = d["asin"]
    user = UserToIndexMapping[userId]
    item = ItemToIndexMapping[itemId]

In [67]:
def PredictUserToItemRating(userId, itemId):
    userIndex = UserToIndexMapping[userId]
    itemIndex = ItemToIndexMapping[itemId]
    gamma_user = Gamma_u[userIndex]
    gamma_item = Gamma_i[:, itemIndex]
    return np.dot(gamma_user, gamma_item)

def IsActuallyRated(userId, itemId):
    user = UserToIndexMapping[userId]
    item = ItemToIndexMapping[itemId]
    
    return ActualRatingAvailability[user][item] == 1

RECOMMENDATIONS = 5

def RecommendItemsToUser(userId):
    itemsList = ItemToIndexMapping.keys()
    predictions = []
    for item in itemsList:
        if not IsActuallyRated(userId, item):
            predictions.append((item, PredictUserToItemRating(userId, item)))
    sorted_by_predictions = sorted(predictions, key=lambda tup: tup[1], reverse=True)
    print len(sorted_by_predictions)
    print sorted_by_predictions[:min(RECOMMENDATIONS, len(predictions))]
    return sorted_by_predictions[:min(RECOMMENDATIONS, len(predictions))]

def GetRecommendationsForUser(userId):
    predictions = RecommendItemsToUser(userId)
    return [_[0] for _ in predictions]

cnt = 0
def GetEvaluationOnTestData():
    y_pred = []
    y_actual = []
    for line in test_set:
        d = json.loads(line)
        userId = d["reviewerID"]
        itemId = d["asin"]
        #print PredictUserToItemRating(userId, itemId)

        try:
            prediction = AverageRatingsByUsers[userId] + AverageRatingsByItems[itemId] - GlobalAverage + PredictUserToItemRating(userId, itemId)
            y_pred.append(prediction)
            y_actual.append(d["overall"])
        except:
            cnt += 1
            #print cnt,
            pass
            
            
    MAE = round(mean_absolute_error(y_actual, y_pred), 2)
    RMSE = round(np.sqrt(mean_squared_error(y_actual, y_pred)), 2)
    return (MAE, RMSE)

USER_ID = 'A00100742Q4O8VH0YMUBZ'
print GetEvaluationOnTestData()
history = UserHistory[USER_ID]
recommendations = GetRecommendationsForUser(USER_ID)
print recommendations
print history
print cnt

(1.03, 1.4)
1842
[(u'B0097C8XL0', 0.24626559069641446), (u'B00FDUH4WY', 0.24606602162718538), (u'B00IG63BVK', 0.24267680725076549), (u'B00821NZ74', 0.24180343372661123), (u'B00FPYJLCY', 0.24100694055317715)]
[u'B0097C8XL0', u'B00FDUH4WY', u'B00IG63BVK', u'B00821NZ74', u'B00FPYJLCY']
[u'B00ANT8OF6', u'B00AMR1HZ8', u'B004WGGQPQ', u'B006C1ZSO4', u'B008LY1B32', u'B007KPT2N4', u'B0080JJLBW', u'B009P8EMCK']
0


In [68]:
pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  topic_term_dists = topic_term_dists.ix[topic_order]


In [2]:
import json

# Converting json to votes file format
DATASET_FILE = 'Apps20k.json'
dataset_file = open(DATASET_FILE)
dataset_lines = dataset_file.readlines()

output_file = open('Apps20k.votes', 'w')
for line in dataset_lines:
    d = json.loads(line)
    output_file.write(d["reviewerID"] + ' ' + d["asin"]+ ' ' + str(d["overall"]) + ' ' + str(d["unixReviewTime"]) + ' ' + str(len(d["reviewText"].split())) + ' ' + d["reviewText"] + '\n')
output_file.close()

In [70]:
def GetURL(products_list):
    print products_list
    products = ','.join([str(_) for _ in products_list])
    url = "http://www.asinlab.com/php/convertfromasin.php?asin_num=" + products + "&id_type=UPC&bulk=true&x=false"
    return url
    
print GetURL(recommendations)
print GetURL(history)

[u'B0097C8XL0', u'B00FDUH4WY', u'B00IG63BVK', u'B00821NZ74', u'B00FPYJLCY']
http://www.asinlab.com/php/convertfromasin.php?asin_num=B0097C8XL0,B00FDUH4WY,B00IG63BVK,B00821NZ74,B00FPYJLCY&id_type=UPC&bulk=true&x=false
[u'B00ANT8OF6', u'B00AMR1HZ8', u'B004WGGQPQ', u'B006C1ZSO4', u'B008LY1B32', u'B007KPT2N4', u'B0080JJLBW', u'B009P8EMCK']
http://www.asinlab.com/php/convertfromasin.php?asin_num=B00ANT8OF6,B00AMR1HZ8,B004WGGQPQ,B006C1ZSO4,B008LY1B32,B007KPT2N4,B0080JJLBW,B009P8EMCK&id_type=UPC&bulk=true&x=false


In [39]:
DATASET_FILE = 'AppReviewsFull.json'
dataset_file = open(DATASET_FILE)
dataset_lines = dataset_file.readlines()[:30000]

output_file = open('AppReviews30k.json', 'w')
for line in dataset_lines:
    output_file.write(line.strip() + '\n')
output_file.close()