# Hidden Factor Hidden Topics
## Amazon Kindle Store Reviews Dataset
Dataset Link: http://jmcauley.ucsd.edu/data/amazon/
##### Sample Review

In [41]:

import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

import json

In [42]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
DATASET_SIZE = 30000
DATASET_FILE = 'AppReviews30k.json'
TRAINING_END_FRACTION = 0.8
TESTING_START_FRACTION = 0.9

In [44]:
dataset_file = open(DATASET_FILE)
dataset_lines = dataset_file.readlines()

train_set = dataset_lines[:int(TRAINING_END_FRACTION*DATASET_SIZE)]
test_set = dataset_lines[int(TESTING_START_FRACTION*DATASET_SIZE):]
print len(train_set)
print len(test_set)

In [45]:


data = []
for line in train_set:
    data.append(json.loads(line)["reviewText"])
print data[0]

24000
3000
Loves the song, so he really couldn't wait to play this. A little less interesting for him so he doesn't play long, but he is almost 3 and likes to play the older games, but really cute for a younger child.


In [47]:
tf_vectorizer = CountVectorizer(strip_accents='unicode',
                                stop_words='english',
                                token_pattern=r'\b[a-zA-Z]{3,}\b',
                                max_df=0.5,
                                min_df=5
                               )
dtm_tf = tf_vectorizer.fit_transform(data)

In [48]:
print (tf_vectorizer.get_feature_names()[:10])
print (dtm_tf.shape)

[u'abandoned', u'abilities', u'ability', u'able', u'absolute', u'absolutely', u'abstract', u'abundance', u'abuse', u'accelerometer']
(24000, 5867)


In [49]:
NUMBER_OF_TOPICS = 5


lda_tf = LatentDirichletAllocation(n_components=NUMBER_OF_TOPICS, random_state=18)
lda_tf.fit(dtm_tf)



LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_components=5, n_jobs=1,
             n_topics=None, perp_tol=0.1, random_state=18,
             topic_word_prior=None, total_samples=1000000.0, verbose=0)

In [50]:


ReviewsByUser = {}
ItemReviews = {}
AverageRatingsByUsers = {}
AverageRatingsByItems = {}
UserHistory = {}
GlobalAverage = 0.0
for line in train_set:
    d = json.loads(line)
    user = d["reviewerID"]
    item = d["asin"]
    try: 
        ReviewsByUser[user] += ' ' + d["reviewText"]
    except:
        ReviewsByUser[user] = d["reviewText"]
    try: 
        ItemReviews[item] += ' ' + d["reviewText"]
    except:
        ItemReviews[item] = d["reviewText"]
    try:
        AverageRatingsByUsers[user] = (AverageRatingsByUsers[user][0]+float(d["overall"]), AverageRatingsByUsers[user][1]+1)
    except:
        AverageRatingsByUsers[user] = (float(d["overall"]), 1)
    try:
        AverageRatingsByItems[item] = (AverageRatingsByItems[item][0]+float(d["overall"]), AverageRatingsByItems[item][1]+1)
    except:
        AverageRatingsByItems[item] = (float(d["overall"]), 1)
    try:
        UserHistory[user].append(item) 
    except:
        UserHistory[user] = [item]
    
    GlobalAverage += float(d["overall"])

GlobalAverage /= len(train_set)

for k in AverageRatingsByUsers.keys():
    AverageRatingsByUsers[k] = AverageRatingsByUsers[k][0]*1.0/AverageRatingsByUsers[k][1]        
        
for k in AverageRatingsByItems.keys():
    AverageRatingsByItems[k] = AverageRatingsByItems[k][0]*1.0/AverageRatingsByItems[k][1]

# print AverageRatingsByItems
# print AverageRatingsByUsers
print GlobalAverage
print "done"

4.03570833333
done


In [51]:
import numpy as np

WordToTopicMapping = {}

tf_feature_names = tf_vectorizer.get_feature_names()

rows = NUMBER_OF_TOPICS # topics count
columns = len(tf_feature_names) # Total number of words
    
M = np.zeros((rows, columns))
total_of_columns = np.sum(lda_tf.components_, axis=0)
print len(total_of_columns)
for i in range(columns):
    column_sum = total_of_columns[i]
    for j in range(rows):
        M[j][i] = lda_tf.components_[j][i]*1.0/column_sum

M = np.transpose(M)
DocVecRep = CountVectorizer(vocabulary=tf_feature_names) # document vector representation

5867


In [52]:
# print M

def MakeProbabilityDistribution(Gamma):
    column_sums = np.sum(Gamma, axis=0)
    return np.divide(Gamma, 1.0*column_sums)

Gamma_i = np.zeros((len(ItemReviews.keys()), NUMBER_OF_TOPICS)) #   #items   X  topics

j = 0
# Calculating latent factors for items
ItemToIndexMapping = {}
for item in ItemReviews.keys():
    ItemToIndexMapping[item] = j
    review = ItemReviews[item]
    review_vector = DocVecRep.fit_transform([review]).toarray()
    Gamma_i[j] = np.matmul(review_vector, M)
    j += 1


Gamma_i = np.transpose(Gamma_i)
Gamma_i = MakeProbabilityDistribution(Gamma_i)
print Gamma_i.shape

Gamma_u = np.zeros((len(ReviewsByUser.keys()), NUMBER_OF_TOPICS))
j = 0
# Calculating latent factors for users
UserToIndexMapping = {}
for user in ReviewsByUser.keys():
    UserToIndexMapping[user] = j
    review = ReviewsByUser[user]
    review_vector = DocVecRep.fit_transform([review]).toarray()
    Gamma_u[j] = np.matmul(review_vector, M)
    j += 1

Gamma_u = np.transpose(Gamma_u)
Gamma_u = MakeProbabilityDistribution(Gamma_u)
Gamma_u = np.transpose(Gamma_u)
print Gamma_u.shape

# print UserToIndexMapping
# print ItemToIndexMapping

    

(5, 329)
(17453, 5)


  """


In [53]:

total_users = len(ReviewsByUser.keys())
total_items = len(ItemReviews.keys())

ActualRatingAvailability = np.zeros((total_users, total_items))

for line in train_set:
    d = json.loads(line)
    userId = d["reviewerID"]
    itemId = d["asin"]
    user = UserToIndexMapping[userId]
    item = ItemToIndexMapping[itemId]

In [55]:
def PredictUserToItemRating(userId, itemId):
    userIndex = UserToIndexMapping[userId]
    itemIndex = ItemToIndexMapping[itemId]
    gamma_user = Gamma_u[userIndex]
    gamma_item = Gamma_i[:, itemIndex]
    return np.dot(gamma_user, gamma_item)

def IsActuallyRated(userId, itemId):
    user = UserToIndexMapping[userId]
    item = ItemToIndexMapping[itemId]
    
    return ActualRatingAvailability[user][item] == 1

RECOMMENDATIONS = 5

def RecommendItemsToUser(userId):
    itemsList = ItemToIndexMapping.keys()
    predictions = []
    for item in itemsList:
        if not IsActuallyRated(userId, item):
            predictions.append((item, PredictUserToItemRating(userId, item)))
    sorted_by_predictions = sorted(predictions, key=lambda tup: tup[1], reverse=True)
    print len(sorted_by_predictions)
    print sorted_by_predictions[:min(RECOMMENDATIONS, len(predictions))]
    return sorted_by_predictions[:min(RECOMMENDATIONS, len(predictions))]

def GetRecommendationsForUser(userId):
    predictions = RecommendItemsToUser(userId)
    return [_[0] for _ in predictions]

def GetEvaluationOnTestData():
    y_pred = []
    y_actual = []
    cnt = 0
    for line in test_set:
        d = json.loads(line)
        userId = d["reviewerID"]
        itemId = d["asin"]
        #print PredictUserToItemRating(userId, itemId)

        try:
            prediction = AverageRatingsByUsers[userId] + AverageRatingsByItems[itemId] - GlobalAverage + PredictUserToItemRating(userId, itemId)
            y_pred.append(prediction)
            y_actual.append(d["overall"])
        except:
            cnt += 1
            print cnt,
            pass
            
            
    MAE = round(mean_absolute_error(y_actual, y_pred), 2)
    RMSE = round(np.sqrt(mean_squared_error(y_actual, y_pred)), 2)
    return (MAE, RMSE)

USER_ID = 'A00100742Q4O8VH0YMUBZ'
print GetEvaluationOnTestData()
history = UserHistory[USER_ID]
recommendations = GetRecommendationsForUser(USER_ID)
print recommendations
print history
print cnt

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 

 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273

ValueError: Found array with 0 sample(s) (shape=(0,)) while a minimum of 1 is required.

In [38]:
pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  topic_term_dists = topic_term_dists.ix[topic_order]


In [40]:

# Converting json to votes file format
DATASET_FILE = 'AppReviews30k.json'
dataset_file = open(DATASET_FILE)
dataset_lines = dataset_file.readlines()

output_file = open('AppReviews30k.votes', 'w')
for line in dataset_lines:
    d = json.loads(line)
    output_file.write(d["reviewerID"] + ' ' + d["asin"]+ ' ' + str(d["overall"]) + ' ' + str(d["unixReviewTime"]) + ' ' + str(len(d["reviewText"].split())) + ' ' + d["reviewText"] + '\n')
output_file.close()

In [24]:
def GetURL(products_list):
    print products_list
    products = ','.join([str(_) for _ in products_list])
    url = "http://www.asinlab.com/php/convertfromasin.php?asin_num=" + products + "&id_type=UPC&bulk=true&x=false"
    return url
    
print GetURL(recommendations)
print GetURL(history)

[u'B007VJ2KEC', u'B0051MYL8E', u'B001LTVCBU', u'B0043GCVUY', u'B004VB5JDW']
http://www.asinlab.com/php/convertfromasin.php?asin_num=B007VJ2KEC,B0051MYL8E,B001LTVCBU,B0043GCVUY,B004VB5JDW&id_type=UPC&bulk=true&x=false
[u'B00353PWAW', u'B0062BHOOG', u'B0014FKI1Q', u'B000EH4V18']
http://www.asinlab.com/php/convertfromasin.php?asin_num=B00353PWAW,B0062BHOOG,B0014FKI1Q,B000EH4V18&id_type=UPC&bulk=true&x=false


In [39]:
DATASET_FILE = 'AppReviewsFull.json'
dataset_file = open(DATASET_FILE)
dataset_lines = dataset_file.readlines()[:30000]

output_file = open('AppReviews30k.json', 'w')
for line in dataset_lines:
    output_file.write(line.strip() + '\n')
output_file.close()