# svd_sk.py
# SVD based recommendation engine (command-line script).
import pandas as pd
import numpy as np
from Cython.Plex.Regexps import RE
from scipy.sparse.linalg import svds
import matplotlib.pyplot as plt
import os
from PIL import Image
from urllib2 import urlopen
import sys
import requests
from os import path
import argparse
import time
# Append the directory two levels above this file's directory to the module
# search path.
sys.path.append(os.path.join(os.path.abspath(os.path.dirname(__file__)), os.pardir, os.pardir))
# Absolute path of the directory that contains this file.
here = path.abspath(path.dirname(__file__))
def get_data(start_time, input_file_path):
    """Load the transaction CSV and index it by its ``user`` column.

    Args:
        start_time: ``time.time()`` value captured when the run began; it is
            only used to report the elapsed minutes.
        input_file_path: Path of the CSV file to read. The file must contain
            a ``user`` column.

    Returns:
        The loaded ``pandas.DataFrame`` with ``user`` as its index.
    """
    frame = pd.read_csv(input_file_path)
    frame = frame.set_index('user')
    print("Data import complete")
    minutes_taken = (time.time() - start_time) / 60
    print("Successfully imported data from %s in %s minutes" % (input_file_path, minutes_taken))
    return frame
def do_predictions(data, start_time, k=80):
    """Predict a score for every (user, item) pair with a truncated SVD.

    Each row of ``data`` is centred on its own mean, the centred matrix is
    factorised with ``scipy.sparse.linalg.svds`` and the rank-``k``
    reconstruction (plus the row means) is returned.

    Args:
        data: ``pandas.DataFrame`` of numeric values, one row per user.
        start_time: ``time.time()`` value captured when the run began; it is
            only used to report the elapsed minutes.
        k: Number of singular values to keep. ``svds`` requires
            ``k < min(data.shape)``, so larger values are clamped to
            ``min(data.shape) - 1``.

    Returns:
        ``pandas.DataFrame`` with the same index and columns as ``data``
        holding the predicted values.

    Raises:
        ValueError: If ``data`` has fewer than two rows or two columns, in
            which case no valid ``k`` exists.
    """
    # ``DataFrame.as_matrix()`` was removed in pandas 1.0; ``.values`` works
    # on every pandas version.
    data_matrix = np.asarray(data.values, dtype=float)

    n_factors = min(int(k), min(data_matrix.shape) - 1)
    if n_factors < 1:
        raise ValueError(
            "Need at least a 2x2 matrix for a truncated SVD, got shape %s"
            % (data_matrix.shape,))

    user_ratings_mean = np.mean(data_matrix, axis=1).reshape(-1, 1)
    data_normalised = data_matrix - user_ratings_mean
    elaps = (time.time() - start_time) / 60
    print("Data normalised in %s minutes" % str(elaps))

    U, sigma, Vt = svds(data_normalised, k=n_factors)
    elaps = (time.time() - start_time) / 60
    print("Data successfully decomposed into 3 matrices with %s singular values in %s minutes"
          % (n_factors, str(elaps)))

    # U * sigma scales each column of U, which equals U.dot(diag(sigma))
    # without materialising the diagonal matrix.
    all_user_predicted_ratings = np.dot(U * sigma, Vt) + user_ratings_mean
    predicted_df = pd.DataFrame(all_user_predicted_ratings, columns=data.columns, index=data.index)

    # Mean of the per-column mean squared errors; the square root turns it
    # into the RMSE that the message announces.
    mean_squared_error = data.sub(predicted_df).pow(2).mean().mean()
    print("Rmse values for doing svd on transaction data is " + str(np.sqrt(mean_squared_error)))

    elaps = (time.time() - start_time) / 60
    print("Data prediction completed and loaded into dataframe in %s minutes"
          % str(elaps))
    print("Done predicting the ratings for all users and all items")
    return predicted_df
def get_product_image(user_data):
    """Download the picture associated with a product name.

    Args:
        user_data: Product name to look up in the built-in table of image
            URLs.

    Returns:
        The name of the file the image was written to
        (``user_data + '.jpg'``, relative to the current working directory),
        or ``None`` when no URL is known for ``user_data``.

    Raises:
        requests.RequestException: If the download fails, times out or the
            server answers with an HTTP error status.
    """
    # NOTE(review): the "jack johnson" URL file name mentions a different
    # artist - confirm the link is the intended one.
    prod_links = {
        "red hot chili peppers": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQ1XeuRyHbmHL_vXtm6zesptPN4UO3CSGy1Kb7IS0Hv9Mv2m0iT",
        "the killers": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQcawUk4qsSmE6kDaZHh_jNrzblblcbCgJ4eprGkEOHQFJzzgmvyA",
        "jack johnson": "https://upload.wikimedia.org/wikipedia/en/thumb/4/42/Schandmaul-Leuchtfeuer.jpg/220px-Schandmaul-Leuchtfeuer.jpg"}

    image_url = prod_links.get(user_data)
    if image_url is None:
        return None

    image = user_data + '.jpg'
    # Without a timeout a stalled server would block the script forever.
    response = requests.get(image_url, timeout=10)
    # Fail loudly instead of saving an HTML error page under a .jpg name.
    response.raise_for_status()
    with open(image, 'wb') as handler:
        handler.write(response.content)
    return image
def merge_images(image_list, output_path='user1_recommendations.jpg'):
    """Paste images side by side into one picture, then delete the inputs.

    Args:
        image_list: Iterable of image file paths. ``None`` or empty entries
            are ignored.
        output_path: Where the merged image is saved. Defaults to
            ``'user1_recommendations.jpg'``.

    Returns:
        None. When there is nothing to merge, no file is written or removed.
    """
    paths = [p for p in (image_list or []) if p]
    if not paths:
        return

    # A real list is required: the images are walked twice (sizes, then
    # pasting) and a one-shot ``map`` iterator would be exhausted after the
    # first pass on Python 3, yielding a blank output image.
    images = []
    try:
        for p in paths:
            images.append(Image.open(p))

        total_width = sum(im.size[0] for im in images)
        max_height = max(im.size[1] for im in images)

        new_im = Image.new('RGB', (total_width, max_height))
        x_offset = 0
        for im in images:
            new_im.paste(im, (x_offset, 0))
            x_offset += im.size[0]
        new_im.save(output_path)
    finally:
        # Release the file handles before the sources are deleted.
        for im in images:
            im.close()

    for p in paths:
        os.remove(p)
def get_user_rating(userid, file, topk, start_time):
    """Print the first ``topk`` recorded recommendations of one user.

    The records file is tab separated without a header; its first column is
    the user id and the remaining columns are the recommended items.

    For user id 1 with exactly three items, the product images are also
    downloaded and merged into a single picture.

    Args:
        userid: User id, as an int or a string holding an int.
        file: Path of the tab-separated records file.
        topk: Number of leading items to report.
        start_time: ``time.time()`` value captured when the run began; it is
            only used to report the elapsed minutes.

    Returns:
        List with the user's first ``topk`` items, or an empty list when the
        user id is not present in the records (or is not an integer).
    """
    image_list = []
    print("Fetching the records for user %s" % str(userid))

    # Normalise once so the membership test, the row lookup and the
    # "user 1" check all agree even when the id arrives as a string.
    try:
        uid = int(userid)
    except (TypeError, ValueError):
        uid = None

    records = pd.read_csv(file, sep="\t", header=None, index_col=0)
    if uid is None or uid not in records.index:
        print("You have entered an invalid userid")
        print("A user with userid %s doesnt exist in our records" % str(userid))
        return []

    top_items = records.loc[uid][:topk]
    user_data_list = list(top_items)
    print("For user id " + str(userid) + " top " + str(topk) + " predictions are: \n" + str(top_items))

    if uid == 1 and len(user_data_list) == 3:
        for item in user_data_list:
            image_list.append(get_product_image(item))
        merge_images(image_list)

    elaps = (time.time() - start_time) / 60
    print("Records successfully fetched for user %s in %s minutes" % (str(userid),
                                                                      str(elaps)))
    return user_data_list
def is_user_valid(userid, file):
    """Tell whether ``userid`` appears in the records file.

    Args:
        userid: User id, as an int or a string holding an int.
        file: Path of the tab-separated, header-less records file whose first
            column is the user id.

    Returns:
        True if the id is present in the first column, False otherwise
        (including when ``userid`` cannot be converted to an int).
    """
    try:
        uid = int(userid)
    except (TypeError, ValueError):
        return False
    # Only the id column is needed, so skip parsing the item columns.
    user_ids = pd.read_csv(file, sep="\t", header=None, usecols=[0])[0]
    return bool((user_ids == uid).any())
def _write_recommendations(data, predicted_df, output_records_path):
    """Write one line per user: the id, then all items by descending score."""
    with open(output_records_path, "w") as text_file:
        for userid in data.index:
            # NOTE(review): the original computed the items the user has a 0
            # for and called ``Series.filter(like=item)`` on each, but threw
            # the result away, so every item has always been written. Keeping
            # that behaviour: restricting rows to unrated items would make
            # the rows ragged, which the header-less ``read_csv`` readers of
            # this file cannot parse.
            ranked = predicted_df.loc[userid].sort_values(ascending=False)
            items = "\t".join(str(item) for item in ranked.index)
            text_file.write(str(userid) + "\t" + items + "\n")


def generate_ratings(uid, input_file_path, output_records_path, topk=5, refresh="no"):
    """Show recommendations for a user, (re)building the records if needed.

    When ``refresh`` is "no" and the records file already exists, the user's
    recommendations are read straight from it. Otherwise the transaction data
    is loaded, predictions are computed, the records file is rewritten and
    the user's recommendations are then read from the new file.

    Args:
        uid: User id, as an int or a string holding an int.
        input_file_path: Path of the transaction CSV (read only when the
            records are rebuilt).
        output_records_path: Path of the tab-separated records file.
        topk: Number of recommendations to show; ``None`` means the default
            of 5.
        refresh: "no" (case-insensitive) or ``None`` to reuse existing
            records; any other value forces a rebuild.

    Returns:
        None.

    Raises:
        ValueError: If ``uid`` or ``topk`` cannot be converted to an int.
    """
    uid = int(uid)
    # The command line passes None for options that were not supplied.
    topk = 5 if topk is None else int(topk)
    refresh = "no" if refresh is None else str(refresh).strip().lower()

    start_time = time.time()
    print("Starting SVD based recommendation engine")

    records_exist = os.path.isfile(output_records_path)
    if refresh == "no" and records_exist:
        print(
            "Since the refresh is not requested and a local copy of record is found hence rendering the response")
        print("Existing record found....generating recommendation from the records")
        get_user_rating(uid, output_records_path, topk, start_time)
        return

    if records_exist:
        print("Since the user requested new recommendations hence resubmitting the job")
        # Avoid the expensive rebuild for a user the old records don't know.
        if not is_user_valid(uid, output_records_path):
            print("No such user found , enter a valid uid")
            return
    else:
        print("Generating new recommendation as no previous records found")

    data = get_data(start_time, input_file_path)
    predicted_df = do_predictions(data, start_time)
    _write_recommendations(data, predicted_df, output_records_path)

    print("The recommendations have been generated")
    print("0;All recommendations generated are written to %s in %s minutes" % (str(output_records_path),
                                                                              str((
                                                                                  time.time() - start_time) / 60)))
    get_user_rating(uid, output_records_path, topk, start_time)
if __name__ == '__main__':
    # Command-line entry point: parse the options and run the engine.
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', help='Location of input transaction data', required=True)
    parser.add_argument('--output', help='Location of output recommendations', required=True)
    parser.add_argument('--userid', help='UserId for which you want to get recommendations', required=True)
    # Explicit defaults: an omitted option would otherwise be forwarded as
    # None and override the defaults declared by generate_ratings.
    parser.add_argument('--n', help='No of top recommendations', type=int, default=5)
    parser.add_argument('--refresh', help='Whether you want new recommendations("yes"/"no")', default="no")
    args = parser.parse_args()
    input_file_path = args.input
    output_records_path = args.output
    # Only the actual work is guarded, so ``parser`` is always bound when the
    # handler runs.
    try:
        generate_ratings(args.userid, input_file_path, output_records_path, args.n, args.refresh)
    except Exception as e:
        parser.print_help()
        print("Exception occurred due to ", e)
        # Report the failure to the calling shell instead of exiting with 0.
        sys.exit(1)
# Example usage:
# python svd_sk.py --output /home/khanna/codebase/recommendation-engine/python/records/svd_based_recommendation.txt --input /home/khanna/codebase/recommendation-engine/python/data/transaction_data.csv --userid 1 --n 5 --refresh "yes"