In [1]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 
warnings.filterwarnings("ignore", category=FutureWarning)

In [2]:
import pandas as pd
import gzip
import json

In [3]:
def parse(path):
  g = gzip.open(path, 'rb')
  for l in g:
    yield json.loads(l)
def getNice(path):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1
  return pd.DataFrame.from_dict(df, orient='index')

# Датасет с сайта Amazon с музыкальными инструментами

In [4]:
data = getNice('./reviews_Musical_Instruments_5.json.gz')

In [5]:
data.shape

(10261, 9)

In [6]:
data.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A2IBPI20UZIR0U,1384719342,"cassandra tu ""Yeah, well, that's just like, u...","[0, 0]","Not much to write about here, but it does exac...",5.0,good,1393545600,"02 28, 2014"
1,A14VAT5EAX3D9S,1384719342,Jake,"[13, 14]",The product does exactly as it should and is q...,5.0,Jake,1363392000,"03 16, 2013"
2,A195EZSQDW3E21,1384719342,"Rick Bennette ""Rick Bennette""","[1, 1]",The primary job of this device is to block the...,5.0,It Does The Job Well,1377648000,"08 28, 2013"
3,A2C00NNG1ZQQG2,1384719342,"RustyBill ""Sunday Rocker""","[0, 0]",Nice windscreen protects my MXL mic and preven...,5.0,GOOD WINDSCREEN FOR THE MONEY,1392336000,"02 14, 2014"
4,A94QU4C90B1AX,1384719342,SEAN MASLANKA,"[0, 0]",This pop filter is great. It looks and perform...,5.0,No more pops when I record my vocals.,1392940800,"02 21, 2014"


В данном датасете рейтинг (оценка) находится в колонке "overall"

In [7]:
data['overall'].value_counts()

overall
5.0    6938
4.0    2084
3.0     772
2.0     250
1.0     217
Name: count, dtype: int64

Сколько отдельных покупателей ставили оценки товарам:

In [8]:
data['reviewerID'].nunique()

1429

Сколько уникальных товаров есть в датасете:

In [9]:
data['asin'].nunique()

900

Количества покупателей и товаров сопоставимы, покупателей немного больше.

Для начала попробую применить метод коллаборативной фильтрации и посмотреть, что получится на таком маленьком датасете.

In [10]:
import numpy as np
from tqdm import tqdm_notebook
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.base import BaseEstimator

# Разбиение на train и test

Отбираю 20% последних оценок от каждого пользователя на тестирование, остальное пойдет на обучение модели.

In [11]:
def train_test_split(X, ratio=0.2):
    X.sort_values(by=['unixReviewTime'], inplace=True)
    userIDs = X['reviewerID'].unique()
    X_train_data = []
    X_test_data = []
    y_train = []
    y_test = []
    for userID in tqdm_notebook(userIDs):
        curUser = X[X['reviewerID'] == userID]
        idx = int(curUser.shape[0] * (1 - ratio))
        X_train_data.append(curUser[['reviewerID', 'asin']].iloc[:idx, :].values)
        X_test_data.append(curUser[['reviewerID', 'asin']].iloc[idx:, :].values)
        y_train.append(curUser['overall'].values[:idx])
        y_test.append(curUser['overall'].values[idx:])
    X_train = pd.DataFrame(np.vstack(X_train_data), columns=['reviewerID', 'asin'])
    X_test = pd.DataFrame(np.vstack(X_test_data), columns=['reviewerID', 'asin'])
    y_train = np.hstack(y_train)
    y_test = np.hstack(y_test)
    return X_train, X_test, y_train, y_test

In [12]:
X_train, X_test, y_train, y_test = train_test_split(data)

  0%|          | 0/1429 [00:00<?, ?it/s]

In [13]:
X_train.shape, X_test.shape

((7722, 2), (2539, 2))

In [14]:
X_train.head()

Unnamed: 0,reviewerID,asin
0,AV8MDYLHHTUOY,B000CD3QY2
1,AV8MDYLHHTUOY,B008BPI2HE
2,AV8MDYLHHTUOY,B0002Z2D0I
3,AV8MDYLHHTUOY,B0002Z2D08
4,A33H0WC9MI8OVW,B0002D0COE


In [15]:
y_train

array([4., 4., 5., ..., 5., 5., 5.])

# Коллоборативная фильтрация: User-based model
- на основе известных оценок товаров предсказываю оценку товару для данного покупателя, учитывая с большим весом оценки похожих покупателей
- похожесть покупателей вычисляю на основе косинусного сходства векторов их оценок

In [16]:
class UserBased(BaseEstimator):
    def fit(self, X, y, user_col='reviewerID', item_col='asin'):
        X = X.copy()
        self.users = X[user_col].unique()
        self.items = X[item_col].unique()
        X['y'] = y
        self.mean_y_user = X.groupby(user_col)['y'].mean()
        self.mean_y_item = X.groupby(item_col)['y'].mean()
        X['y'] -= X[user_col].apply(lambda x: self.mean_y_user[x])
        self.user_ratings = pd.pivot_table(X, values='y', index=user_col,
                                           columns=item_col, fill_value=0)
        self.user_sim = cosine_similarity(self.user_ratings)
        self.user_pos = dict()
        for user in self.users:
            self.user_pos[user] = np.argwhere(self.user_ratings.index.values == user)[0][0]
        return self

    def predict_rating(self, pr_user, pr_item):
        if not pr_item in self.items or not pr_user in self.users:
            return 0
        numerator = self.user_sim[self.user_pos[pr_user]].dot(
                        self.user_ratings.loc[:, pr_item])
        denominator = np.abs(self.user_sim[self.user_pos[pr_user]]).sum() - 1
        return self.mean_y_user[pr_user] + numerator / denominator
    def predict(self, X, user_col='reviewerID', item_col='asin'):
        y = X[[user_col, item_col]].apply(lambda row: self.predict_rating(row[0], row[1]), axis=1)
        return y

In [17]:
rmse = lambda y_true, y_pred: np.sqrt(mean_squared_error(y_true, y_pred))

In [18]:
model = UserBased().fit(X_train, y_train)

In [19]:
print('User-based Colloborative filtering RMSE = {}'.format(rmse(y_test, model.predict(X_test))))

User-based Colloborative filtering RMSE = 1.0338196858363196


# Коллоборативная фильтрация: Item-based model
- на основе известных оценок товаров от покупателя предсказываю оценки других товаров для этого покупателя, учитывая с большим весом похожие товары
- похожесть товаров также вычислим на основе косинусного сходства векторов их оценок


In [20]:
class ItemBased(BaseEstimator):
    def fit(self, X, y, user_col='reviewerID', item_col='asin'):
        X = X.copy()
        self.users = X[user_col].unique()
        self.items = X[item_col].unique()
        X['y'] = y
        self.mean_y_user = X.groupby(user_col)['y'].mean()
        self.mean_y_item = X.groupby(item_col)['y'].mean()
        X['y'] -= X[item_col].apply(lambda x: self.mean_y_item[x])
        self.item_ratings = pd.pivot_table(X, values='y', index=item_col,
                                           columns=user_col, fill_value=0)
        self.item_sim = cosine_similarity(self.item_ratings)
        self.item_pos = dict()
        for item in self.items:
            self.item_pos[item] = np.argwhere(self.item_ratings.index.values == item)[0][0]
        return self
    def predict_rating(self, pr_user, pr_item):
        if not pr_item in self.items or not pr_user in self.users:
            return 0
        numerator = self.item_sim[self.item_pos[pr_item]].dot(
                        self.item_ratings.loc[:, pr_user])
        denominator = np.abs(self.item_sim[self.item_pos[pr_item]]).sum() - 1
        return self.mean_y_item[pr_item] + numerator / denominator

    def predict(self, X, user_col='reviewerID', item_col='asin'):
        y = X[[user_col, item_col]].apply(lambda row: self.predict_rating(row[0], row[1]), axis=1)
        return y

In [21]:
model = ItemBased().fit(X_train, y_train)

In [22]:
print('Item-based Colloborative filtering RMSE = {}'.format(rmse(y_test, model.predict(X_test))))

Item-based Colloborative filtering RMSE = 1.071995624542344


Ошибки в случае User-based и Item-based очень похожи, но в User-based RMSE все же немного ниже.

Для создания рекомендательной системы все же попробую более продвинутые методы.

# Content-based filtration c TF-IDF

Для определения схожести товаров обработаю отзывы покупателей из колонки reviewText с помощью TF-IDF и получу вектора описаний, которые и будут использованы для определения схожести товаров. В отзывах чаще всего содержится достаточно много информации о товаре.

In [23]:
from typing import List, Dict
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.metrics.pairwise import linear_kernel

In [24]:
data.shape

(10261, 9)

In [25]:
data.reviewText.nunique()

10255

Отзывы скорее не все уникальны, уберу дупликаты

In [26]:
clean_data = data.drop_duplicates(subset = 'reviewText')

In [27]:
tfidf = TfidfVectorizer(analyzer='word', stop_words='english')
tfidf_matrix = tfidf.fit_transform(clean_data['reviewText'])

In [28]:
tfidf_matrix.shape

(10255, 20251)

Схожесть товаров снова буду определять с помощью косинусной меры

In [29]:
cosine_similarities = cosine_similarity(tfidf_matrix)

Создаю словарь с похожими товарами для каждого товара

In [30]:
similarities = {}
for i in range(len(cosine_similarities)):
    similar_indices = cosine_similarities[i].argsort()[:-50:-1]
    similarities[clean_data['asin'].iloc[i]] = [(cosine_similarities[i][x], clean_data.iloc[x]['reviewText'], clean_data.iloc[x]['asin']) for x in similar_indices][1:]

И саму рекомендательную систему

In [31]:
class ContentBasedRecommender:
    def __init__(self, matrix):
        self.matrix_similar = matrix

    def _print_message(self, item, recom_item):
        rec_items = len(recom_item)

        print(f'The {rec_items} recommended items are:')
        for i in range(rec_items):
            print(f"Number {i+1}:")
            print(f"{recom_item[i][1]} with {round(recom_item[i][0], 3)} similarity score")
            print("--------------------")

    def recommend(self, recommendation, verbose = True):
        item = recommendation['item']
        number_items = recommendation['item_number']
        recom_item = self.matrix_similar[item][:number_items]
        if verbose:
            self._print_message(item=item, recom_item=recom_item)
        return recom_item

In [32]:
recommedations = ContentBasedRecommender(similarities)

Посмотрю, что система порекомендует тому, кто написал вот такой отзыв:

In [33]:
clean_data.iloc[2,4]

'Jade rosin gives a extra grippiness to the bow.  It hardly generates any dust, and gives the bow that extra grab.  The only problem is that the effect wears off quickly.  I use it in combination with Motrya Gold.'

In [34]:
idx = 2
recommendation = {
    "item": clean_data['asin'].iloc[idx],
    "item_number": 3
}
print('Recommendations for ', clean_data['asin'].iloc[idx], clean_data['reviewText'].iloc[idx],':\n\n')
recom_item = recommedations.recommend(recommendation)

Recommendations for  B002Q0WT6U Jade rosin gives a extra grippiness to the bow.  It hardly generates any dust, and gives the bow that extra grab.  The only problem is that the effect wears off quickly.  I use it in combination with Motrya Gold. :


The 3 recommended items are:
Number 1:
Very good rosin and does exactly what is described.  A nice finish to the other rosin with specific bows.  No need to try mix rosin when a new bow with the rosin labeled on the bow will reduce the mystery of trying to figure out which bow to used in different temperatures and humidity. with 0.54 similarity score
--------------------
Number 2:
I had some other rosin that was just a mess to apply.  This rosin is great because it's encased in a wooden sleeve that leaves your fingers free of rosin.  It's so easy to apply and doesn't leave any messy residue on my fingers.  I haven't been playing the violin for very long but did know I needed something to rosin the bow easily.  This was exactly what I needed.

В принципе неплохо, если предположить, что люди, купившие канифоль и оставившие отзыв о ней, хотят купить еще один одну канифоль

Посмотрю, что предложит система вот этому покупателю

In [36]:
customer = 'A33H0WC9MI8OVW'
cur_customer = clean_data[clean_data.reviewerID == customer]

Какой товар он оценил высоко?

In [37]:
cur_customer.sort_values('overall').iloc[-1]

reviewerID                                           A33H0WC9MI8OVW
asin                                                     B0002D0CGW
reviewerName                                              Clare Chu
helpful                                                      [1, 1]
reviewText        The .6 MM pick has just the right amount of fl...
overall                                                         5.0
summary                           Just right for Mountain dulcimers
unixReviewTime                                           1340150400
reviewTime                                              06 20, 2012
Name: 856, dtype: object

Медиатор для гитары. Что же ему предложит система?

In [38]:
recommendation = {
    "item": str(cur_customer.iloc[-1].asin),
    "item_number": 5
}
print('Recommendations for ',cur_customer.iloc[-1].asin, cur_customer.iloc[-1]['reviewText'].lower(),'\n\n')
recom_item = recommedations.recommend(recommendation)

Recommendations for  B0002D0CGW the .6 mm pick has just the right amount of flexibility without flapping around like thinner picks. and the dunlop nylon standard is my first choice for mountain dulcimer playing. i know people use yogurt containers, credit cards, bird feathers and even coffee stirrers for picks, but for me, the consistency of the dunlop pick can't be beaten. and the price is right, too! 


The 5 recommended items are:
Number 1:
I like a flexible pick and these at .60mm work just fine.  They are also in a good color so they don't stand out too much. with 0.315 similarity score
--------------------
Number 2:
What else can be said about them, they're good picks. They strum well, they pick well and they look good with 0.308 similarity score
--------------------
Number 3:
these come in multiple thickness and are great because of the thickness and the grip that is on the pick. makes it much easier to hold the pick as your hands get sweaty with 0.292 similarity score
---------

Да, человеку, оставившему отзывы о медиаторе для гитары, предлагается купить еще медиаторы для гитары.

Попробую гибридный подход с помощью библиотеки surprise

In [39]:
import surprise
from surprise.prediction_algorithms.knns import KNNBasic
from surprise import Dataset
from surprise import Reader

In [40]:
data.head(1)

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
4420,AV8MDYLHHTUOY,B000CD3QY2,"Amazon Customer ""eyegor""","[18, 19]",The ability to quickly change the range and se...,4.0,GREAT Wah,1095465600,"09 18, 2004"


User-based Коллаборативная фильтрация 

In [41]:
reader = Reader(rating_scale=(1, max(data.overall)))
data = Dataset.load_from_df(data[['reviewerID','asin','overall']], reader)
trainset = data.build_full_trainset()
algo = KNNBasic()
algo.fit(trainset)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7f163bc7f6d0>

Посмотрю, что порекомендует алгоритм тому покупателю, который оценивал медиатор для гитары

In [42]:
user = 'A33H0WC9MI8OVW'

In [43]:
cur_user = clean_data[clean_data.reviewerID == user].copy()
preds = []
for code in list(clean_data.asin.unique()):
    if not code in set(cur_user.asin):
        pred = algo.predict(str(user), str(code)).est
        desc = clean_data.groupby('asin').first().reviewText[str(code)]
        preds.append([code, desc, pred])

In [44]:
pd.set_option('display.max_colwidth', None)
knn_preds = pd.DataFrame(preds, columns = ['asin', 'reviewText', 'prediction']).sort_values('prediction', ascending = False)
knn_preds.head()

Unnamed: 0,asin,reviewText,prediction
401,B0002H0JZC,"i've used d'addario strings for bass, electric, acoustic, and classic. even though this string was my first time use and it haven't failed me yet!! it's very stable and in tune all time while playing. highly recommand to other ppl",5.0
180,B002M8EEW8,"After online research, I decided to buy (and found locally) this midi pad to use in Reason 4. Although not confirmed (online) as working with Reason, I was happy to see it worked immediately with no problems. My CPU is Windows 7, but it worked on my MSI netbook also. This unit also worked in Ableton Live (as stated online). I would recommend this pad to anyone looking for an affordable midi drum pad that works in many music programs.",5.0
560,B0002H0KG0,We bought this for my daughter's boyfriend; we saved it as the last gift to give on Christmas Day; he wrapped the strap around his neck and never put it back down; I still don't think it has come out of his hands for more than a few minutes; I believe he is very happy with his gift!,5.0
119,B001EL6I8W,"I play a few stringed instruments, and ukulele isn't one of them. But after seeing so many great uke players lately- Jake Shimaukuro, Joe Brown- I had a hankering to try some ukulele playing myself. I wanted a playable instrument, and I didn't feel like spending a lot of money. I've tried $30-40 ukes in stores that were big disappointments- bad intonation, no tone, etc. After reading reviews here and elsewhere I decided to risk a purchase.Result? This is a surprisingly good instrument for less than fifty bucks. Sure, it doesn't have the volume or the rich tone of a soprano with a solid top, but it delivers good tone, excellent intonation, and good playability at a very good price. It may not be a 5-star intrument- but it's a 5-star value.If you're thinking about trying ukulele, but you're not sure you want to buy an expensive instrument right away, I would strongly recommend an LU-21. Even after you buy a better uke, this is a great one to keep on hand to shove into the hands of unsuspecting guests, so that you can play ukulele duets!",5.0
721,B005A09I7Q,"Recently got the itch to try some different picks out just for fun.. these, rubber picks, metal picks, etc. I play guitar.I'm not crazy about these for that purpose, but they aren't made for guitar obviously, so I can't fault them. They seem to be made well made. They are larger than I expected, and rather thick.Obviously these are made for ukes, so my review is probably not helpful to many. If you're like me and were looking for some variety for the guitar I'd suggest the rubber picks instead. They give a similar mellow sound without mudding up the tone, and they are easier and more comfortable to use also. Heck, maybe uke players should try those too?",5.0


- струны для гитары
- контроллер для инструментов
- нечто, что вешают на шею, к сожалению их описания не понятно, что это
- укулель
- медиатор для гитары

В принципе неплохо. А что, если обьединить рекомендациями Content-based?

In [45]:
recommendation = {
    "item": cur_user.sort_values('overall').iloc[-1].asin,
    "item_number": 5
}
recom_item = recommedations.recommend(recommendation, verbose = False)
content_preds = pd.DataFrame(recom_item, columns = ['similarity','reviewText','asin'])

In [46]:
res_recos = pd.concat([knn_preds[['asin','reviewText']].head(), content_preds[['asin','reviewText']]])
res_recos = res_recos.sample(frac=1).reset_index(drop = True)

In [47]:
import sklearn
from sklearn.utils import shuffle

In [48]:
recomendations_for_customer = shuffle(res_recos, random_state = 0)

In [49]:
recomendations_for_customer

Unnamed: 0,asin,reviewText
2,B0002H0JZC,"i've used d'addario strings for bass, electric, acoustic, and classic. even though this string was my first time use and it haven't failed me yet!! it's very stable and in tune all time while playing. highly recommand to other ppl"
8,B003B01QR2,these come in multiple thickness and are great because of the thickness and the grip that is on the pick. makes it much easier to hold the pick as your hands get sweaty
4,B0002H0KG0,We bought this for my daughter's boyfriend; we saved it as the last gift to give on Christmas Day; he wrapped the strap around his neck and never put it back down; I still don't think it has come out of his hands for more than a few minutes; I believe he is very happy with his gift!
9,B005A09I7Q,"Recently got the itch to try some different picks out just for fun.. these, rubber picks, metal picks, etc. I play guitar.I'm not crazy about these for that purpose, but they aren't made for guitar obviously, so I can't fault them. They seem to be made well made. They are larger than I expected, and rather thick.Obviously these are made for ukes, so my review is probably not helpful to many. If you're like me and were looking for some variety for the guitar I'd suggest the rubber picks instead. They give a similar mellow sound without mudding up the tone, and they are easier and more comfortable to use also. Heck, maybe uke players should try those too?"
1,B0002CZSJO,"Durable, affordable, easy to grasp and use."
6,B0002E2XCW,"I almost always play finger style without a pick, but on the occasion I need a pick I like these.Easier to strum sounds good to me.If you're doing a lot of picking out individual strings, you'll probably want something thicker, but these make it really easy to strum chords and sound good, plus pick out a few strings here and there."
7,B001EL6I8W,"I play a few stringed instruments, and ukulele isn't one of them. But after seeing so many great uke players lately- Jake Shimaukuro, Joe Brown- I had a hankering to try some ukulele playing myself. I wanted a playable instrument, and I didn't feel like spending a lot of money. I've tried $30-40 ukes in stores that were big disappointments- bad intonation, no tone, etc. After reading reviews here and elsewhere I decided to risk a purchase.Result? This is a surprisingly good instrument for less than fifty bucks. Sure, it doesn't have the volume or the rich tone of a soprano with a solid top, but it delivers good tone, excellent intonation, and good playability at a very good price. It may not be a 5-star intrument- but it's a 5-star value.If you're thinking about trying ukulele, but you're not sure you want to buy an expensive instrument right away, I would strongly recommend an LU-21. Even after you buy a better uke, this is a great one to keep on hand to shove into the hands of unsuspecting guests, so that you can play ukulele duets!"
3,B0002D0CGW,I like a flexible pick and these at .60mm work just fine. They are also in a good color so they don't stand out too much.
0,B001PGXHX0,"What else can be said about them, they're good picks. They strum well, they pick well and they look good"
5,B002M8EEW8,"After online research, I decided to buy (and found locally) this midi pad to use in Reason 4. Although not confirmed (online) as working with Reason, I was happy to see it worked immediately with no problems. My CPU is Windows 7, but it worked on my MSI netbook also. This unit also worked in Ableton Live (as stated online). I would recommend this pad to anyone looking for an affordable midi drum pad that works in many music programs."


Гибридный подход дает неплохой результат. Предлагаются товары, похожие на те, которые он уже покупал, а также что еще может понадобиться вместе с ними.