In [2]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

In [3]:
# Load the book-rating dataset and preview its first rows.
rating_df = pd.read_csv(filepath_or_buffer='userstar.csv')
rating_df.head(n=5)

Unnamed: 0,BOOKNAME,CONTENT,ISBN,USER,USERSTAR
0,「氣內臟」自癒按摩法：每天按摩脾．肝．腎30秒，美肌、消除便秘、好眠、減肥，天天都有好氣色！,深入淺出，圖片清晰介唔按壓身體及伸展。\n按照個人體質、情志、節氣等，以吃，嗅香芬、調息、伸...,9789865683498,啟迪,5.0
1,唐鳳：我所看待的自由與未來,一開始是對唐鳳好奇，想更了解她，讀完之後真正體認到她是個傳奇人物，同時她的家庭給了她很大的自...,9789575036942,Miley Guan,5.0
2,老夫老妻重返青春（1）,看了很暖心的漫畫，會覺得要好好過每一天,9789865241810,蘇蘇,0.0
3,祕密瑜伽士的日常,直指心性的好書，書中道理發人省思,9789864779079,kbboss7416,5.0
4,花開千年（01）,好唯美的畫風。喜歡故事開頭引用的一段《佛經》：「彼岸花，開一千年，落一千年，花葉永不想見。情...,9789571068121,Annie,5.0


In [4]:
# Report, per column, whether it contains at least one missing value.
rating_df.isnull().any(axis=0)

BOOKNAME    False
CONTENT      True
ISBN        False
USER         True
USERSTAR     True
dtype: bool

In [5]:
# Drop every row that has at least one missing value, then re-check that
# no column contains missing values any more.
rating_df = rating_df.dropna(axis='index', how='any')
rating_df.isnull().any(axis=0)

BOOKNAME    False
CONTENT     False
ISBN        False
USER        False
USERSTAR    False
dtype: bool

In [6]:
# Remove fully duplicated rows (the first occurrence is kept) and report
# the number of duplicates before and after the clean-up.
duplicates = rating_df.duplicated(keep='first')
duplicate_count = duplicates.sum()

if duplicate_count > 0:
    print('> {} duplicates'.format(duplicate_count))
    rating_df = rating_df.loc[~duplicates]

print('> {} duplicates'.format(rating_df.duplicated(keep='first').sum()))

> 72 duplicates
> 0 duplicates


In [7]:
# Show the (rows, columns) size of the ratings frame after cleaning.
shape_after_cleaning = rating_df.shape
print('Rating Dataframe shape : ', shape_after_cleaning)

Rating Dataframe shape :  (100091, 5)


In [8]:
# Discard the free-text CONTENT column; none of the visible cells below reads it.
rating_df = rating_df.drop(columns=['CONTENT'])

In [9]:
# Cast ISBN to string, then remove the rows whose ISBN is the literal string '0'.
rating_df['ISBN'] = rating_df['ISBN'].astype(str)
zero_isbn_labels = rating_df.index[rating_df['ISBN'] == '0']
rating_df = rating_df.drop(index=zero_isbn_labels)

In [10]:
# Give every reviewer an integer id: the categorical code of the USER value.
import statsmodels.api as sm  # NOTE(review): `sm` is not referenced in the visible cells -- confirm before removing
user_categories = pd.Categorical(rating_df['USER'])
rating_df['user_id'] = user_categories.codes
rating_df.head()

Unnamed: 0,BOOKNAME,ISBN,USER,USERSTAR,user_id
0,「氣內臟」自癒按摩法：每天按摩脾．肝．腎30秒，美肌、消除便秘、好眠、減肥，天天都有好氣色！,9789865683498,啟迪,5.0,8025
1,唐鳳：我所看待的自由與未來,9789575036942,Miley Guan,5.0,937
2,老夫老妻重返青春（1）,9789865241810,蘇蘇,0.0,8685
3,祕密瑜伽士的日常,9789864779079,kbboss7416,5.0,4491
4,花開千年（01）,9789571068121,Annie,5.0,200


In [11]:
# Pivot the ratings into a book x user table: one row per ISBN, one column
# per user_id, cells hold USERSTAR. pivot_table's default aggregation
# (mean) applies when a user rated the same ISBN more than once; pairs
# without a rating are NaN.
book_features_df = rating_df.pivot_table(
    values='USERSTAR',
    index='ISBN',
    columns='user_id',
)
book_features_df.head()

user_id,0,1,2,3,4,5,6,7,8,9,...,8901,8902,8903,8904,8905,8906,8907,8908,8909,8910
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1905302050014,,,,,,,,,,,...,,,,,,,,,,
1907082050017,,,,,,,,,,,...,,,,,,,,,,
4710415386243,,,,,,,,,,,...,,,,,,,,,,
4710614360792,,,,,,,,,,,...,,,,,,,,,,
4710614360808,,,,,,,,,,,...,,,,,,,,,,


In [12]:
# Number of books each user has NOT rated (NaN cells per user column).
missing_value = book_features_df.isnull().sum(axis=0)
missing_value

user_id
0       47170
1       47161
2       47109
3       47176
4       47158
        ...  
8906    47137
8907    47176
8908    47175
8909    47175
8910    47169
Length: 8911, dtype: int64

In [13]:
# Measure how sparse the pivot table is: the share of its cells that are NaN.
rows, cols = book_features_df.shape
total_elements = rows * cols

# Only NaN cells are counted here (despite the variable name, zeros are not).
count_empty_or_zero_cells = book_features_df.isna().sum().sum()

sparsity_of_matrix = count_empty_or_zero_cells / total_elements

for label, value in (
    ('Total Empty cells are : ', count_empty_or_zero_cells),
    ('Total cells in Matrix are : ', total_elements),
    ('Sparsity of Matrix are : ', sparsity_of_matrix),
):
    print(label, value)

Total Empty cells are :  420303254
Total cells in Matrix are :  420403158
Sparsity of Matrix are :  0.9997623614425846


In [14]:
# Replace missing ratings with 0 (done in place on the pivot table).
# NOTE(review): the data preview contains USERSTAR values of 0.0, so after
# this step an unrated cell cannot be told apart from a 0-star rating.
book_features_df.fillna(value=0, inplace=True)

In [15]:
# Preview the pivot table as a plain NumPy array.
book_features_df.to_numpy()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [16]:
# Convert the dense book x user table into a SciPy CSR sparse matrix.
# csr_matrix is already imported in the notebook's first cell, so it is
# not re-imported here.
book_features_df_matrix = csr_matrix(book_features_df.values)

In [17]:
# Build the cosine-similarity model (k-nearest neighbours, brute-force search)
# and fit it on the sparse book x user matrix.
# NearestNeighbors is already imported in the notebook's first cell, so it
# is not re-imported here.
nearest_neighbor_model = NearestNeighbors(metric='cosine', algorithm='brute')
nearest_neighbor_model.fit(book_features_df_matrix)

NearestNeighbors(algorithm='brute', metric='cosine')

In [18]:
# Pick one book at random to demonstrate the recommender.
# A fixed seed on a private generator keeps the selection -- and every
# output derived from it -- reproducible across "Restart & Run All",
# without touching NumPy's global random state.
RANDOM_SEED = 42
total_no_of_books = book_features_df.shape[0]
print('Total books in our pivot table : ', total_no_of_books)
random_book_index = np.random.RandomState(RANDOM_SEED).choice(total_no_of_books)
print('Random book Index : ', random_book_index)

Total books in our pivot table :  47178
Random book Index :  30497


In [19]:
# Show the rating row (one value per user_id) of the randomly chosen book.
book_features_df.iloc[random_book_index, :]

user_id
0       0.0
1       0.0
2       0.0
3       0.0
4       0.0
       ... 
8906    0.0
8907    0.0
8908    0.0
8909    0.0
8910    0.0
Name: 9789863564935, Length: 8911, dtype: float64

In [20]:
# Lay out the chosen book's rating row as a 2-D array with a single row
# (1 x number of users), the form passed to kneighbors in the next cell.
one_dimensional_representation_of_book_vector = (
    book_features_df.iloc[random_book_index].to_numpy()[np.newaxis, :]
)
one_dimensional_representation_of_book_vector

array([[0., 0., 0., ..., 0., 0., 0.]])

In [21]:
# Query the 6 nearest books (by cosine distance) for the chosen book and
# show the raw distances and row positions returned by the model.
distances, indices = nearest_neighbor_model.kneighbors(
    X=one_dimensional_representation_of_book_vector,
    n_neighbors=6,
    return_distance=True,
)
print('Distance :', distances)
print('Indices :', indices)

Distance : [[0. 0. 0. 1. 1. 1.]]
Indices : [[30497 32932 31169 31453 31454 31452]]


In [22]:
# Print the neighbours of the randomly chosen book as a ranked list.
indices = indices.flatten()
distances = distances.flatten()

print('Recommendations for {0}:\n'.format(book_features_df.index[random_book_index]))

# The query book is part of the fitted data, so it normally comes back as
# one of its own neighbours -- but not necessarily in first position: other
# books can tie with it at the same distance. Remove it by row index
# instead of blindly skipping position 0, then keep as many entries as the
# positional skip would have listed (all returned neighbours but one).
other_books = [
    (book_idx, distance)
    for book_idx, distance in zip(indices, distances)
    if book_idx != random_book_index
][:max(len(indices) - 1, 0)]

for rank, (book_idx, distance) in enumerate(other_books, start=1):
    print('{0}: {1}, with distance of {2}'.format(rank, book_features_df.index[book_idx], distance))

Recommendations for 9789863564935:

1: 9789864733156, with distance of 0.0
2: 9789863666646, with distance of 0.0
3: 9789863844679, with distance of 1.0
4: 9789863844693, with distance of 1.0
5: 9789863844662, with distance of 1.0


In [23]:
# Build, for every book, the list of its most similar books
# (ISBN -> list of N_SIMILAR_BOOKS ISBNs).
N_SIMILAR_BOOKS = 3

# One batched query over the sparse matrix the model was fitted on replaces
# one kneighbors call per book on a dense row. One extra neighbour is
# requested because every book is itself part of the fitted data and
# normally appears in its own result.
neighbor_distances, neighbor_indices = nearest_neighbor_model.kneighbors(
    book_features_df_matrix, n_neighbors=N_SIMILAR_BOOKS + 1
)

my_dict = {}
for book_index, neighbor_row in enumerate(neighbor_indices):
    original_book = book_features_df.index[book_index]
    # Exclude the book itself by row index rather than by position: when
    # several books tie at the same distance, the book is not guaranteed to
    # be returned first, and skipping position 0 could discard a genuine
    # neighbour while keeping the book as its own recommendation.
    similar_books = [
        book_features_df.index[neighbor_idx]
        for neighbor_idx in neighbor_row
        if neighbor_idx != book_index
    ][:N_SIMILAR_BOOKS]
    my_dict[original_book] = similar_books

In [24]:
# Tabulate the similar-book lists (one row per book, one column per rank)
# and preview the first rows.
recommended_book_df = pd.DataFrame.from_dict(my_dict, orient='index')
recommended_book_df.columns = ['1st_Similar_book', '2nd_Similar_book', '3rd_Similar_book']
recommended_book_df.head()

Unnamed: 0,1st_Similar_book,2nd_Similar_book,3rd_Similar_book
1905302050014,9789571050997,9789573330158,9789571050775
1907082050017,9789571050997,9789573330158,9789571050775
4710415386243,9789869455220,9789578039407,9789865882471
4710614360792,9789571002798,9789571002767,9789571002736
4710614360808,9789571002798,9789571002767,9789571002736


In [25]:
# Look up and print the recommendations stored for one book.
book_name = "9789862134467"  # an ISBN (the table's index), despite the variable name
result = recommended_book_df.loc[book_name]
print('I have book recommendatation for you: \n')
for books in result.tolist():
    print(books)

I have book recommendatation for you: 

9780060731328
9789868459236
9789863202899
