## ⭐ Hybrid Recommendation
* 자료 출처 : https://thecleverprogrammer.com/2023/06/05/hybrid-recommendation-system-using-python/
* Explicit data 사용 (Rating 등의 사용자 선호도가 명시적으로 드러난 자료)

In [None]:
import pandas as pd
import os

## Data check

In [None]:
# Switch the working directory to the project folder so the relative
# "./data/..." path used when loading the CSV resolves.
# NOTE(review): the path looks like a Colab Google Drive mount — adjust it
# when running elsewhere.
os.chdir('/content/drive/MyDrive/DACON_국민대AI/study/mysong/코드 예시')

In [None]:
# Load the product/rating table; the path is relative to the current
# working directory.
data = pd.read_csv("./data/fashion_products.csv")
# Preview the first rows to check the columns.
print(data.head())

   User ID  Product ID Product Name   Brand         Category  Price    Rating  \
0       19           1        Dress  Adidas    Men's Fashion     40  1.043159   
1       97           2        Shoes     H&M  Women's Fashion     82  4.026416   
2       25           3        Dress  Adidas  Women's Fashion     44  3.337938   
3       57           4        Shoes    Zara    Men's Fashion     23  1.049523   
4       79           5      T-shirt  Adidas    Men's Fashion     79  4.302773   

    Color Size  
0   Black   XL  
1   Black    L  
2  Yellow   XL  
3   White    S  
4   Black    M  


In [None]:
# Display the DataFrame as the cell's output.
data

Unnamed: 0,User ID,Product ID,Product Name,Brand,Category,Price,Rating,Color,Size
0,19,1,Dress,Adidas,Men's Fashion,40,1.043159,Black,XL
1,97,2,Shoes,H&M,Women's Fashion,82,4.026416,Black,L
2,25,3,Dress,Adidas,Women's Fashion,44,3.337938,Yellow,XL
3,57,4,Shoes,Zara,Men's Fashion,23,1.049523,White,S
4,79,5,T-shirt,Adidas,Men's Fashion,79,4.302773,Black,M
...,...,...,...,...,...,...,...,...,...
995,20,996,Shoes,Zara,Women's Fashion,55,1.620081,Black,M
996,42,997,Sweater,Nike,Kids' Fashion,13,1.544464,Green,L
997,9,998,Sweater,Zara,Men's Fashion,47,3.961913,White,L
998,8,999,T-shirt,Zara,Women's Fashion,68,3.089722,Blue,S


## First Approach : Content-Based Filtering

In [None]:
# pip install surprise

Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise (from surprise)
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 772.0/772.0 kB 9.7 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... done
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=3163338 sha256=582a7cf2de84ce296f7cbf47fb5f5af474e451a08f87c5c14701a1138e180fe8
  Stored in directory: /root/.cache/pip/wheels/a5/ca/a8/4e28def53797fdc4363ca4af740db15a9c2f1595ebc51fb445
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.3 surprise-0.1


In [None]:
from surprise import Dataset, Reader, SVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [None]:
# Keep only the descriptive product columns. `.copy()` makes content_df an
# independent DataFrame, so adding the 'Content' column below does not emit
# pandas' SettingWithCopyWarning.
content_df = data[['Product ID', 'Product Name', 'Brand', 'Category', 'Color', 'Size']].copy()
# Join every non-null field of a row into one space-separated string; this
# text is what gets vectorized for content-based similarity.
content_df['Content'] = content_df.apply(lambda row: ' '.join(row.dropna().astype(str)), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  content_df['Content'] = content_df.apply(lambda row: ' '.join(row.dropna().astype(str)), axis=1)


In [None]:
# Display content_df, including the joined 'Content' column.
content_df

Unnamed: 0,Product ID,Product Name,Brand,Category,Color,Size,Content
0,1,Dress,Adidas,Men's Fashion,Black,XL,1 Dress Adidas Men's Fashion Black XL
1,2,Shoes,H&M,Women's Fashion,Black,L,2 Shoes H&M Women's Fashion Black L
2,3,Dress,Adidas,Women's Fashion,Yellow,XL,3 Dress Adidas Women's Fashion Yellow XL
3,4,Shoes,Zara,Men's Fashion,White,S,4 Shoes Zara Men's Fashion White S
4,5,T-shirt,Adidas,Men's Fashion,Black,M,5 T-shirt Adidas Men's Fashion Black M
...,...,...,...,...,...,...,...
995,996,Shoes,Zara,Women's Fashion,Black,M,996 Shoes Zara Women's Fashion Black M
996,997,Sweater,Nike,Kids' Fashion,Green,L,997 Sweater Nike Kids' Fashion Green L
997,998,Sweater,Zara,Men's Fashion,White,L,998 Sweater Zara Men's Fashion White L
998,999,T-shirt,Zara,Women's Fashion,Blue,S,999 T-shirt Zara Women's Fashion Blue S


In [None]:
# Use TF-IDF vectorizer to convert content into a matrix of TF-IDF features
tfidf_vectorizer = TfidfVectorizer()
content_matrix = tfidf_vectorizer.fit_transform(content_df['Content'])

# Pairwise dot products between the TF-IDF rows give an item-by-item
# similarity matrix (row i / column j compares product rows i and j).
content_similarity = linear_kernel(content_matrix, content_matrix)

# Ratings are declared to lie on a 1-5 scale for the surprise library.
reader = Reader(rating_scale=(1, 5))
# NOTE: this rebinds `data` from the pandas DataFrame to a surprise Dataset
# built from the (user, item, rating) columns. Re-running this cell without
# reloading the CSV will presumably fail, because `data[[...]]` would then be
# applied to the Dataset instead of the DataFrame.
data = Dataset.load_from_df(data[['User ID',
                                  'Product ID',
                                  'Rating']], reader)

In [None]:
# Number of feature names (terms) learned by the fitted TF-IDF vectorizer.
len(tfidf_vectorizer.get_feature_names_out())

1011

In [None]:
# 1000 is the number of data rows, 1011 is the number of TF-IDF feature names
print(content_matrix.shape)

(1000, 1011)


In [None]:
# Show the pairwise similarity matrix computed with linear_kernel.
print(content_similarity)

[[1.         0.35544216 0.62850886 ... 0.10749189 0.01943933 0.15338818]
 [0.35544216 1.         0.21382974 ... 0.0252009  0.13830844 0.29824655]
 [0.62850886 0.21382974 1.         ... 0.01962514 0.1077074  0.24379159]
 ...
 [0.10749189 0.0252009  0.01962514 ... 1.         0.09788722 0.10187094]
 [0.01943933 0.13830844 0.1077074  ... 0.09788722 1.         0.07011138]
 [0.15338818 0.29824655 0.24379159 ... 0.10187094 0.07011138 1.        ]]


In [None]:
def get_content_based_recommendations(product_id, top_n):
    """Return the products whose content is most similar to ``product_id``.

    Uses the module-level ``content_df`` and ``content_similarity``; row ``i``
    of ``content_similarity`` is taken to describe row ``i`` of ``content_df``.

    Args:
        product_id: Value to look up in the 'Product ID' column.
        top_n: Maximum number of recommendations to return.

    Returns:
        numpy array of up to ``top_n`` 'Product ID' values, ordered from most
        to least similar, never containing the query product's own row.

    Raises:
        IndexError: If ``product_id`` is not present in ``content_df``.
    """
    # Work with row *positions*, not index labels: the similarity matrix is
    # positional, so this stays correct even if content_df's index is not
    # the default 0..n-1 range.
    matches = (content_df['Product ID'] == product_id).to_numpy().nonzero()[0]
    if matches.size == 0:
        raise IndexError(f"Product ID {product_id!r} not found in content_df")
    position = matches[0]

    similarity_scores = content_similarity[position]
    ranked_positions = similarity_scores.argsort()[::-1]
    # Drop the query row explicitly instead of assuming it sorts first; a tie
    # at the top score could otherwise leave the product recommending itself.
    ranked_positions = ranked_positions[ranked_positions != position][:top_n]

    recommendations = content_df['Product ID'].iloc[ranked_positions].values
    return recommendations

## Second Approach : Collaborative Filtering

In [None]:
# Collaborative-filtering model: SVD with default arguments.
algo = SVD()
# Train on every available rating (no hold-out split is made here).
trainset = data.build_full_trainset()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7ad283f1b9d0>

In [None]:
def get_collaborative_filtering_recommendations(user_id, top_n):
    """Return the items the trained model predicts ``user_id`` will rate highest.

    Uses the module-level ``trainset`` and fitted ``algo``. Only items the
    user has not rated in ``trainset`` are considered.

    Args:
        user_id: Raw user id as it appears in the ratings data.
        top_n: Maximum number of recommendations to return.

    Returns:
        List of up to ``top_n`` raw item ids, ordered by descending estimated
        rating. Empty if the user is not part of ``trainset``.
    """
    try:
        inner_uid = trainset.to_inner_uid(user_id)
    except ValueError:
        # Unknown user: there is nothing to predict for, so return no
        # recommendations (same outcome as filtering an anti-testset that
        # contains no rows for this user).
        return []

    # Build the "unseen items" test rows for this one user only, rather than
    # materializing the anti-testset of every user and throwing most of it away.
    rated_items = {inner_iid for inner_iid, _ in trainset.ur[inner_uid]}
    fill = trainset.global_mean  # placeholder "true" rating, as build_anti_testset uses
    testset = [
        (user_id, trainset.to_raw_iid(inner_iid), fill)
        for inner_iid in trainset.all_items()
        if inner_iid not in rated_items
    ]

    predictions = algo.test(testset)
    predictions.sort(key=lambda prediction: prediction.est, reverse=True)
    recommendations = [prediction.iid for prediction in predictions[:top_n]]
    return recommendations

## And Finally, The Hybrid Approach

In [None]:
def get_hybrid_recommendations(user_id, product_id, top_n):
    """Blend content-based and collaborative-filtering recommendations.

    The two ranked lists are interleaved (content-based first at each rank),
    duplicates are dropped, and the first ``top_n`` ids are returned, so both
    approaches contribute and the result order is deterministic.

    Args:
        user_id: User passed to the collaborative-filtering recommender.
        product_id: Seed product passed to the content-based recommender.
        top_n: Number of candidates requested from each recommender and the
            maximum number of ids returned.

    Returns:
        List of at most ``top_n`` distinct product ids.
    """
    # Local import: itertools is not imported at the top of this file.
    from itertools import zip_longest

    # Convert both results to plain lists first. The content-based result is
    # a numpy array, and `array + list` performs element-wise addition (it
    # produced sums of product IDs) instead of concatenation.
    content_based_recommendations = list(
        get_content_based_recommendations(product_id, top_n)
    )
    collaborative_filtering_recommendations = list(
        get_collaborative_filtering_recommendations(user_id, top_n)
    )

    missing = object()  # padding marker for when the lists differ in length
    seen = set()
    hybrid_recommendations = []
    for same_rank in zip_longest(content_based_recommendations,
                                 collaborative_filtering_recommendations,
                                 fillvalue=missing):
        for candidate in same_rank:
            if candidate is missing or candidate in seen:
                continue
            seen.add(candidate)
            hybrid_recommendations.append(candidate)

    return hybrid_recommendations[:top_n]

In [None]:
# Example run: hybrid recommendations for one user and one seed product.
user_id = 6
product_id = 11
top_n = 10
recommendations = get_hybrid_recommendations(user_id, product_id, top_n)

print(f"Hybrid Recommendations for User {user_id} based on Product {product_id}:")

# Print each recommendation exactly once, numbered from 1 (the duplicated
# print statement made every entry appear twice).
for rank, recommendation in enumerate(recommendations, start=1):
    print(f"{rank}. Product ID: {recommendation}")

Hybrid Recommendations for User 6 based on Product 11:
1. Product ID: 1121
1. Product ID: 1121
2. Product ID: 578
2. Product ID: 578
3. Product ID: 867
3. Product ID: 867
4. Product ID: 647
4. Product ID: 647
5. Product ID: 1223
5. Product ID: 1223
6. Product ID: 1810
6. Product ID: 1810
7. Product ID: 980
7. Product ID: 980
8. Product ID: 1780
8. Product ID: 1780
9. Product ID: 792
9. Product ID: 792
10. Product ID: 1564
10. Product ID: 1564


## Saving python library info
* requirements.txt 파일 생성

In [None]:
# !pip freeze > requirements.txt