In [71]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
import ast
import pickle
import sqlite3

class UserProfiling:
    def __init__(self, location_of_df='/home/gm/Desktop/ExcelR_Projects/book_recommendation/model_training/Model_building_Hybrid/TF-IDF/bart_final_preprocess.csv'):
        self.df = pd.read_csv(location_of_df)

    
    def save_pickle(self, data, filename):
        with open(filename, 'wb') as f:
            pickle.dump(data, f)

    def load_pickle(self, filename):
        with open(filename, 'rb') as f:
            return pickle.load(f)

    def get_item_profile(self):
        if os.path.exists('item_profiles.pkl'):
            return self.load_pickle('item_profiles.pkl')

    def get_item_profile(self):
        # Ensure the use of the class attribute self.df
        df = self.df.copy()
        
        df = df[~df.book_id.duplicated()]
        genres = df['genres'].apply(ast.literal_eval)

        # Convert genres to binary feature vectors
        mlb = MultiLabelBinarizer()
        genre_matrix = mlb.fit_transform(genres)
        genre_features = pd.DataFrame(genre_matrix, columns=mlb.classes_, index=df['book_id'])

        # Reduce dimensions with PCA
        pca = PCA(n_components=100)  # Adjust the number of components as needed
        reduced_genre_features = pca.fit_transform(genre_features)

        # Convert back to DataFrame for easy handling
        reduced_genre_features = pd.DataFrame(reduced_genre_features, index=df['book_id'])
        # print(reduced_genre_features)

        return reduced_genre_features

    def get_reviewer_profile(self):
        # import sqlite3
        # import ast
        connection=sqlite3.connect('/home/gm/Desktop/ExcelR_Projects/book_recommendation/preprocessing_cleaning/FINAL_DATA/DATASETS/book_reviews.db')
        book_reviews=pd.read_sql_query("SELECT * FROM book_reviews", connection)

        book_reviews=book_reviews[~book_reviews['review_rating'].isna()]
        book_reviews.reset_index(inplace=True,drop=True)
        book_reviews['rating_of_user']=book_reviews['review_rating'].apply(lambda x: x.split()[1])
        # user_item_interaction=book_reviews.groupby('reviewer_id')[['book_id', 'rating_of_user']].apply(lambda x: x.reset_index(drop=True))
        a=book_reviews[['book_id', 'reviewer_id', 'rating_of_user']]
        df_evaluated = a.map(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
        # print(df_evaluated)
        return df_evaluated

    def filtered_data(self):
        item_profiles = self.get_item_profile()
        df_evaluated = self.get_reviewer_profile()
        
        # Find symmetric difference between book_id indices
        total_index = set(item_profiles.index) ^ set(df_evaluated.book_id)
        symmetric_difference_list = list(total_index)

        # Drop rows with indices present only in one of the lists
        item_profiles_filtered = item_profiles[~item_profiles.index.isin(symmetric_difference_list)]
        df_evaluated_filtered = df_evaluated[~df_evaluated['book_id'].isin(symmetric_difference_list)]
        
        return df_evaluated_filtered, item_profiles_filtered

    def get_user_profiles(self):
        df_evaluated_filtered, item_profiles_filtered = self.filtered_data()
        print(len(df_evaluated_filtered),len(item_profiles_filtered))
        user_item_interactions = {
            reviewer_id: dict(zip(group['book_id'], group['rating_of_user']))
            for reviewer_id, group in df_evaluated_filtered.groupby('reviewer_id')
        }

        user_profiles = {}
        for user, interactions in user_item_interactions.items():
            profile = np.zeros(item_profiles_filtered.shape[1])
            total_weight = 0

            for book_id, rating in interactions.items():
                if book_id in item_profiles_filtered.index:
                    profile += rating * item_profiles_filtered.loc[book_id]
                    total_weight += rating

            if total_weight > 0:
                user_profiles[user] = profile / total_weight
            else:
                user_profiles[user] = profile

        user_profiles = pd.DataFrame(user_profiles).T
        print(len(item_profiles_filtered),len(user_profiles))
        return user_profiles, item_profiles_filtered

    def recommend_books(self, user_id, top_n=10):
        user_profiles, item_profiles = self.get_user_profiles()
        # print(user_profiles)
        if user_id not in user_profiles.index:
            return [], []

        user_profile = user_profiles.loc[user_id].values.reshape(1, -1)
        similarities = cosine_similarity(user_profile, item_profiles)
        similar_items = np.argsort(similarities[0])[::-1][:top_n]
        recommended_book_ids = item_profiles.index[similar_items]
        similarity_scores = similarities[0][similar_items]
        
        return recommended_book_ids, similarity_scores
if __name__ == '__main__':
        
    # Example usage:
    profiling = UserProfiling()
    recommendations, scores = profiling.recommend_books(user_id=1, top_n=10)
    print(recommendations)
    print(scores)


KeyboardInterrupt: 

In [5]:
pd.read_csv()

In [65]:
book_reviews

Unnamed: 0,book_id,reviewer_id,reviewer_name,likes_on_review,review_content,reviewer_followers,reviewer_total_reviews,review_date,review_rating
0,57094644,114413220,Sofia,582 likes,Just when you thought he was done writing book...,"7,961 followers",234 reviews,"February 24, 2021",
1,57094644,48328025,megs_bookrack,329 likes,Would you be shocked if I told you this was th...,12.1k followers,"1,802 reviews","March 17, 2024",Rating 5 out of 5
2,57094644,6728955,Mariah,232 likes,So you're telling me Anaisn'ta Daughter of Pos...,490 followers,"1,263 reviews","September 5, 2022",Rating 3 out of 5
3,57094644,101179363,ale (semi hiatus) ‧ ₊˚୨ ♡ ୧ ₊˚,218 likes,"*inserts vine ""anything for you, beyoncé""*upda...","2,709 followers",458 reviews,"June 3, 2021",
4,2948832,48727754,chan ☆,174 likes,i was excited about this one since it was so w...,55.1k followers,"1,139 reviews","June 9, 2021",Rating 2 out of 5
...,...,...,...,...,...,...,...,...,...
70954,6444962,5953049,Raymond Fraser,,Great book!,3 followers,20 reviews,"August 3, 2011",Rating 5 out of 5
70955,18700197,52416699,jodie,9 likes,"BEFORE YOU JUDGE ME, HEAR ME OUT.My sister bou...",3 followers,22 reviews,"January 2, 2017",Rating 1 out of 5
70956,18700197,15172424,Hol,2 likes,I got this along with Trisha's first book. Bot...,12 followers,135 reviews,"January 4, 2016",Rating 2 out of 5
70957,18700197,111041676,Amy Elizabeth,2 likes,Had my tooth out and fancied some trashy readi...,,26 reviews,"November 8, 2022",Rating 1 out of 5


In [67]:
book_reviews=book_reviews[['book_id', 'reviewer_id', 'review_rating']]

In [68]:
book_reviews.to_csv('book_reveiews_final_after_model.csv', index=False)

In [59]:
# connection = sqlite3.connect('/home/gm/Desktop/ExcelR_Projects/book_recommendation/preprocessing_cleaning/FINAL_DATA/DATASETS/book_reviews.db')
# book_reviews = pd.read_sql_query("SELECT * FROM book_reviews", connection)
        

In [50]:
book_reviews.reviewer_id.value_counts().sort_values(ascending=True)

reviewer_id
149388758      1
131091         1
1299194        1
2265296        1
10275797       1
            ... 
614778       336
60866073     359
32879029     408
19283284     414
4622890      744
Name: count, Length: 25509, dtype: int64

In [8]:
book_reviews.groupby('reviewer_id')[['book_id', 'review_rating']].apply(lambda x: x.reset_index(drop=True))

Unnamed: 0_level_0,Unnamed: 1_level_0,book_id,review_rating
reviewer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0,35167685,Rating 5 out of 5
1,1,2255,Rating 5 out of 5
1,2,2612,Rating 4 out of 5
1,3,18077903,Rating 5 out of 5
1,4,34536488,Rating 5 out of 5
...,...,...,...
99920177,4,691001,Rating 5 out of 5
999233,0,8214792,Rating 3 out of 5
999233,1,19448,Rating 4 out of 5
99924305,0,185368,Rating 2 out of 5


In [11]:
df_evaluated_filtered

Unnamed: 0,book_id,reviewer_id,rating_of_user
0,57094644,48328025,5
1,57094644,6728955,3
2,2948832,48727754,2
3,2948832,18104163,4
4,2948832,886675,5
...,...,...,...
68873,6444962,5953049,5
68874,18700197,52416699,1
68875,18700197,15172424,2
68876,18700197,111041676,1


In [13]:
item_profiles_filtered

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
57094644,-0.951469,0.636601,-0.092245,-0.775985,0.039121,0.808302,0.225985,0.203049,-0.305197,0.287027,...,-0.014700,-0.022057,-0.048261,-0.079948,0.143144,0.065621,-0.020263,0.065362,-0.017268,-0.037676
895185,-0.048712,-1.144920,1.159710,-0.133889,0.669200,-0.178034,-0.280233,0.172990,-0.122358,0.189385,...,0.029990,-0.074067,-0.084619,0.047772,0.116717,-0.038605,-0.011790,0.127095,0.044652,-0.008672
2948832,-0.239808,-0.232418,-0.333035,0.901323,1.191967,-0.486329,0.373732,-0.318416,-0.161827,-0.164200,...,0.052009,0.021427,-0.270274,-0.228073,0.066405,-0.000592,-0.011561,0.012284,0.089107,-0.052239
154126,1.515637,0.274146,-0.266325,-0.358795,-0.018257,0.076199,-0.327595,-0.459095,0.076167,0.282942,...,-0.032653,-0.167610,-0.027245,-0.068520,0.316975,0.199025,-0.054132,0.005211,-0.482779,-0.087616
298663,-0.107443,-1.077447,0.852223,-0.494246,-0.183834,-0.245674,-0.636924,-0.453985,-0.202583,-0.294443,...,0.058394,-0.033172,-0.037040,0.028299,0.101481,-0.031079,0.034432,-0.004048,0.009412,-0.014644
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
183057601,-0.092692,0.954389,0.051946,0.267454,-0.057373,-0.707010,-0.149414,-0.095568,-0.022986,-0.006605,...,0.064613,0.066750,0.047234,0.114736,-0.014643,-0.062483,-0.076867,0.059304,0.004386,-0.075012
337100,-0.678615,0.380408,-0.075983,-0.938803,-0.253898,-0.013350,1.096187,0.069523,-0.251863,0.156638,...,-0.033012,-0.029049,0.006911,-0.118682,0.043722,0.034447,-0.001995,0.041936,0.038740,-0.028366
9817,-0.190668,-0.567651,-0.637170,1.088111,-0.767789,-0.115578,-0.046936,0.710100,0.192340,0.415556,...,-0.029178,-0.005614,-0.002368,0.016563,-0.055447,-0.004890,0.037816,-0.024252,-0.009082,0.032751
32940867,-0.283824,-0.545649,1.279824,0.593246,0.027721,-0.276141,0.194318,-0.583094,0.405741,0.156758,...,-0.022407,0.057872,0.083718,-0.008884,-0.047524,-0.051508,0.040133,0.014333,-0.054594,-0.039761


In [14]:
type(df_evaluated_filtered['rating_of_user'][0])

numpy.int64

In [15]:
df_evaluated_filtered

Unnamed: 0,book_id,reviewer_id,rating_of_user
0,57094644,48328025,5
1,57094644,6728955,3
2,2948832,48727754,2
3,2948832,18104163,4
4,2948832,886675,5
...,...,...,...
68873,6444962,5953049,5
68874,18700197,52416699,1
68875,18700197,15172424,2
68876,18700197,111041676,1


In [16]:
set(list(df_evaluated_filtered.book_id))^ set(list(item_profiles_filtered.index))

set()

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
import ast
import pickle
import sqlite3
import os

class UserProfiling:
    def __init__(self, location_of_df='/home/gm/Desktop/ExcelR_Projects/book_recommendation/model_training/Model_building_Hybrid/TF-IDF/bart_final_preprocess.csv'):
        self.df = pd.read_csv(location_of_df)

    def save_pickle(self, data, filename):
        with open(filename, 'wb') as f:
            pickle.dump(data, f)

    def load_pickle(self, filename):
        with open(filename, 'rb') as f:
            return pickle.load(f)

    def get_item_profile(self):
        # Ensure the use of the class attribute self.df
        df = self.df.copy()
        
        df = df[~df.book_id.duplicated()]
        genres = df['genres'].apply(ast.literal_eval)

        # Convert genres to binary feature vectors
        mlb = MultiLabelBinarizer()
        genre_matrix = mlb.fit_transform(genres)
        genre_features = pd.DataFrame(genre_matrix, columns=mlb.classes_, index=df['book_id'])

        # Reduce dimensions with PCA
        pca = PCA(n_components=100)  # Adjust the number of components as needed
        reduced_genre_features = pca.fit_transform(genre_features)

        # Convert back to DataFrame for easy handling
        reduced_genre_features = pd.DataFrame(reduced_genre_features, index=df['book_id'])

        return reduced_genre_features

    def get_reviewer_profile(self):
        # import sqlite3
        # import ast
        connection=sqlite3.connect('/home/gm/Desktop/ExcelR_Projects/book_recommendation/preprocessing_cleaning/FINAL_DATA/DATASETS/book_reviews.db')
        book_reviews=pd.read_sql_query("SELECT * FROM book_reviews", connection)

        book_reviews=book_reviews[~book_reviews['review_rating'].isna()]
        book_reviews.reset_index(inplace=True,drop=True)
        book_reviews['rating_of_user']=book_reviews['review_rating'].apply(lambda x: x.split()[1])
        # user_item_interaction=book_reviews.groupby('reviewer_id')[['book_id', 'rating_of_user']].apply(lambda x: x.reset_index(drop=True))
        a=book_reviews[['book_id', 'reviewer_id', 'rating_of_user']]
        df_evaluated = a.map(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
        # print(df_evaluated)
        return df_evaluated

    def filtered_data(self):
        item_profiles = self.get_item_profile()
        df_evaluated = self.get_reviewer_profile()

        total_index = set(item_profiles.index) ^ set(df_evaluated.book_id)
        symmetric_difference_list = list(total_index)

        item_profiles_filtered = item_profiles[~item_profiles.index.isin(symmetric_difference_list)]
        df_evaluated_filtered = df_evaluated[~df_evaluated['book_id'].isin(symmetric_difference_list)]


        return df_evaluated_filtered, item_profiles_filtered

    def get_user_profiles(self):
        df_evaluated_filtered, item_profiles_filtered = self.filtered_data()
        user_item_interactions = {
            reviewer_id: dict(zip(group['book_id'], group['rating_of_user']))
            for reviewer_id, group in df_evaluated_filtered.groupby('reviewer_id')
        }

        user_profiles = {}
        for user, interactions in user_item_interactions.items():
            profile = np.zeros(item_profiles_filtered.shape[1])
            total_weight = 0

            for book_id, rating in interactions.items():
                if book_id in item_profiles_filtered.index:
                    profile += rating * item_profiles_filtered.loc[book_id]
                    total_weight += rating

            if total_weight > 0:
                user_profiles[user] = profile / total_weight
            else:
                user_profiles[user] = profile

        user_profiles = pd.DataFrame(user_profiles).T

        return user_profiles, item_profiles_filtered




u=UserProfiling()
df1,df2=u.filtered_data()


#     def create_profiles(self):
#         user_profiles, item_profiles_filtered = self.get_user_profiles()
#         # Save both profiles in a single pickle file
#         self.save_pickle({'user_profiles': user_profiles, 'item_profiles_filtered': item_profiles_filtered}, 'profiles.pkl')

#     def load_profiles(self):
#         profiles = self.load_pickle('profiles.pkl')
#         return profiles['user_profiles'], profiles['item_profiles_filtered']

#     def recommend_books(self, user_id=None, book_id=None, top_n=10):
#         user_profiles, item_profiles = self.load_profiles()

#         if user_id is not None:
#             if user_id not in user_profiles.index:
#                 return [], []
#             user_profile = user_profiles.loc[user_id].values.reshape(1, -1)
#             similarities = cosine_similarity(user_profile, item_profiles)
#         elif book_id is not None:
#             if book_id not in item_profiles.index:
#                 return [], []
#             item_profile = item_profiles.loc[book_id].values.reshape(1, -1)
#             similarities = cosine_similarity(item_profile, item_profiles)
#         else:
#             return [], []

#         similar_items = np.argsort(similarities[0])[::-1][:top_n]
#         recommended_book_ids = item_profiles.index[similar_items]
#         similarity_scores = similarities[0][similar_items]
        
#         return recommended_book_ids, similarity_scores

# if __name__ == '__main__':
#     profiling = UserProfiling()
    
#     # Check if the profiles pickle file exists, if not create it
#     if not os.path.exists('profiles.pkl'):
#         profiling.create_profiles()
    
#     df=pd.read_csv('/home/gm/Desktop/ExcelR_Projects/book_recommendation/model_training/Model_building_Hybrid/TF-IDF/bart_final_preprocess.csv')

#     # Example usage:
#     # book_title_input=input('Enter Title of the book:\n')
#     # book_id=df[df.book_title==book_title_input].book_id
#     # if book_id.empty:
#     #     print('Book not found')
#     # else:
#     #     book_id=book_id.iloc[0]
#     #     print(book_id)
#     # user_id=None
#     # book_id=2
#     # user_id=None
#     recommendations, scores = profiling.recommend_books(user_id=1,top_n=20)
#     print(recommendations)
#     a=df[df.book_id.isin(list(recommendations))].book_title
#     print(scores)


In [6]:
set(df2.index)^set(df1.book_id)

set()

In [30]:
a=list(set(df1.reviewer_id))

In [37]:
connection=sqlite3.connect(path)
book_reviews=pd.read_sql_query("SELECT * FROM book_reviews", connection)

In [39]:
a

[1,
 5,
 7,
 41,
 153,
 510,
 819,
 984,
 1994,
 2614,
 2631,
 2828,
 2938,
 3241,
 3784,
 3910,
 3947,
 4693,
 4695,
 5154,
 5176,
 5436,
 6049,
 6488,
 6556,
 6676,
 6681,
 6709,
 6724,
 7158,
 7261,
 7358,
 7742,
 8253,
 8348,
 8351,
 8583,
 8599,
 9136,
 9547,
 9750,
 9847,
 10378,
 10492,
 10707,
 10936,
 11513,
 11587,
 12190,
 12334,
 12586,
 13167,
 13338,
 13702,
 13779,
 14120,
 14172,
 14468,
 14494,
 15048,
 15118,
 15174,
 15294,
 15314,
 15824,
 15894,
 16032,
 16677,
 16958,
 17116,
 17285,
 17547,
 18213,
 18440,
 18631,
 18657,
 18661,
 18672,
 18687,
 18745,
 19026,
 19809,
 19993,
 20543,
 20610,
 20649,
 20673,
 20698,
 21105,
 21178,
 21250,
 21481,
 21602,
 21912,
 22033,
 22610,
 22902,
 23034,
 23260,
 23760,
 23800,
 24176,
 24423,
 25108,
 25708,
 25994,
 26125,
 26185,
 26188,
 26259,
 26276,
 26292,
 26561,
 26597,
 26639,
 26684,
 26694,
 26737,
 26852,
 27271,
 27465,
 27925,
 27944,
 28110,
 28445,
 28467,
 28654,
 28829,
 28859,
 29339,
 29490,
 30078,
 

In [48]:
import ast
book_reviews['reviewer_id']=book_reviews['reviewer_id'].apply(lambda x : ast.literal_eval(x) if isinstance(x, str) else x)

In [49]:
for i in book_reviews['reviewer_id']:
    print(type(i))

<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class

In [51]:
len(book_reviews)

70959

In [55]:
df

Unnamed: 0.1,Unnamed: 0,book_id,book_title,book_details,format,author,num_pages,genres,num_ratings,num_reviews,average_rating,5,4,3,2,1,years_since_first_publication,Genre_Interpretation
0,0,57094644,Daughter of the Deep,New York Times #1 best-selling author Rick Rio...,Hardcover,Rick Riordan,0.144748,"['Fantasy', 'Middle Grade', 'Young Adult', 'Sc...",0.254560,0.689025,0.680305,0.423443,0.375877,0.251464,0.124733,0.003403,-1.894448,Juvenile Fiction
1,1,895185,The Ghost,The stunning new novel from the No. 1 bestsell...,Hardcover,Robert Harris,0.304597,"['Fiction', 'Thriller', 'Mystery', 'Crime', 'P...",0.085572,0.237198,-0.762375,-0.017886,0.289047,0.370226,0.258989,-0.027662,-0.300537,Thriller/Mystery
2,2,2948832,Seduce Me at Sunrise,\nHe'd tried so hard to forget her.\nKev Merri...,Mass Market Paperback,Lisa Kleypas,0.185632,"['Historical Romance', 'Romance', 'Historical'...",0.511851,0.524474,0.063872,0.555305,0.613293,0.595864,0.507648,0.284715,-0.363598,Historical Fiction
3,3,154126,The Discovery of India,In conjunction with the Jawaharlal Nehru Memor...,Paperback,Jawaharlal Nehru,1.386309,"['History', 'India', 'Nonfiction', 'Politics',...",-0.250787,-0.348547,0.168100,-0.073290,-0.100157,-0.176481,-0.178933,0.013985,1.431631,Non Fiction
4,4,298663,The Killer Inside Me,"Everyone in the small town of Central City, Te...",Paperback,Jim Thompson,-0.484222,"['Fiction', 'Crime', 'Mystery', 'Noir', 'Thril...",0.207299,0.294163,-0.553099,0.211275,0.354494,0.387119,0.454474,0.447853,1.338255,Thriller/Mystery
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15500,15566,183057601,Ours,The third and final.Blurb coming soon.,ebook,Julie Mannino,0.023473,"['BDSM', 'Romance', 'M M Romance', 'Fantasy', ...",-2.048186,-1.953341,-0.204847,-2.234238,-2.870195,-2.183616,-2.208814,-2.029850,-2.895006,"Supernatural, Mystery, and Romance"
15501,15567,337100,The Runes of the Earth,Beginning ten years after the end of the accla...,Hardcover,Stephen R. Donaldson,0.946065,"['Fantasy', 'Fiction', 'Epic Fantasy', 'Scienc...",-0.298514,-0.463990,-0.762375,-0.269883,-0.148728,-0.034705,-0.002992,0.016072,-0.116297,Fantasy
15502,15568,9817,Ten Days in the Hills,It is the morning after the Academy Awards. Ma...,Hardcover,Jane Smiley,0.606985,"['Fiction', 'Contemporary', 'Romance', 'Litera...",-0.853062,-0.404413,-3.167038,-1.494164,-1.061748,-0.557024,-0.096651,0.296962,-0.248938,Classic Fiction
15503,15569,32940867,The Chemist,"In this gripping page-turner, an ex-agent on t...",Paperback,Stephenie Meyer,0.899438,"['Fiction', 'Thriller', 'Mystery', 'Romance', ...",0.890139,1.229197,-0.849215,0.737328,0.886439,0.991032,1.127764,1.192957,-1.130532,Thriller/Mystery


In [52]:
book_reviews=book_reviews[book_reviews['reviewer_id'].isin((a))]

In [57]:
book_reviews.to_sql('book_reviews_final', connection, if_exists='replace', index=False)

69444

In [36]:
path='/home/gm/Desktop/ExcelR_Projects/book_recommendation/preprocessing_cleaning/FINAL_DATA/DATASETS/book_reviews.db'

In [58]:
import os
os.getcwd()


'/home/gm/Desktop/ExcelR_Projects/book_recommendation/model_training/Model_building_Hybrid/User_Profiling_and_genre'

In [22]:
a=df[df.book_id.isin(df2.index)][['book_id','book_title']]

In [23]:
a.to_csv('book_names_final.csv',index=False)

In [59]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
import ast
import sqlite3

class UserProfiling:
    def __init__(self, location_of_df='/home/gm/Desktop/ExcelR_Projects/book_recommendation/model_training/Model_building_Hybrid/TF-IDF/bart_final_preprocess.csv'):
        self.df = pd.read_csv(location_of_df)
        self.item_profiles = self.get_item_profile()
        self.item_similarities = self.compute_item_similarities()

    def save_pickle(self, data, filename):
        with open(filename, 'wb') as f:
            pickle.dump(data, f)

    def load_pickle(self, filename):
        with open(filename, 'rb') as f:
            return pickle.load(f)

    def get_item_profile(self):
        df = self.df.copy()
        df = df[~df.book_id.duplicated()]
        genres = df['genres'].apply(ast.literal_eval)

        # Convert genres to binary feature vectors
        mlb = MultiLabelBinarizer()
        genre_matrix = mlb.fit_transform(genres)
        genre_features = pd.DataFrame(genre_matrix, columns=mlb.classes_, index=df['book_id'])

        # Reduce dimensions with PCA
        pca = PCA(n_components=100)  
        reduced_genre_features = pca.fit_transform(genre_features)

        # Convert back to DataFrame for easy handling
        reduced_genre_features = pd.DataFrame(reduced_genre_features, index=df['book_id'])

        return reduced_genre_features

    def get_reviewer_profile(self):
        # import sqlite3
        # import ast
        connection=sqlite3.connect('/home/gm/Desktop/ExcelR_Projects/book_recommendation/preprocessing_cleaning/FINAL_DATA/DATASETS/book_reviews.db')
        book_reviews=pd.read_sql_query("SELECT * FROM book_reviews", connection)

        book_reviews=book_reviews[~book_reviews['review_rating'].isna()]
        book_reviews.reset_index(inplace=True,drop=True)
        book_reviews['rating_of_user']=book_reviews['review_rating'].apply(lambda x: x.split()[1])
        # user_item_interaction=book_reviews.groupby('reviewer_id')[['book_id', 'rating_of_user']].apply(lambda x: x.reset_index(drop=True))
        a=book_reviews[['book_id', 'reviewer_id', 'rating_of_user']]
        df_evaluated = a.map(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
        # print(df_evaluated)
        return df_evaluated

    def filtered_data(self):
        df_evaluated = self.get_reviewer_profile()
        
        # Find symmetric difference between book_id indices
        total_index = set(self.item_profiles.index) ^ set(df_evaluated.book_id)
        symmetric_difference_list = list(total_index)

        # Drop rows with indices present only in one of the lists
        item_profiles_filtered = self.item_profiles[~self.item_profiles.index.isin(symmetric_difference_list)]
        df_evaluated_filtered = df_evaluated[~df_evaluated['book_id'].isin(symmetric_difference_list)]
        
        return df_evaluated_filtered, item_profiles_filtered

    def get_user_profiles(self):
        df_evaluated_filtered, item_profiles_filtered = self.filtered_data()
        user_item_interactions = {
            reviewer_id: dict(zip(group['book_id'], group['rating_of_user']))
            for reviewer_id, group in df_evaluated_filtered.groupby('reviewer_id')
        }

        user_profiles = {}
        for user, interactions in user_item_interactions.items():
            profile = np.zeros(item_profiles_filtered.shape[1])
            total_weight = 0

            for book_id, rating in interactions.items():
                if book_id in item_profiles_filtered.index:
                    profile += rating * item_profiles_filtered.loc[book_id]
                    total_weight += rating

            if total_weight > 0:
                user_profiles[user] = profile / total_weight
            else:
                user_profiles[user] = profile

        user_profiles = pd.DataFrame(user_profiles).T
        return user_profiles, item_profiles_filtered
    
    def create_profiles(self):
        user_profiles, item_profiles_filtered = self.get_user_profiles()
        # Save both profiles in a single pickle file
        self.save_pickle({'user_profiles': user_profiles, 'item_profiles_filtered': item_profiles_filtered}, 'profiles.pkl')

    def load_profiles(self):
        profiles = self.load_pickle('profiles.pkl')
        return profiles['user_profiles'], profiles['item_profiles_filtered']


    def compute_item_similarities(self):
        return cosine_similarity(self.item_profiles)

    def recommend_books_for_user(self, user_id, top_n=10):
        user_profiles, item_profiles = self.get_user_profiles()
        if user_id not in user_profiles.index:
            return [], []

        user_profile = user_profiles.loc[user_id].values.reshape(1, -1)
        similarities = cosine_similarity(user_profile, item_profiles)
        similar_items = np.argsort(similarities[0])[::-1][:top_n]
        recommended_book_ids = item_profiles.index[similar_items]
        similarity_scores = similarities[0][similar_items]
        
        return recommended_book_ids, similarity_scores

    def recommend_books_for_item(self, book_id, top_n=10):
        if book_id not in self.item_profiles.index:
            return [], []

        item_index = self.item_profiles.index.get_loc(book_id)
        similarities = self.item_similarities[item_index]
        similar_items = np.argsort(similarities)[::-1][:top_n]
        recommended_book_ids = self.item_profiles.index[similar_items]
        similarity_scores = similarities[similar_items]
        
        return recommended_book_ids, similarity_scores

# Example usage:
profiling = UserProfiling()
recommendations_for_user, user_scores = profiling.recommend_books_for_user(user_id=1, top_n=10)
print("Recommendations for user:", recommendations_for_user)
print("User similarity scores:", user_scores)

recommendations_for_item, item_scores = profiling.recommend_books_for_item(book_id=1, top_n=10)
print("Recommendations for book:", recommendations_for_item)
print("Item similarity scores:", item_scores)


Recommendations for user: Index([41881472, 53280099,  3860977,  5946601,  2251306,     1031,  6667514,
         255127, 18077903,    69242],
      dtype='int64', name='book_id')
User similarity scores: [0.88552028 0.88345882 0.88107095 0.88033411 0.88033411 0.87754365
 0.87698065 0.87597031 0.87472622 0.87144695]
Recommendations for book: Index([136251, 1, 2548866, 590325, 7823592, 10859323, 2479827, 2, 7077215,
       769483],
      dtype='int64', name='book_id')
Item similarity scores: [1.         1.         0.88830403 0.88825123 0.84588845 0.84510263
 0.84510263 0.81585628 0.80779147 0.80502517]


In [64]:
list(recommendations_for_user)

Index([41881472, 53280099,  3860977,  5946601,  2251306,     1031,  6667514,
         255127, 18077903,    69242],
      dtype='int64', name='book_id')

In [73]:
df

Unnamed: 0.1,Unnamed: 0,book_id,book_title,book_details,format,author,num_pages,genres,num_ratings,num_reviews,average_rating,5,4,3,2,1,years_since_first_publication,Genre_Interpretation
0,0,57094644,Daughter of the Deep,New York Times #1 best-selling author Rick Rio...,Hardcover,Rick Riordan,0.144748,"['Fantasy', 'Middle Grade', 'Young Adult', 'Sc...",0.254560,0.689025,0.680305,0.423443,0.375877,0.251464,0.124733,0.003403,-1.894448,Juvenile Fiction
1,1,895185,The Ghost,The stunning new novel from the No. 1 bestsell...,Hardcover,Robert Harris,0.304597,"['Fiction', 'Thriller', 'Mystery', 'Crime', 'P...",0.085572,0.237198,-0.762375,-0.017886,0.289047,0.370226,0.258989,-0.027662,-0.300537,Thriller/Mystery
2,2,2948832,Seduce Me at Sunrise,\nHe'd tried so hard to forget her.\nKev Merri...,Mass Market Paperback,Lisa Kleypas,0.185632,"['Historical Romance', 'Romance', 'Historical'...",0.511851,0.524474,0.063872,0.555305,0.613293,0.595864,0.507648,0.284715,-0.363598,Historical Fiction
3,3,154126,The Discovery of India,In conjunction with the Jawaharlal Nehru Memor...,Paperback,Jawaharlal Nehru,1.386309,"['History', 'India', 'Nonfiction', 'Politics',...",-0.250787,-0.348547,0.168100,-0.073290,-0.100157,-0.176481,-0.178933,0.013985,1.431631,Non Fiction
4,4,298663,The Killer Inside Me,"Everyone in the small town of Central City, Te...",Paperback,Jim Thompson,-0.484222,"['Fiction', 'Crime', 'Mystery', 'Noir', 'Thril...",0.207299,0.294163,-0.553099,0.211275,0.354494,0.387119,0.454474,0.447853,1.338255,Thriller/Mystery
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15500,15566,183057601,Ours,The third and final.Blurb coming soon.,ebook,Julie Mannino,0.023473,"['BDSM', 'Romance', 'M M Romance', 'Fantasy', ...",-2.048186,-1.953341,-0.204847,-2.234238,-2.870195,-2.183616,-2.208814,-2.029850,-2.895006,"Supernatural, Mystery, and Romance"
15501,15567,337100,The Runes of the Earth,Beginning ten years after the end of the accla...,Hardcover,Stephen R. Donaldson,0.946065,"['Fantasy', 'Fiction', 'Epic Fantasy', 'Scienc...",-0.298514,-0.463990,-0.762375,-0.269883,-0.148728,-0.034705,-0.002992,0.016072,-0.116297,Fantasy
15502,15568,9817,Ten Days in the Hills,It is the morning after the Academy Awards. Ma...,Hardcover,Jane Smiley,0.606985,"['Fiction', 'Contemporary', 'Romance', 'Litera...",-0.853062,-0.404413,-3.167038,-1.494164,-1.061748,-0.557024,-0.096651,0.296962,-0.248938,Classic Fiction
15503,15569,32940867,The Chemist,"In this gripping page-turner, an ex-agent on t...",Paperback,Stephenie Meyer,0.899438,"['Fiction', 'Thriller', 'Mystery', 'Romance', ...",0.890139,1.229197,-0.849215,0.737328,0.886439,0.991032,1.127764,1.192957,-1.130532,Thriller/Mystery


Recommended Books: Index([46165, 124272, 49583709, 46270, 1334844, 126609, 373755, 331319, 17728,
       457228],
      dtype='int64', name='book_id')
Cosine Similarity Scores: [1.         1.         0.89848237 0.89653535 0.89482028 0.89344184
 0.89323847 0.89295316 0.89280945 0.89106384]


In [33]:
df[df.book_id.isin(recommendations)]

Unnamed: 0.1,Unnamed: 0,book_id,book_title,book_details,format,author,num_pages,genres,num_ratings,num_reviews,average_rating,5,4,3,2,1,years_since_first_publication,Genre_Interpretation
1284,1284,46270,Suvashun,"Considerada una obra maestra, esta novela supu...",Hardcover,Simin Daneshvar,0.343095,"['Novels', 'Iran', 'Fiction', 'Literature', 'H...",-0.233793,-0.104234,-0.269968,-0.131947,-0.077515,-0.036642,-0.047697,-0.099982,1.038512,Classic Fiction
1448,1448,1334844,Ronggeng Dukuh Paruk,Gabungan 3 buku seri Dukuh Paruk: Ronggeng Duk...,Paperback,Ahmad Tohari,0.409168,"['Fiction', 'Historical Fiction', 'Indonesian ...",-0.34597,-0.067864,0.757038,-0.128501,-0.165556,-0.368127,-0.484005,-0.468482,0.737381,Classic Fiction
2493,2493,126609,Sister Carrie,"A landmark in American literature, presented i...",Paperback,Theodore Dreiser,1.121925,"['Classics', 'Fiction', 'Literature', 'America...",0.465222,0.308893,-0.820456,0.407161,0.537831,0.665156,0.758955,0.762263,1.949033,Classic Fiction
2761,2761,17728,The House of Mirth,"First published in 1905, The House of Mirth sh...",Paperback,Edith Wharton,0.139592,"['Fiction', 'Classics', 'Historical Fiction', ...",0.919687,0.956912,-0.204847,0.867668,0.910617,0.924268,0.987012,1.083649,1.902607,Classic Fiction
3978,3978,46165,This Side of Paradise,"This Side of Paradise, F. Scott Fitzgerald's r...",Paperback,F. Scott Fitzgerald,-0.286552,"['Classics', 'Fiction', 'Literature', 'Novels'...",0.771307,0.80124,-1.180018,0.565968,0.780988,0.971456,1.1061,1.025608,1.754665,Classic Fiction
7540,7540,124272,The Wings of the Dove,Set amid the splendor of London drawing rooms ...,Paperback,Henry James,1.657739,"['Classics', 'Fiction', 'Literature', 'America...",0.065832,-0.092205,-0.762375,0.100193,0.176734,0.275304,0.393703,0.525448,1.938398,Classic Fiction
12368,12368,373755,"Absalom, Absalom!",,Paperback,William Faulkner,-0.047712,"['Classics', 'Fiction', 'Literature', 'Novels'...",0.517175,0.535115,-0.17198,0.605856,0.521545,0.569498,0.725548,0.91068,1.563717,Classic Fiction
12594,12594,331319,An American Tragedy,,Mass Market Paperback,Theodore Dreiser,2.000563,"['Classics', 'Fiction', 'Literature', 'Histori...",0.42428,0.269227,-0.237509,0.49016,0.488643,0.521994,0.610745,0.720787,1.690304,Classic Fiction
12645,12645,457228,Butcher's Crossing,In his National Book Award–winning novel Augus...,Paperback,John Williams,-0.292679,"['Fiction', 'Westerns', 'Historical Fiction', ...",0.221244,0.456591,0.492414,0.334507,0.414705,0.218587,0.06564,-0.228131,1.208713,Classic Fiction
14532,14550,49583709,Lady Chatterley's Lover,One of the most extraordinary literary works o...,Paperback,D.H. Lawrence,0.381049,"['Fiction', 'Romance', 'Classics', 'Literature...",1.054023,1.061651,-1.533887,0.717633,0.958879,1.227518,1.422535,1.409922,1.666484,Classic Fiction
