In [30]:
import random
from random import choices, randint

import numpy as np
import pandas as pd
from faker import Faker

# Initialize Faker
fake = Faker()

# Seed every random source used in this cell (stdlib random, NumPy's global
# generator and Faker's shared generator) so that re-running the cell on a fresh
# kernel regenerates exactly the same synthetic datasets.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
Faker.seed(SEED)

# Base titles per genre. generate_books() picks one of these at random and
# appends a random word, so each list only needs a handful of stems.
genre_titles = {
    "Fiction": [
        "The Lost Horizon",
        "Echoes of Eternity",
        "Shadows of the Past",
        "A Distant Journey",
        "Whispers in the Wind",
    ],
    "Mystery": [
        "The Hidden Key",
        "Murder on the Midnight Express",
        "The Silent Witness",
        "The Enigma Code",
        "Undercover Shadows",
    ],
    "Fantasy": [
        "The Dragon’s Call",
        "The Wizard’s Legacy",
        "Chronicles of Eldoria",
        "The Crystal Quest",
        "The Sorcerer’s Stone",
    ],
    "Science Fiction": [
        "Galactic Wars",
        "The AI Prophecy",
        "Beyond the Stars",
        "The Quantum Paradox",
        "Terraformers",
    ],
    "Romance": [
        "A Love Unforeseen",
        "Hearts Entwined",
        "The Paris Affair",
        "Whispered Promises",
        "A Chance Encounter",
    ],
    "Non-Fiction": [
        "The Art of Mindfulness",
        "Breaking Barriers",
        "The Science of Sleep",
        "A History of Empires",
        "Innovations of Tomorrow",
    ],
    "Biography": [
        "The Life of a Visionary",
        "Memoirs of a Maverick",
        "The Untold Story",
        "A Journey to Greatness",
        "Living Legends",
    ],
    "Thriller": [
        "The Last Chase",
        "Bloodlines",
        "The Final Countdown",
        "Deadly Pursuit",
        "The Betrayal Pact",
    ],
}

# Step 1: Generate Synthetic Book Data
def generate_books(num_books=1000):
    """Build a synthetic book catalogue.

    Each row gets a random genre from ``genre_titles``, a title made of one of
    that genre's base titles plus a random capitalised word, a fake author
    name, a publication year in 1950-2022, a country, a rating in 1-10 and a
    unique fake ISBN-10.

    Args:
        num_books: Number of rows to generate.

    Returns:
        DataFrame with columns ISBN, Title, Author, Genre,
        Year_of_Publication, Country and Rating.
    """
    genres = list(genre_titles.keys())
    countries = ['USA', 'UK', 'Canada', 'India', 'Australia', 'Germany', 'France', 'Japan']
    home_country = 'USA'
    # The non-USA branch must not be able to draw 'USA' again; otherwise the
    # USA share ends up at 0.5 + 0.5/8 (about 56%) instead of the intended 50%.
    foreign_countries = [country for country in countries if country != home_country]
    columns = ['ISBN', 'Title', 'Author', 'Genre', 'Year_of_Publication', 'Country', 'Rating']

    books = []
    for _ in range(num_books):
        genre = np.random.choice(genres)
        title = np.random.choice(genre_titles[genre]) + f" {fake.word().capitalize()}"
        author = fake.name()
        year = np.random.randint(1950, 2023)  # upper bound is exclusive
        # 50% of books are from the USA, the rest are spread over the other countries
        if np.random.random() < 0.5:
            country = home_country
        else:
            country = np.random.choice(foreign_countries)
        rating = np.random.randint(1, 11)  # upper bound is exclusive
        isbn = fake.unique.isbn10()
        books.append([isbn, title, author, genre, year, country, rating])

    return pd.DataFrame(books, columns=columns)

# Step 2: Generate Synthetic User Data
def generate_users(num_users=500, book_df=None):
    """Build a synthetic user table whose preferences refer to real catalogue values.

    Genres, countries, authors and decades are all taken from ``book_df`` so
    that every preference can be matched against at least one book.

    Args:
        num_users: Number of users to generate; User_ID runs from 1 to num_users.
        book_df: Book catalogue with Genre, Country, Author and
            Year_of_Publication columns.

    Returns:
        DataFrame with columns User_ID, Age, Gender, Country,
        Prefers_High_Rated_Books, Preferred_Genres, Preferred_Authors,
        Preferred_Decade and Likes_Foreign_Books. The two "Preferred_*" list
        columns are stored as ", "-separated strings.

    Raises:
        ValueError: If ``book_df`` is not supplied.
    """
    if book_df is None:
        raise ValueError("Book dataset is required to ensure consistency!")

    genres = book_df['Genre'].unique()
    countries = book_df['Country'].unique()
    authors = book_df['Author'].unique()
    # Unique decades present in the catalogue, e.g. 1987 -> 1980
    decades = sorted({(year // 10) * 10 for year in book_df['Year_of_Publication']})

    def _pick_distinct(pool, k=3):
        """Return up to k distinct entries of pool joined as a ', '-separated string."""
        # Sample WITHOUT replacement so a "top 3" never lists the same entry twice
        # (random.choices samples with replacement and could yield duplicates).
        k = min(k, len(pool))
        return ", ".join(np.random.choice(pool, size=k, replace=False))

    users = {
        'User_ID': range(1, num_users + 1),
        'Age': np.random.randint(15, 80, num_users),  # ages 15-79, upper bound exclusive
        # 90% Male/Female, 10% Other
        'Gender': np.random.choice(['Male', 'Female', 'Other'], p=[0.45, 0.45, 0.1], size=num_users),
        'Country': np.random.choice(countries, num_users),
        'Prefers_High_Rated_Books': np.random.choice(['Yes', 'No'], num_users),
        # Top 3 genres / authors, joined as strings for readability
        'Preferred_Genres': [_pick_distinct(genres) for _ in range(num_users)],
        'Preferred_Authors': [_pick_distinct(authors) for _ in range(num_users)],
        'Preferred_Decade': np.random.choice(decades, num_users),
        'Likes_Foreign_Books': np.random.choice(['Yes', 'No'], num_users)
    }

    return pd.DataFrame(users)

# Step 3: Build both datasets; users are derived from the book catalogue so
# that their preferences only mention values that actually occur in it
num_books = 1000
num_users = 500

books_df = generate_books(num_books)
users_df = generate_users(num_users, books_df)

# Step 4: Persist both tables as CSV (without the pandas index column)
users_df.to_csv("synthetic_users.csv", index=False)
books_df.to_csv("synthetic_books.csv", index=False)

# Preview the first rows of each dataset
print("Books Dataset Sample:", books_df.head(), sep="\n")
print("\nUsers Dataset Sample:", users_df.head(), sep="\n")

Books Dataset Sample:
            ISBN                        Title          Author  \
0  0-9577122-4-3        Echoes of Eternity It    Kayla Torres   
1  1-07-584219-0  The Quantum Paradox Foreign   Mario Gardner   
2  0-405-07656-8        The Dragon’s Call Who  Robert Alvarez   
3  0-8024-0626-2      The Final Countdown Art       Bob Hicks   
4  1-363-51264-1         The AI Prophecy Type  Joanne Gilbert   

             Genre  Year_of_Publication Country  Rating  
0          Fiction                 2022     USA      10  
1  Science Fiction                 1974     USA       2  
2          Fantasy                 1978   India       2  
3         Thriller                 2017  Canada      10  
4  Science Fiction                 2016     USA       3  

Users Dataset Sample:
   User_ID  Age  Gender    Country Prefers_High_Rated_Books  \
0        1   73  Female  Australia                       No   
1        2   68  Female      India                       No   
2        3   19    Male  Au

In [43]:
import sqlite3
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split

# Connect to the SQLite database that backs the recommender
conn = sqlite3.connect("book_recommendation_system.db")

# One-time bootstrap: the SELECTs further down expect the Books, Users and
# Ratings tables to exist already. If they do not, uncomment the lines below to
# write books_df / users_df from the generation cell into the database together
# with 5000 random (User_ID, ISBN, Rating) rows, ratings being integers 1-10.
# books_df.to_sql("Books", conn, if_exists="replace", index=False)
# users_df.to_sql("Users", conn, if_exists="replace", index=False)
# ratings_data = {
#     'User_ID': np.random.choice(users_df['User_ID'], 5000),
#     'ISBN': np.random.choice(books_df['ISBN'], 5000),
#     'Rating': np.random.randint(1, 11, 5000)
# }
# ratings_df = pd.DataFrame(ratings_data)
# ratings_df.to_sql("Ratings", conn, if_exists="replace", index=False)

# Load data from the database (this rebinds books_df / users_df to the stored copies)
books_df = pd.read_sql_query("SELECT * FROM Books", conn)
users_df = pd.read_sql_query("SELECT * FROM Users", conn)
ratings_df = pd.read_sql_query("SELECT * FROM Ratings", conn)

# Combine metadata for content-based filtering: one space-separated string per
# book made of its genre, author, country and publication year
books_df['metadata'] = (
    books_df['Genre'] + " " +
    books_df['Author'] + " " +
    books_df['Country'] + " " +
    books_df['Year_of_Publication'].astype(str)
)

# TF-IDF Vectorizer for content-based filtering
# TF-IDF turns each metadata string into a numeric vector of term weights
# (common English stop words are dropped). cosine_sim is the pairwise cosine
# similarity between those vectors, so cosine_sim[i, j] compares the books at
# row positions i and j of books_df.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(books_df['metadata'])
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Display user details
def display_user_details(user_id):
    """Print the stored profile of one user.

    Args:
        user_id: Value of the User_ID column to look up; anything ``int()``
            accepts (including NumPy integers).

    Raises:
        IndexError: If no row in the Users table has this User_ID.
    """
    # Bind the id as a query parameter instead of formatting it into the SQL text.
    # int() also converts NumPy integers, which sqlite3 would otherwise not bind
    # as a plain integer.
    matches = pd.read_sql_query(
        "SELECT * FROM Users WHERE User_ID = ?", conn, params=(int(user_id),)
    )
    if matches.empty:
        raise IndexError(f"No user found with User_ID = {user_id}")
    user_preferences = matches.iloc[0]

    print("User Details:")
    print(f"User ID: {user_preferences['User_ID']}")
    print(f"Name: User_{user_preferences['User_ID']}")  # Simulated name
    # (label, column) pairs printed one per line, in this order
    labelled_columns = [
        ("Age", 'Age'),
        ("Gender", 'Gender'),
        ("Country", 'Country'),
        ("Prefers High Rated Books", 'Prefers_High_Rated_Books'),
        ("Preferred Genres", 'Preferred_Genres'),
        ("Preferred Authors", 'Preferred_Authors'),
        ("Preferred Decade", 'Preferred_Decade'),
        ("Likes Foreign Books", 'Likes_Foreign_Books'),
    ]
    for label, column in labelled_columns:
        print(f"{label}: {user_preferences[column]}")
    print("-" * 50)

# Content-based recommendation function
def recommend_books_content(user_id, top_n=5):
    """Recommend books whose metadata is most similar to the user's stated tastes.

    The user's preferences select a set of "seed" books: first with strict
    filters (genre AND author AND decade), then relaxed (genre OR author OR
    decade), then the whole catalogue. Users who do not like foreign books only
    get seed books from their own country, except in the final fallback. Every
    book is then scored by its mean cosine similarity to the seed books.

    Args:
        user_id: Value of the User_ID column to look up; anything ``int()``
            accepts (including NumPy integers).
        top_n: Maximum number of books to return.

    Returns:
        DataFrame with Title, Author, Genre, Year_of_Publication and Country of
        the ``top_n`` highest-scoring books, best first.

    Raises:
        IndexError: If no row in the Users table has this User_ID.
    """
    # Bind the id as a query parameter instead of formatting it into the SQL text.
    matches = pd.read_sql_query(
        "SELECT * FROM Users WHERE User_ID = ?", conn, params=(int(user_id),)
    )
    if matches.empty:
        raise IndexError(f"No user found with User_ID = {user_id}")
    user_preferences = matches.iloc[0]

    preferred_genres = user_preferences['Preferred_Genres'].split(", ")
    preferred_authors = user_preferences['Preferred_Authors'].split(", ")
    preferred_decade = user_preferences['Preferred_Decade']
    likes_foreign = user_preferences['Likes_Foreign_Books'] == "Yes"
    user_country = user_preferences['Country']

    # Build each criterion once; the strict and relaxed filters only differ in
    # how the three taste criteria are combined.
    genre_match = books_df['Genre'].isin(preferred_genres)
    author_match = books_df['Author'].isin(preferred_authors)
    decade_match = (books_df['Year_of_Publication'] // 10 * 10) == preferred_decade
    if likes_foreign:
        country_match = pd.Series(True, index=books_df.index)
    else:
        country_match = books_df['Country'] == user_country

    # Apply strict filters
    seed_mask = genre_match & author_match & decade_match & country_match

    # Relax filters if no matches
    if not seed_mask.any():
        print("No strict matches found. Relaxing filters...")
        seed_mask = (genre_match | author_match | decade_match) & country_match

    # Fallback to all books if still no matches
    if not seed_mask.any():
        print("No matches found even after relaxing filters. Using all books.")
        seed_mask = pd.Series(True, index=books_df.index)

    # cosine_sim is addressed by row POSITION, so convert the mask to positions
    # rather than relying on the index labels being 0..n-1.
    seed_positions = np.flatnonzero(seed_mask.to_numpy())
    user_profile = cosine_sim[seed_positions].mean(axis=0)

    # NOTE(review): the ranking covers the whole catalogue, not only the seed
    # books, so results can include books outside the filters above (including
    # foreign books for users who do not like them) - confirm this is intended.
    top_indices = np.argsort(user_profile)[::-1][:top_n]
    return books_df.iloc[top_indices][['Title', 'Author', 'Genre', 'Year_of_Publication', 'Country']]

# Collaborative (rating-based) filtering: fit an SVD matrix-factorisation model
# on the explicit 1-10 ratings.
reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(ratings_df[['User_ID', 'ISBN', 'Rating']], reader)
# Fix the split and the model initialisation so that predictions (and therefore
# the recommendations printed below) are reproducible across kernel restarts.
# testset is held out here but not evaluated in this cell.
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)
model = SVD(random_state=42)
model.fit(trainset)

def recommend_books_rating(user_id, top_n=5):
    """Recommend the unrated books with the highest predicted rating for a user.

    Args:
        user_id: User_ID as it appears in ``ratings_df``.
        top_n: Maximum number of books to return.

    Returns:
        DataFrame with Title, Author, Genre, Year_of_Publication and Country of
        the ``top_n`` candidates, ordered from highest to lowest predicted rating.
    """
    # A set gives O(1) membership checks when excluding already-rated books
    already_rated = set(ratings_df.loc[ratings_df['User_ID'] == user_id, 'ISBN'])
    candidates = books_df[~books_df['ISBN'].isin(already_rated)]

    estimates = [model.predict(user_id, isbn).est for isbn in candidates['ISBN']]
    # Sort the candidate rows themselves so the result keeps the ranking order;
    # selecting with .isin() afterwards would fall back to catalogue order.
    ranked = (
        candidates.assign(_predicted_rating=estimates)
        .sort_values('_predicted_rating', ascending=False, kind='stable')
        .head(top_n)
    )
    return ranked[['Title', 'Author', 'Genre', 'Year_of_Publication', 'Country']]

# Hybrid recommendation
def recommend_books_hybrid(user_id, top_n=5):
    """Blend content-based and rating-based recommendations for one user.

    The two lists are interleaved (content #1, rating #1, content #2, ...),
    duplicate titles are dropped keeping the first occurrence, and the first
    ``top_n`` rows are returned. Simply concatenating the lists and cutting at
    ``top_n`` would return the content-based list alone in almost every case.

    Args:
        user_id: User to recommend for.
        top_n: Maximum number of books to return; also the number requested
            from each underlying recommender.

    Returns:
        DataFrame with the same columns as the underlying recommenders and at
        most ``top_n`` rows.
    """
    content_recommendations = recommend_books_content(user_id, top_n)
    rating_recommendations = recommend_books_rating(user_id, top_n)
    # Defensive: if the content recommender returns anything other than a table
    # (e.g. a message string), fall back to the rating-based list.
    if not isinstance(content_recommendations, pd.DataFrame):
        return rating_recommendations

    def _with_rank(frame, source):
        """Tag each row with its position in its own list and its source list."""
        return frame.assign(_rank=list(range(len(frame))), _source=source)

    interleaved = pd.concat(
        [_with_rank(content_recommendations, 0), _with_rank(rating_recommendations, 1)]
    ).sort_values(['_rank', '_source'], kind='stable')

    hybrid_recommendations = (
        interleaved.drop_duplicates(subset='Title')
        .head(top_n)
        .drop(columns=['_rank', '_source'])
    )
    return hybrid_recommendations

# Smoke test: show one user's profile followed by the output of each
# recommender (change user_id to inspect a different user)
user_id = 1
display_user_details(user_id)

# Each header is printed before its recommender runs because
# recommend_books_content may print status messages of its own
print("\nContent-Based Recommendations:")
print(recommend_books_content(user_id))

print("\nRating Recommendations:")
print(recommend_books_rating(user_id))
# Hybrid output combines the content-based and rating-based lists
print("\nHybrid Recommendations:")
print(recommend_books_hybrid(user_id))


User Details:
User ID: 1
Name: User_1
Age: 43
Gender: Male
Country: Australia
Prefers High Rated Books: Yes
Preferred Genres: Fiction, Science Fiction, Non-Fiction
Preferred Authors: Deborah Rodriguez, Amber Graham, Nicole Burton
Preferred Decade: 1960
Likes Foreign Books: Yes
--------------------------------------------------

Content-Based Recommendations:
No strict matches found. Relaxing filters...
                                 Title           Author            Genre  \
431             Beyond the Stars After     Andrew Smith  Science Fiction   
216             Galactic Wars Security     Steven Smith  Science Fiction   
824          A History of Empires Sell   Jennifer Smith      Non-Fiction   
349  Innovations of Tomorrow According  Jennifer Turner      Non-Fiction   
676               Terraformers Foreign    Raymond Smith  Science Fiction   

     Year_of_Publication Country  
431                 1966     USA  
216                 1962     USA  
824                 1970  Canada