<a href="https://colab.research.google.com/github/HaqTetsuya/rusdi-prototype-1/blob/book_recomendation/books.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# @title Filter
import csv

import pandas as pd

# Read the raw dataset. csv.QUOTE_ALL is handed to the parser's `quoting`
# option; note that this is a quoting *mode*, not the quote character itself
# (that would be `quotechar`).
df = pd.read_csv("BooksDatasetClean.csv", quoting=csv.QUOTE_ALL)

# If parsing aborts on malformed rows, they can be skipped instead:
# df = pd.read_csv("BooksDatasetClean.csv", quoting=csv.QUOTE_ALL, on_bad_lines='skip')

# Keep only the columns that the rest of the notebook works with.
df_filtered = df.loc[:, [
    'Title',
    'Authors',
    'Description',
    'Category',
    'Publish Date (Year)',
]]

# Persist the reduced table for the later cells, then preview the first rows.
df_filtered.to_csv("BooksDatasetCleanFiltered.csv", index=False)
print(df_filtered.head())

In [None]:
# @title Visualisasi
# prompt: visualisation of the content of the datasets

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# This cell expects `df_filtered` from the "Filter" cell. When running it on
# its own, load the saved copy instead:
# df_filtered = pd.read_csv("BooksDatasetCleanFiltered.csv")

# --- Distribution of publication years -------------------------------------
plt.figure(figsize=(10, 6))
sns.histplot(df_filtered['Publish Date (Year)'], kde=True)
plt.title('Distribution of Book Publication Years')
plt.xlabel('Publication Year')
plt.ylabel('Number of Books')
plt.show()

# --- Ten most frequent categories -------------------------------------------
plt.figure(figsize=(12, 6))
category_counts = df_filtered['Category'].value_counts().head(10)  # Top 10 categories
sns.barplot(x=category_counts.index, y=category_counts.values)
plt.title('Top 10 Book Categories')
plt.xlabel('Category')
plt.ylabel('Number of Books')
plt.xticks(rotation=45, ha='right')  # long category names would overlap otherwise
plt.show()

# --- Word cloud of the descriptions (requires the wordcloud library) --------
#!pip install wordcloud
from wordcloud import WordCloud

# Drop missing descriptions before joining: casting NaN to str would inject
# the literal word "nan" once per empty row and distort the cloud.
text = " ".join(df_filtered['Description'].dropna().astype(str))
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)

plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.title('Wordcloud of Book Descriptions')
plt.show()


In [None]:
# @title Ubah ke tabel dan download
import pandas as pd

# Load the filtered CSV written by the "Filter" cell.
df = pd.read_csv("BooksDatasetCleanFiltered.csv")  # Replace with your actual CSV file

# Save as Excel file. The path is kept in one variable so the confirmation
# message below always names the file that was really written.
excel_path = "book_dataset.xlsx"
df.to_excel(excel_path, index=False)

print(f"Conversion successful! The file is saved as '{excel_path}'.")

# google.colab is only importable inside Colab; fall back to None elsewhere so
# the conversion above still works when the notebook is run locally.
try:
    from google.colab import files
except ImportError:
    files = None

# Uncomment inside Colab to download the generated files:
#files.download('BooksDatasetCleanFiltered.csv')
#files.download('book_dataset.xlsx')


Conversion successful! The file is saved as 'filtered_dataset.xlsx'.


In [None]:
# @title hapus row kosong
# prompt: remove rows in BooksDatasetCleanFiltered.csv that have an empty column

import pandas as pd

# Read the filtered dataset back from disk.
df = pd.read_csv("BooksDatasetCleanFiltered.csv")

# Keep only fully populated rows: a row is discarded as soon as any one of
# its cells is missing.
df_cleaned = df.dropna(axis=0, how="any")

# Write the result to a separate file so the unfiltered CSV stays untouched.
df_cleaned.to_csv("BooksDatasetCleanFiltered_cleaned.csv", index=False)
print("Rows with empty columns removed and saved to 'BooksDatasetCleanFiltered_cleaned.csv'")


Rows with empty columns removed and saved to 'BooksDatasetCleanFiltered_cleaned.csv'


In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# 🔹 Step 1: Load and Preprocess Dataset
def _read_books_csv(file_path):
    """Read the CSV at ``file_path``, retrying with progressively more lenient parser options."""
    parse_attempts = (
        {},  # plain pandas defaults
        {'escapechar': '\\', 'quotechar': '"', 'encoding': 'utf-8'},
        # Python engine, skipping malformed rows instead of aborting.
        # (`on_bad_lines` replaces `error_bad_lines`, which pandas 2.x removed.)
        {'sep': ',', 'engine': 'python', 'on_bad_lines': 'skip'},
    )
    last_error = None
    for options in parse_attempts:
        try:
            return pd.read_csv(file_path, **options)
        except pd.errors.ParserError as exc:
            # Only malformed content is worth a retry; a missing file or any
            # other failure propagates immediately with its real cause.
            last_error = exc
    raise last_error


def _adopt_similar_columns(df, expected_columns, missing_columns):
    """Rename loosely matching columns onto the expected names that are missing."""
    column_mapping = {}
    for expected in missing_columns:
        for actual in df.columns:
            # Never touch a column that already carries an expected name, and
            # never use the same source column twice.
            if actual in expected_columns or actual in column_mapping:
                continue
            if expected.lower() in str(actual).lower():
                column_mapping[actual] = expected
                break  # one source per expected name keeps column names unique
    return df.rename(columns=column_mapping) if column_mapping else df


def load_dataset(file_path):
    """Load the books CSV and prepare it for embedding.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A DataFrame holding whichever of the columns 'Title', 'Authors',
        'Description', 'Category' and 'Publish Date (Year)' are available,
        plus a 'combined_text' column joining the non-empty text fields of
        each row with single spaces. Rows without a title are dropped, text
        values are stripped, and missing text values become empty strings.

    Raises:
        ValueError: If none of the expected columns can be found.
        pandas.errors.ParserError: If the file cannot be parsed even with the
            most lenient options.
    """
    df = _read_books_csv(file_path)

    # Ensure we have the expected columns, if not, provide feedback
    expected_columns = ['Title', 'Authors', 'Description', 'Category', 'Publish Date (Year)']
    missing_columns = [col for col in expected_columns if col not in df.columns]

    if missing_columns:
        print(f"Warning: Missing expected columns: {missing_columns}")
        print(f"Available columns: {df.columns.tolist()}")
        # Try to make sensible mappings if column names are slightly different
        df = _adopt_similar_columns(df, expected_columns, missing_columns)

    # Select important columns that are available
    available_columns = [col for col in expected_columns if col in df.columns]
    if not available_columns:
        raise ValueError("No usable columns found in the dataset")

    df = df[available_columns]

    # Remove rows with missing essential information (only possible when a
    # title column exists at all).
    if 'Title' in df.columns:
        df = df.dropna(subset=['Title'])

    # Work on an independent copy so the cleaning below never writes into a
    # slice of the raw frame.
    df = df.copy()

    # Clean text columns. Missing values are replaced by '' *before* the cast
    # to str; otherwise NaN would turn into the literal text "nan".
    for col in df.columns:
        if pd.api.types.is_object_dtype(df[col]) or pd.api.types.is_string_dtype(df[col]):
            df[col] = df[col].fillna('').astype(str).str.strip()

    # Create combined text field for embedding, skipping empty fields so the
    # text does not contain stray double spaces.
    text_columns = [col for col in ['Title', 'Authors', 'Description', 'Category'] if col in df.columns]
    text_parts = df[text_columns].fillna('').astype(str)
    df['combined_text'] = [
        ' '.join(part for part in parts if part)
        for parts in text_parts.itertuples(index=False, name=None)
    ]

    print(f"Successfully loaded {len(df)} books")
    return df

# 🔹 Step 2: Encode Books with BERT
def encode_books(df, model_name='all-MiniLM-L6-v2'):
    """Convert book details into sentence embeddings.

    Args:
        df: DataFrame with a 'combined_text' column holding the text to embed.
        model_name: Name of the SentenceTransformer model to load. Defaults to
            the model this notebook was originally written for.

    Returns:
        A ``(model, book_embeddings)`` tuple: the loaded model (so queries can
        later be encoded with the same model) and the embeddings produced for
        the 'combined_text' values.
    """
    model = SentenceTransformer(model_name)

    print("Encoding books with BERT (this may take a while for large datasets)...")
    # Encode combined text; the texts are passed as a plain list in row order.
    book_embeddings = model.encode(df['combined_text'].tolist(), show_progress_bar=True)

    print(f"Encoded {len(book_embeddings)} books")
    return model, book_embeddings

# 🔹 Step 3: Find Similar Books Based on User Query
def recommend_books_bert(user_query, df, model, book_embeddings, top_n=5):
    """Recommend books based on user query using embedding similarity.

    Args:
        user_query: Free-text description of what the user is looking for.
        df: Book table; rows are looked up by position, so it must line up
            with ``book_embeddings``.
        model: Object whose ``encode`` method embeds a list of strings.
        book_embeddings: One embedding per row of ``df``.
        top_n: Maximum number of recommendations to return.

    Returns:
        A list of dicts, best match first. Each dict holds 'Title', 'Author',
        'Category' and 'Year' for the columns present in ``df``, plus
        'Similarity' as a string formatted to two decimals. The list is empty
        when ``top_n`` is not positive or there are no embeddings.
    """
    print(f"\nFinding books similar to: '{user_query}'")

    # Guard the degenerate cases explicitly: a slice of [-0:] would select
    # *every* book, and similarity against zero embeddings cannot be computed.
    if top_n <= 0 or len(book_embeddings) == 0:
        print("No recommendations to show.")
        return []

    # Encode user query
    user_embedding = model.encode([user_query])

    # Compute similarity between query and books
    similarities = cosine_similarity(user_embedding, book_embeddings)[0]

    # Get top N most similar books (argsort is ascending, hence the reversal)
    similar_books_idx = np.argsort(similarities)[-top_n:][::-1]

    # Display recommended books
    print("\n📚 Recommended Books:\n")
    recommendations = []

    for i, idx in enumerate(similar_books_idx):
        book = df.iloc[idx]  # fetch the row once instead of once per field
        book_data = {}

        if 'Title' in df.columns:
            book_title = book['Title']
            book_data['Title'] = book_title
            print(f"{i+1}. 📖 Title: {book_title}")

        if 'Authors' in df.columns:
            book_author = book['Authors']
            book_data['Author'] = book_author
            print(f"   ✍️ Author: {book_author}")

        if 'Category' in df.columns:
            book_category = book['Category']
            book_data['Category'] = book_category
            print(f"   📂 Category: {book_category}")

        if 'Publish Date (Year)' in df.columns:
            book_year = book['Publish Date (Year)']
            book_data['Year'] = book_year
            print(f"   📅 Year: {book_year}")

        # Add similarity score
        similarity = similarities[idx]
        book_data['Similarity'] = f"{similarity:.2f}"
        print(f"   🔍 Relevance: {similarity:.2f}")

        recommendations.append(book_data)
        print("-" * 50)

    return recommendations

# 🔹 Step 4: Run the Book Recommendation System with Error Handling
def run_recommender(dataset_path, user_query, top_n=5):
    """Load the dataset, embed it and print recommendations for one query.

    Any exception raised along the way is reported (message plus traceback)
    rather than propagated.

    Returns:
        ``(recommendations, df, model, book_embeddings)`` on success, or
        ``(None, None, None, None)`` if any step failed.
    """
    import traceback

    try:
        print(f"Loading dataset from: {dataset_path}")
        books = load_dataset(dataset_path)

        # Cap the request at the number of books that actually exist.
        available = len(books)
        if top_n > available:
            print(f"Warning: Dataset only contains {available} books, but {top_n} were requested.")
            top_n = available

        encoder, embeddings = encode_books(books)
        picks = recommend_books_bert(user_query, books, encoder, embeddings, top_n)
    except Exception as error:
        print(f"Error: {error}")
        traceback.print_exc()
        return None, None, None, None
    else:
        return picks, books, encoder, embeddings

# 🔹 Step 5: Interactive Mode
def interactive_mode(df, model, book_embeddings, top_n=5):
    """Allow user to enter multiple queries in an interactive session.

    Prompts repeatedly until the user types 'exit', 'quit' or 'q' (case and
    surrounding whitespace are ignored). Blank input is rejected with a hint
    instead of being sent to the model.

    Args:
        df: Book table passed through to ``recommend_books_bert``.
        model: Embedding model passed through to ``recommend_books_bert``.
        book_embeddings: Book embeddings passed through to
            ``recommend_books_bert``.
        top_n: Number of recommendations to show per query.
    """
    while True:
        print("\n" + "="*60)
        # Strip so that inputs such as "exit " are still recognised.
        user_input = input("Enter your book preference (or 'exit' to quit): ").strip()

        if user_input.lower() in ('exit', 'quit', 'q'):
            print("Thank you for using the Book Recommender!")
            break

        if not user_input:
            # An empty query carries no information; ask again instead of
            # encoding it and printing arbitrary matches.
            print("Please describe the kind of book you are looking for.")
            continue

        recommend_books_bert(user_input, df, model, book_embeddings, top_n)

# Main execution
if __name__ == "__main__":
    # Settings for the first, non-interactive run.
    dataset_path = "BooksDatasetCleanFiltered.csv"  # Change this to your actual dataset path
    initial_query = "a horror action book with supernatural elements"

    # Load, embed and answer the initial query in one go.
    recommendations, df, model, book_embeddings = run_recommender(dataset_path, initial_query)

    # run_recommender hands back None placeholders when it failed; only offer
    # follow-up queries when every piece is available.
    if not (df is None or model is None or book_embeddings is None):
        while True:
            continue_option = input("\nWould you like to try another query? (yes/no): ")

            if continue_option.lower() in ('no', 'n'):
                print("Thank you for using the Book Recommender!")
                break

            if continue_option.lower() in ('yes', 'y'):
                interactive_mode(df, model, book_embeddings)
                break

            # Anything else: explain and ask again.
            print("Please enter 'yes' or 'no'.")

Loading dataset from: BooksDatasetCleanFiltered.csv
Successfully loaded 65296 books
Encoding books with BERT (this may take a while for large datasets)...


Batches:   0%|          | 0/2041 [00:00<?, ?it/s]

Encoded 65296 books

Finding books similar to: 'a horror action book with supernatural elements'

📚 Recommended Books:

1. 📖 Title: Lord Loss (The Demonata, 1)
   ✍️ Author: By Shan, Darren
   📂 Category: Young Adult Fiction , Horror
   📅 Year: 2006
   🔍 Relevance: 0.63
--------------------------------------------------
2. 📖 Title: Scary Stories to Tell in the Dark: Collected from American Folklore
   ✍️ Author: By Schwartz, Alvin and Gammell, Stephen (ILT)
   📂 Category: Juvenile Fiction , Short Stories
   📅 Year: 1986
   🔍 Relevance: 0.63
--------------------------------------------------
3. 📖 Title: Count Karlstein
   ✍️ Author: By Pullman, Philip and Bryan, Diana (ILT)
   📂 Category: Juvenile Fiction , Horror
   📅 Year: 1998
   🔍 Relevance: 0.62
--------------------------------------------------
4. 📖 Title: Scary Stories 3 (Scary Stories Scary Stories)
   ✍️ Author: By Schwartz, Alvin and Gammell, Stephen (ILT)
   📂 Category: Juvenile Fiction , Short Stories
   📅 Year: 2001
   🔍 Re