In [1]:
#machine learning models

In [2]:
#libraries
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import NMF
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np

In [3]:
#load data
file = 'encoded.csv'
books_df= pd.read_csv(file)

In [4]:
books_df['book_id'] = range(len(books_df))
books_df['user_id'] = 1
print(books_df[['user_id', 'book_id', 'rating']].head())

   user_id  book_id    rating
0        1        0 -0.959559
1        1        1  1.606694
2        1        2  0.673511
3        1        3 -0.155985
4        1        4 -0.415203


In [5]:
print(books_df.isnull().sum())

title                 0
author                0
desc                  0
genre                 0
rating                0
Languages             0
Culture               0
Mystery               0
Non-Fiction           0
Animals               0
Travel                0
Religion              0
Social Issues         0
Young Adult           0
Unknown               0
Politics              0
Geography             0
Science Fiction       0
Lifestyle             0
History               0
Hobbies               0
Fiction               0
Science               0
Action                0
Academia              0
Historical Fiction    0
Fantasy               0
Entertainment         0
Romance               0
Mythology             0
Technology            0
Education             0
Art                   0
book_id               0
user_id               0
dtype: int64


In [6]:
genre_columns = [
    'Languages', 'Culture', 'Mystery', 'Non-Fiction', 'Animals', 'Travel',
    'Religion', 'Social Issues', 'Young Adult', 'Unknown', 'Politics',
    'Geography', 'Science Fiction', 'Lifestyle', 'History', 'Hobbies',
    'Fiction', 'Science', 'Action', 'Academia', 'Historical Fiction',
    'Fantasy', 'Entertainment', 'Romance', 'Mythology', 'Technology',
    'Education', 'Art'
]

In [7]:
# Combine genre columns into a list of genres per book
books_df['genres'] = books_df[genre_columns].apply(lambda row: [genre for genre in genre_columns if row[genre] == 1], axis=1)

# Initialize MultiLabelBinarizer to transform genre lists to binary matrix
mlb = MultiLabelBinarizer()
genres_encoded = mlb.fit_transform(books_df['genres'])
genres_df = pd.DataFrame(genres_encoded, columns=mlb.classes_)

# Combine with original DataFrame
books_df = pd.concat([books_df, genres_df], axis=1)


In [8]:
features = books_df[mlb.classes_].copy()
features['rating'] = books_df['rating']

In [9]:
# Create user-item matrix
user_item_matrix = books_df.pivot(index='user_id', columns='book_id', values='rating')

print(user_item_matrix.shape)


(1, 89521)


In [10]:
# Check and rename duplicate columns
def rename_duplicate_columns(df):
    cols = pd.Series(df.columns)
    for dup in cols[cols.duplicated()].unique(): 
        cols[cols[cols == dup].index.values.tolist()] = [dup + '_' + str(i) if i != 0 else dup for i in range(sum(cols == dup))]
    df.columns = cols

# Rename duplicate columns if any
rename_duplicate_columns(books_df)

In [11]:
# Apply renaming
rename_duplicate_columns(books_df)


In [12]:
# Parameters for simulation
num_simulated_users = 10  # Number of additional users
num_simulated_books = len(books_df)  # Number of existing books

# Generate simulated user ratings
np.random.seed(42)  # For reproducibility

# Simulated additional ratings
additional_data = {
    'user_id': np.random.randint(1, num_simulated_users + 1, size=num_simulated_books * num_simulated_users),
    'book_id': np.tile(books_df['title'], num_simulated_users),
    'rating': np.random.uniform(1, 5, size=num_simulated_books * num_simulated_users)
}

# Create DataFrame for simulated ratings
additional_df = pd.DataFrame(additional_data)

# Combine existing book data with simulated ratings
# Assuming you want to keep the existing book data and add the ratings to it
books_df = pd.concat([books_df, additional_df], ignore_index=True)

# Print the updated DataFrame
print("Updated DataFrame with simulated ratings:")
print(books_df)

# Create user-item matrix
user_item_matrix = books_df.pivot(index='user_id', columns='book_id', values='rating')

print("\nUser-item matrix:")
print(user_item_matrix)

Updated DataFrame with simulated ratings:
                                                    title  \
0       Between Two Fires: American Indians in the Civ...   
1                                Fashion Sourcebook 1920s   
2                                              Hungary 56   
3       All-American Anarchist: Joseph A. Labadie and ...   
4       The Human Equation: Building Profits by Puttin...   
...                                                   ...   
984726                                                NaN   
984727                                                NaN   
984728                                                NaN   
984729                                                NaN   
984730                                                NaN   

                                  author  \
0                   Laurence M. Hauptman   
1       Charlotte Fiell,Emmanuelle Dirix   
2                          Andy Anderson   
3                   Carlotta R. Anderson   
4    

ValueError: Index contains duplicate entries, cannot reshape

In [15]:
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split

# Convert DataFrame into Surprise dataset
reader = Reader(rating_scale=(books_df['rating'].min(), books_df['rating'].max()))
data = Dataset.load_from_df(books_df[['user_id', 'book_id', 'rating']], reader)

# Create train-test split using Surprise
trainset, testset = train_test_split(data, test_size=0.2)

In [16]:
sparsity = 1.0 - (np.count_nonzero(user_item_matrix) / float(user_item_matrix.size))
print(f"Sparsity: {sparsity:.2%}")

Sparsity: 0.00%


In [17]:
# Train the SVD model
model = SVD()
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1c190e9f8f0>

In [18]:
predictions = model.test(testset)
rmse = accuracy.rmse(predictions)
print(f'RMSE: {rmse}')

RMSE: 1.3243
RMSE: 1.3243021853088448


In [19]:
def get_recommendations(user_id, model, user_item_matrix, top_n=10):
    # Predict ratings for all books that the user hasn't rated
    all_books = user_item_matrix.columns
    rated_books = user_item_matrix.loc[user_id].dropna().index
    unrated_books = [book for book in all_books if book not in rated_books]
    
    predictions = [model.predict(user_id, book) for book in unrated_books]
    predictions.sort(key=lambda x: x.est, reverse=True)
    
    top_recommendations = predictions[:top_n]
    return [(pred.iid, pred.est) for pred in top_recommendations]

# Example usage
recommendations = get_recommendations(user_id=1, model=model, user_item_matrix=user_item_matrix)
print(recommendations)

[]
