In [381]:
import chardet

# Sniff the text encoding of the book catalogue from its raw bytes so the CSVs
# can be decoded correctly by the loading cell.
with open('/Users/kana/Desktop/RS/project1/datasets/bookData.csv', 'rb') as raw_file:
    raw_bytes = raw_file.read()

result = chardet.detect(raw_bytes)
charenc = result['encoding']
print(f"Detected encoding: {charenc}")

Detected encoding: Windows-1252


In [382]:
import pandas as pd


# Load the four project CSVs (catalogue, users, viewing history, held-out answers).
# All are decoded as Windows-1252, the encoding reported for bookData.csv.
book_data, user_data, user_historical_view, test_user_answers = (
    pd.read_csv(csv_path, encoding='Windows-1252')
    for csv_path in (
        '/Users/kana/Desktop/RS/project1/datasets/bookData.csv',
        '/Users/kana/Desktop/RS/project1/datasets/UserData.csv',
        '/Users/kana/Desktop/RS/project1/datasets/UserHistoricalView.csv',
        '/Users/kana/Desktop/RS/project1/datasets/TestUserAnswers.csv',
    )
)

In [383]:
# Strip every 'X' character from the ISBN column of each table that carries one,
# so the same book is keyed identically in all three frames.
for isbn_frame in (user_historical_view, test_user_answers, book_data):
    isbn_frame['isbn'] = isbn_frame['isbn'].str.replace('X', '')

In [384]:
# Report the number of missing values per column for every loaded table.
missing_value_reports = (
    ("Book Data Missing Values:", book_data),
    ("\nUser Data Missing Values:", user_data),
    ("\nUser Historical View Missing Values:", user_historical_view),
    ("\nTest User Answers Missing Values:", test_user_answers),
)

for heading, frame in missing_value_reports:
    print(heading)
    print(frame.isna().sum())

Book Data Missing Values:
isbn                 0
booktitle            0
bookauthor           0
yearofpublication    0
publisher            0
Synopsis             1
dtype: int64

User Data Missing Values:
userid      0
location    0
age         3
dtype: int64

User Historical View Missing Values:
userid    0
isbn      0
dtype: int64

Test User Answers Missing Values:
userid    0
isbn      0
dtype: int64


In [385]:
# Impute missing values in user_data.

# Age: replace missing ages with the mean age, then store ages as integers
# (astype(int) truncates the fractional part of the mean).
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the selected column: that chained in-place form
# emits a FutureWarning on pandas 2.x and does not modify the frame under
# Copy-on-Write.
mean_age = user_data['age'].mean()
user_data['age'] = user_data['age'].fillna(mean_age).astype(int)

# Location: fill only rows whose location is actually missing with the most
# frequent location. The previous code overwrote row 0 unconditionally, which
# replaced a real value (the missing-value summary reports no missing locations).
# NOTE(review): if row 0 held a placeholder string rather than NaN, clean that
# placeholder explicitly instead - confirm against the raw file.
most_frequent_location = user_data['location'].mode().iloc[0]
user_data['location'] = user_data['location'].fillna(most_frequent_location)

In [386]:
# Remove catalogue rows whose 'Synopsis' is missing (one row, per the missing-value
# summary above); the stated rationale is that nothing is known about such a book.
# The drop is done in place, so re-running the cell is harmless.
# NOTE(review): 'Synopsis' is itself discarded later in this notebook, so this
# removes a book based on a field that is not used afterwards - confirm intended.
book_data.dropna(subset=['Synopsis'], inplace=True)

In [387]:
# Join users to their viewing history (inner join on userid), then attach the
# catalogue details of each viewed book. The left join keeps history rows whose
# ISBN has no catalogue entry; those rows show up as NaN in the summary below.
user_history = user_data.merge(user_historical_view, on='userid')
merged_data = user_history.merge(book_data, on='isbn', how='left')
merged_data.isna().sum()

userid               0
location             0
age                  0
isbn                 0
booktitle            1
bookauthor           1
yearofpublication    1
publisher            1
Synopsis             1
dtype: int64

In [388]:
# Discard history rows that could not be matched to a catalogue entry, then keep
# only the columns needed to build user profiles (userid, isbn, title, author).
unused_columns = ['location', 'age', 'yearofpublication', 'publisher', 'Synopsis']
merged_data = merged_data.dropna().drop(columns=unused_columns)
merged_data


Unnamed: 0,userid,isbn,booktitle,bookauthor
0,11676,60938455,Fast Food Nation: The Dark Side of the AllAmer...,Eric Schlosser
1,11676,316096199,Lucky : A Memoir,Alice Sebold
2,11676,316569321,White Oleander : A Novel,Janet Fitch
3,11676,312195516,The Red Tent (Bestselling Backlist),Anita Diamant
4,11676,345361792,A Prayer for Owen Meany,John Irving
...,...,...,...,...
84,271448,043935806,Harry Potter and the Order of the Phoenix (Boo...,J. K. Rowling
85,271448,044023722,A Painted House,John Grisham
86,271448,446672211,Where the Heart Is,Billie Letts
87,271448,452282829,We Were the Mulvaneys,Joyce Carol Oates


In [389]:
# Build one profile row per user: the list of ISBNs the user viewed plus a boolean
# flag for every author and every book title that occurs in merged_data.

# Feature vocabularies, in order of first appearance.
unique_authors = merged_data['bookauthor'].unique()
unique_booktitles = merged_data['booktitle'].unique()

# userid -> {'isbn_list': [...], <author>: bool, ..., <booktitle>: bool, ...}
user_profiles = {}

# sort=False keeps users in order of first appearance.
for userid, user_books in merged_data.groupby('userid', sort=False):
    seen_authors = set(user_books['bookauthor'])
    seen_titles = set(user_books['booktitle'])

    user_profiles[userid] = {
        'isbn_list': user_books['isbn'].tolist(),
        # Author flags first, then title flags, matching the column layout of the CSV.
        **{author: author in seen_authors for author in unique_authors},
        **{booktitle: booktitle in seen_titles for booktitle in unique_booktitles},
    }

# One row per user; the user id moves from the index into a leading 'userid' column.
profile_df = (
    pd.DataFrame.from_dict(user_profiles, orient='index')
    .rename_axis('userid')
    .reset_index()
)

# Persist the profiles; the positional row index is written under the header 'index'.
profile_df.to_csv('/Users/kana/Desktop/RS/project1/Part1_File1_Profile_Group2.csv', index_label='index')
profile_df

Unnamed: 0,userid,isbn_list,Eric Schlosser,Alice Sebold,Janet Fitch,Anita Diamant,John Irving,MICHAEL CRICHTON,SOPHIE KINSELLA,J. K. Rowling,...,Life of Pi,Cold Mountain,American Gods,Red Dragon,ANGELA S ASHES,Ender s Game (Ender Wiggins Saga (Paperback)),Divine Secrets of the YaYa Sisterhood,The Nanny Diaries: A Novel,A Painted House,Fall On Your Knees (Oprah #45)
0,11676,"[60938455, 316096199, 316569321, 312195516, 34...",True,True,True,True,True,True,True,True,...,False,False,False,False,False,False,False,False,False,False
1,16795,"[142001740, 316666343, 446605239, 385504209, 6...",False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,22625,"[316601950, 316666343, 316769487, 345339681, 3...",False,True,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
3,35859,"[61009059, 312195516, 345342968, 044021145, 05...",False,False,False,True,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
4,95359,"[312305060, 316601950, 345361792, 60392452, 38...",False,False,False,False,True,False,False,True,...,False,False,False,False,False,False,False,False,False,False
5,104636,"[439064872, 439136369, 439139600, 044021145, 4...",False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
6,110912,"[60392452, 014028009, 385484518, 385722206, 43...",False,False,False,False,True,False,False,True,...,False,False,False,False,False,False,False,False,False,False
7,204864,"[156027321, 312195516, 316769487, 375700757, 3...",False,False,False,True,False,False,False,False,...,True,True,True,True,True,True,False,False,False,False
8,271448,"[60928336, 312291639, 316666343, 439139600, 04...",False,True,False,False,False,False,False,True,...,False,False,False,False,False,False,True,True,True,True


In [None]:
# Keep only the catalogue columns used from here on (isbn, booktitle, bookauthor).
# errors='ignore' makes the cell safe to re-run: the previous in-place drop raised
# KeyError on a second execution because the columns were already gone.
book_data = book_data.drop(columns=['yearofpublication', 'Synopsis', 'publisher'], errors='ignore')

In [402]:
# Display the catalogue as it stands after the column drop (isbn, booktitle, bookauthor).
book_data

Unnamed: 0,isbn,booktitle,bookauthor
0,440234743,The Testament,John Grisham
1,971880107,Wild Animus,Rich Shapero
2,345417623,Timeline,MICHAEL CRICHTON
3,446310786,To Kill a Mockingbird,Harper Lee
4,671027360,Angels and Demons,Dan Brown
...,...,...,...
109,439064864,Harry Potter and the Chamber of Secrets (Book 2),J. K. Rowling
110,043935806,Harry Potter and the Order of the Phoenix (Boo...,J. K. Rowling
111,440220602,The Chamber,John Grisham
112,671001795,Two for the Dough,Janet Evanovich


In [418]:
import pandas as pd
import numpy as np

# Assuming you have already loaded `profile_df` and `book_data` DataFrames.

# Score every user profile against every catalogue book with cosine similarity
# over one-hot author features, and save the user x book matrix as CSV.

# Authors present in the catalogue, in order of first appearance.
unique_authors = book_data['bookauthor'].unique()

# Only authors that are also columns of `profile_df` can contribute to a score.
matching_authors = [author for author in unique_authors if author in profile_df.columns]

# One-hot author vector per book (row order follows book_data). A dict lookup
# replaces the repeated linear `list.index` / `in list` scans per book.
author_position = {author: position for position, author in enumerate(matching_authors)}
book_vectors = np.zeros((len(book_data), len(matching_authors)), dtype=int)
for row, author in enumerate(book_data['bookauthor']):
    position = author_position.get(author)
    if position is not None:
        book_vectors[row, position] = 1

# Restrict the user profiles to the same author columns, in the same order.
profile_df_filtered = profile_df.loc[:, matching_authors]
profile_matrix = profile_df_filtered.to_numpy()

# Cosine similarity for all (user, book) pairs at once. The norms are computed
# once per row instead of once per pair inside a nested Python loop.
user_features = profile_matrix.astype(float)
book_features = book_vectors.astype(float)
dot_products = user_features @ book_features.T
user_norms = np.linalg.norm(user_features, axis=1)
book_norms = np.linalg.norm(book_features, axis=1)
norm_products = np.outer(user_norms, book_norms)

# Pairs where either vector is all zeros keep a similarity of 0 instead of
# dividing by zero.
similarity_matrix = np.divide(
    dot_products,
    norm_products,
    out=np.zeros_like(dot_products),
    where=norm_products != 0,
)

# Label rows by user id and columns by book title.
similarity_df = pd.DataFrame(similarity_matrix, index=profile_df['userid'], columns=book_data['booktitle'])

# Store the similarity matrix in the specified file
similarity_df.to_csv('/Users/kana/Desktop/RS/project1/Part1_File2_SimMatrix_Group2.csv', index=True)
similarity_df

booktitle,The Testament,Wild Animus,Timeline,To Kill a Mockingbird,Angels and Demons,Little Altars Everywhere,The Firm,Fast Food Nation: The Dark Side of the AllAmerican Meal,Where the Heart Is,Icy Sparks,...,House of Sand and Fog,Silence of the Lambs,Angela s Ashes (MMP) : A Memoir,The Pilot s Wife : A Novel,Harry Potter and the Goblet of Fire (Book 4),Harry Potter and the Chamber of Secrets (Book 2),Harry Potter and the Order of the Phoenix (Book 5),The Chamber,Two for the Dough,The Horse Whisperer
userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11676,0.0,0.0,0.258199,0.0,0.0,0.0,0.0,0.258199,0.258199,0.0,...,0.0,0.0,0.0,0.0,0.258199,0.258199,0.258199,0.0,0.258199,0.0
16795,0.0,0.0,0.0,0.0,0.447214,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.333333,0.333333,0.333333,0.333333,0.0,0.0,0.0
35859,0.408248,0.0,0.0,0.0,0.0,0.0,0.408248,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.408248,0.408248,0.408248,0.408248,0.408248,0.0
95359,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.333333,0.333333,0.333333,0.333333,0.0,0.0,0.0
104636,0.408248,0.0,0.0,0.0,0.0,0.0,0.408248,0.0,0.408248,0.0,...,0.0,0.0,0.0,0.0,0.408248,0.408248,0.408248,0.408248,0.0,0.0
110912,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.301511,0.301511,0.301511,0.0,0.0,0.0
204864,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.316228,0.316228,0.0,0.0,0.0,0.0,0.0,0.0,0.0
271448,0.353553,0.0,0.0,0.0,0.0,0.353553,0.353553,0.0,0.353553,0.0,...,0.0,0.0,0.0,0.0,0.353553,0.353553,0.353553,0.353553,0.0,0.0


In [432]:
# Recommend to every user the unread books with the highest similarity to their
# profile, and save the ranked list as CSV.

# Number of recommendations kept per user.
TOP_N = 5

# Title -> ISBN lookup used to report the ISBN of each recommended title.
# NOTE(review): if two catalogue rows share a title, the later ISBN wins - confirm
# that titles are unique in book_data.
isbn_title_map = dict(zip(book_data['booktitle'], book_data['isbn']))

recommendations = []

for user_id in user_data['userid']:
    # A user with no row in the similarity matrix has nothing to rank; skip them
    # rather than fail with KeyError on the lookup below.
    if user_id not in similarity_df.index:
        continue

    # ISBNs the user has already viewed, translated to catalogue titles.
    read_books_isbns = user_historical_view.loc[user_historical_view['userid'] == user_id, 'isbn'].tolist()
    read_books_titles = book_data.loc[book_data['isbn'].isin(read_books_isbns), 'booktitle'].tolist()

    # Similarity of this user to every book, minus the books already read.
    user_similarity_scores = similarity_df.loc[user_id]
    unread_books = user_similarity_scores.drop(index=read_books_titles, errors='ignore')

    # Highest-scoring unread books first.
    top_books = unread_books.sort_values(ascending=False).head(TOP_N)

    # Rank is counted per user, so it stays correct even when a user has fewer
    # than TOP_N candidates (a global `i % 5` would drift in that case).
    for rank, (title, similarity) in enumerate(top_books.items(), start=1):
        recommendations.append({
            'User ID': user_id,
            'Book’s ISBN': isbn_title_map[title],
            'Book’s Title': title,
            'Similarity Value': similarity,
            'Rank': rank,
        })

# Explicit columns keep the CSV header stable even when there are no recommendations.
recommendations_df = pd.DataFrame(
    recommendations,
    columns=['User ID', 'Book’s ISBN', 'Book’s Title', 'Similarity Value', 'Rank'],
)

# Single write to the deliverable path. The earlier template write to
# 'Part1_File3_Recommendation_Groupyour_group_no.csv' was a leftover placeholder
# and has been removed, together with an unused display loop.
recommendations_df.to_csv('/Users/kana/Desktop/RS/project1/Part1_File3_Recommendation_Group2.csv', index=False)
recommendations_df


Unnamed: 0,User ID,Book’s ISBN,Book’s Title,Similarity Value,Rank
0,11676,61009059,One for the Money (Stephanie Plum Novels (Pape...,0.258199,1
1,11676,316284955,White Oleander : A Novel (Oprah s Book Club),0.258199,2
2,11676,316666343,The Lovely Bones,0.258199,3
3,11676,345353145,Sphere,0.258199,4
4,11676,446364193,Along Came a Spider (Alex Cross Novels),0.258199,5
5,16795,671003755,She s Come Undone,0.447214,1
6,16795,671027360,Angels and Demons,0.447214,2
7,16795,446608955,A Walk to Remember,0.447214,3
8,16795,316096199,Lucky : A Memoir,0.447214,4
9,16795,446606812,Message in a Bottle,0.447214,5
