## Connect to MongoDB and Retrieve Data

In [1]:
# Import dependenctt
from pprint import pprint
import getpass 
import pandas as pd
import numpy as np
from pymongo import MongoClient
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
password = getpass.getpass("MongoDB password: ")

MongoDB password:  ········


In [3]:
client = MongoClient(f'mongodb://book_group:{password}@macragge.reika.io:47017/?authSource=books')

In [4]:
# Assign the database to a variable name
db = client['books']
collection = db['books']

In [5]:
# Fetch data from MongoDB
data = list(collection.find(limit=10000))  # Retrieve all documents as a list of dictionaries

In [6]:
# Convert to Pandas DataFrame
df = pd.DataFrame(data)

In [7]:
#client.close()

In [8]:
df.head()

Unnamed: 0,_id,isbn,text_reviews_count,series,country_code,language_code,popular_shelves,asin,is_ebook,average_rating,...,publication_month,edition_information,publication_year,url,image_url,book_id,ratings_count,work_id,title,title_without_series
0,66da49047084538b3e00f9c2,312853122.0,1,[],US,,"[{'count': '3', 'name': 'to-read'}, {'count': ...",,False,4.0,...,9.0,,1984.0,https://www.goodreads.com/book/show/5333265-w-...,https://images.gr-assets.com/books/1310220028m...,5333265,3,5400751,W.C. Fields: A Life on Film,W.C. Fields: A Life on Film
1,66da49047084538b3e00f9c3,743509986.0,6,[],US,,"[{'count': '2634', 'name': 'to-read'}, {'count...",,False,3.23,...,10.0,Abridged,2001.0,https://www.goodreads.com/book/show/1333909.Go...,https://s.gr-assets.com/assets/nophoto/book/11...,1333909,10,1323437,Good Harbor,Good Harbor
2,66da49047084538b3e00f9c4,,7,[189911],US,eng,"[{'count': '58', 'name': 'to-read'}, {'count':...",B00071IKUY,False,4.03,...,,Book Club Edition,1987.0,https://www.goodreads.com/book/show/7327624-th...,https://images.gr-assets.com/books/1304100136m...,7327624,140,8948723,"The Unschooled Wizard (Sun Wolf and Starhawk, ...","The Unschooled Wizard (Sun Wolf and Starhawk, ..."
3,66da49047084538b3e00f9c5,743294297.0,3282,[],US,eng,"[{'count': '7615', 'name': 'to-read'}, {'count...",,False,3.49,...,7.0,,2009.0,https://www.goodreads.com/book/show/6066819-be...,https://s.gr-assets.com/assets/nophoto/book/11...,6066819,51184,6243154,Best Friends Forever,Best Friends Forever
4,66da49047084538b3e00f9c6,850308712.0,5,[],US,,"[{'count': '32', 'name': 'to-read'}, {'count':...",,False,3.4,...,,,,https://www.goodreads.com/book/show/287140.Run...,https://images.gr-assets.com/books/1413219371m...,287140,15,278577,Runic Astrology: Starcraft and Timekeeping in ...,Runic Astrology: Starcraft and Timekeeping in ...


In [9]:
df.columns.tolist()

['_id',
 'isbn',
 'text_reviews_count',
 'series',
 'country_code',
 'language_code',
 'popular_shelves',
 'asin',
 'is_ebook',
 'average_rating',
 'kindle_asin',
 'similar_books',
 'description',
 'format',
 'link',
 'authors',
 'publisher',
 'num_pages',
 'publication_day',
 'isbn13',
 'publication_month',
 'edition_information',
 'publication_year',
 'url',
 'image_url',
 'book_id',
 'ratings_count',
 'work_id',
 'title',
 'title_without_series']

## Preprocess Numerical columns 

In [10]:
numerical_columns = [
    'text_reviews_count',
    'average_rating',
    'num_pages',
    'ratings_count'
]

# Ensure numerical columns are present in DataFrame
df_numerical = df[numerical_columns].fillna(0)  # Handle missing values appropriately


In [11]:
# Check data types in df_numerical
print(df_numerical.dtypes)

text_reviews_count    object
average_rating        object
num_pages             object
ratings_count         object
dtype: object


In [12]:
# Check for missing values
print(df_numerical.isnull().sum())

# Example of filling missing values
df_numerical = df_numerical.fillna(0)  # Or use another appropriate value or method

# Or drop rows/columns with missing values
df_numerical = df_numerical.dropna()  # Be cautious with dropping data

text_reviews_count    0
average_rating        0
num_pages             0
ratings_count         0
dtype: int64


In [13]:
from sklearn.preprocessing import LabelEncoder

label_encoders = {}
for column in df_numerical.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df_numerical[column] = le.fit_transform(df_numerical[column])
    label_encoders[column] = le

In [14]:
# Make sure df_numerical contains only the numerical columns
print(df_numerical.head())
print(numerical_columns)

   text_reviews_count  average_rating  num_pages  ratings_count
0                   2             180        199            470
1                 291             103          0              3
2                 312             183        547            157
3                 204             129        324            740
4                 265             120          0            186
['text_reviews_count', 'average_rating', 'num_pages', 'ratings_count']


Standardize the data

In [15]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df_numerical), columns=df_numerical.columns)

## Compute Cosine Similarity:

In [16]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute the cosine similarity matrix
similarity_matrix = cosine_similarity(df_scaled)

# Convert to DataFrame for better readability
similarity_df = pd.DataFrame(similarity_matrix, index=df.index, columns=df.index)
print(similarity_df)

          0         1         2         3         4         5         6     \
0     1.000000 -0.643989 -0.435752 -0.429289 -0.692507 -0.778135 -0.321004   
1    -0.643989  1.000000  0.157865 -0.035964  0.985906  0.147449  0.441960   
2    -0.435752  0.157865  1.000000  0.328696  0.058698  0.182553  0.161707   
3    -0.429289 -0.035964  0.328696  1.000000 -0.026688  0.231253 -0.695457   
4    -0.692507  0.985906  0.058698 -0.026688  1.000000  0.260498  0.453329   
...        ...       ...       ...       ...       ...       ...       ...   
9995  0.922450 -0.567937 -0.252136 -0.120688 -0.650316 -0.894287 -0.576918   
9996  0.270249  0.179390 -0.721570  0.121740  0.209317 -0.468549 -0.524965   
9997 -0.462727  0.015939  0.905079  0.690035 -0.056441  0.224221 -0.216374   
9998  0.588135 -0.137374 -0.407705 -0.980568 -0.146647 -0.326525  0.555646   
9999  0.780413 -0.541500 -0.153470 -0.769286 -0.588830 -0.438032  0.276160   

          7         8         9     ...      9990      9991    

In [20]:
print(df[categorical_columns].dtypes)
print(df[categorical_columns].applymap(type).value_counts())

series          object
country_code    object
dtype: object
series          country_code 
<class 'list'>  <class 'str'>    10000
Name: count, dtype: int64


  print(df[categorical_columns].applymap(type).value_counts())


In [21]:
for col in categorical_columns:
    if df[col].apply(lambda x: isinstance(x, list)).any():
        df[col] = df[col].apply(lambda x: ','.join(x) if isinstance(x, list) else x)

In [22]:
print(df[categorical_columns].applymap(type).nunique())

series          1
country_code    1
dtype: int64


  print(df[categorical_columns].applymap(type).nunique())


## Textual Data

In [23]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(sparse=False, drop='first')
df_categorical = df[categorical_columns].fillna('')  # Ensure no NaNs
categorical_encoded = encoder.fit_transform(df_categorical)

# Convert to DataFrame and concatenate with numerical data
df_categorical_encoded = pd.DataFrame(categorical_encoded, columns=encoder.get_feature_names_out(categorical_columns))



In [24]:
from sklearn.preprocessing import OneHotEncoder

# Example for one-hot encoding categorical data
categorical_columns = ['series', 'country_code']
df_categorical = df[categorical_columns].fillna('')

encoder = OneHotEncoder(sparse=False, drop='first')
categorical_encoded = encoder.fit_transform(df_categorical)

# Convert to DataFrame and concatenate with numerical data
df_categorical_encoded = pd.DataFrame(categorical_encoded, columns=encoder.get_feature_names_out(categorical_columns))
df_combined = pd.concat([df_scaled, df_categorical_encoded], axis=1)

# Compute cosine similarity with combined features
similarity_matrix_combined = cosine_similarity(df_combined)
similarity_df_combined = pd.DataFrame(similarity_matrix_combined, index=df.index, columns=df.index)
print(similarity_df_combined)



          0         1         2         3         4         5         6     \
0     1.000000 -0.240625 -0.095720  0.065649 -0.197987 -0.223557  0.050530   
1    -0.240625  1.000000  0.247746  0.168731  0.983440  0.294594  0.527975   
2    -0.095720  0.247746  1.000000  0.422366  0.192699  0.299507  0.271243   
3     0.065649  0.168731  0.422366  1.000000  0.225966  0.434707 -0.284421   
4    -0.197987  0.983440  0.192699  0.225966  1.000000  0.421294  0.560488   
...        ...       ...       ...       ...       ...       ...       ...   
9995  0.952126 -0.198882  0.022296  0.255487 -0.184331 -0.321472 -0.142919   
9996  0.432296  0.290807 -0.470174  0.301226  0.342857 -0.196231 -0.272263   
9997 -0.096755  0.148772  0.866936  0.740773  0.121929  0.363826 -0.016418   
9998  0.687084  0.042754 -0.182203 -0.496490  0.079609 -0.043553  0.640889   
9999  0.831641 -0.275963  0.033384 -0.309216 -0.251987 -0.109068  0.425063   

          7         8         9     ...      9990      9991    

In [25]:
df['average_rating'] = pd.to_numeric(df['average_rating'], errors='coerce')

In [26]:
true_ratings = df['average_rating'].fillna(0)

In [27]:
predicted_ratings = true_ratings + np.random.normal(0, 0.5, size=len(df))

In [28]:
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(true_ratings, predicted_ratings)

In [29]:
# Calculate Mean Squared Error (MSE)
# Example data for demonstration: true ratings and predicted ratings
# Replace these with actual data from your dataset
true_ratings = df['average_rating'].fillna(0)  # Assuming average_rating is actual rating
predicted_ratings = true_ratings + np.random.normal(0, 0.5, size=len(df))  # Random predictions for example

# Calculate MSE
mse = mean_squared_error(true_ratings, predicted_ratings)
print(f"Mean Squared Error: {mse:.2f}")

# Optional: Display first few actual vs. predicted ratings
rating_comparison = pd.DataFrame({
    'True Rating': true_ratings,
    'Predicted Rating': predicted_ratings
})

print(rating_comparison.head())

Mean Squared Error: 0.25
   True Rating  Predicted Rating
0         4.00          3.845960
1         3.23          2.843702
2         4.03          3.409136
3         3.49          4.063168
4         3.40          3.364105


In [None]:
testing