In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.metrics as metrics
import numpy as np
import seaborn as sns
import math
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.cluster import KMeans, AgglomerativeClustering
from mpl_toolkits.mplot3d import Axes3D
from sklearn.metrics import silhouette_score

**Reading and preprocessing data**

"ISBN";"Book-Title";"Book-Author";"Year-Of-Publication";"Publisher";"Image-URL-S";"Image-URL-M";"Image-URL-L" - *Books.csv*

"User-ID";"Location";"Age" - *Users.csv*

"User-ID";"ISBN";"Book-Rating" - *Ratings.csv*

# Load the three Book-Crossing tables (semicolon-separated, latin-1 encoded)
# and give their columns code-friendly names.
# Malformed rows are skipped with a warning instead of aborting the load.
# `on_bad_lines='warn'` replaces `error_bad_lines=False`, which was deprecated
# in pandas 1.3 and removed in pandas 2.0.
# NOTE(review): the books file name uses an underscore while the other two
# use hyphens - confirm this matches the file on disk.
books = pd.read_csv('BX_Books.csv', sep=';', on_bad_lines='warn', encoding="latin-1")
books.columns = ['ISBN', 'bookTitle', 'bookAuthor', 'yearOfPublication', 'publisher', 'imageUrlS', 'imageUrlM', 'imageUrlL']
users = pd.read_csv('BX-Users.csv', sep=';', on_bad_lines='warn', encoding="latin-1")
users.columns = ['userID', 'Location', 'Age']
ratings = pd.read_csv('BX-Book-Ratings.csv', sep=';', on_bad_lines='warn', encoding="latin-1")
ratings.columns = ['userID', 'ISBN', 'bookRating']

# Print the (rows, columns) shape of each loaded table.
print(books.shape)
print(users.shape)
print(ratings.shape)

# First look at the books table.
books.head()

# The three image-URL columns are not needed for the analysis; remove them in place.
books.drop(columns=['imageUrlS', 'imageUrlM', 'imageUrlL'], inplace=True)

# Books table after dropping the URL columns.
books.head()

# Column dtypes of the books table.
books.dtypes

# Do not truncate long text when pandas displays a column.
pd.set_option('display.max_colwidth', None)

books.head()

# Clean yearOfPublication so that it can be stored as an integer column.

# Inspect the raw values first.
books.yearOfPublication.unique()

# Convert to numeric; anything that cannot be parsed becomes NaN.
books['yearOfPublication'] = pd.to_numeric(books['yearOfPublication'], errors='coerce')

print(sorted(books['yearOfPublication'].unique()))

# Year 0 and years after 2020 (the original author's cutoff) are treated as
# invalid and blanked out. `mask` is used instead of assigning `np.NAN` through
# `.loc`: the `np.NAN` alias was removed in NumPy 2.0, and `mask` upcasts an
# integer column to float cleanly when NaNs are introduced.
invalid_year = (books['yearOfPublication'] > 2020) | (books['yearOfPublication'] == 0)
books['yearOfPublication'] = books['yearOfPublication'].mask(invalid_year)

# Replace missing years with the rounded mean of the valid years. The result is
# assigned back explicitly; `fillna(inplace=True)` on an attribute-accessed
# column is chained assignment and is not guaranteed to update `books`.
mean_year = round(books['yearOfPublication'].mean())
books['yearOfPublication'] = books['yearOfPublication'].fillna(mean_year)

# Re-check: number of remaining NaNs.
books['yearOfPublication'].isnull().sum()

# Store the year as a 32-bit integer.
books['yearOfPublication'] = books['yearOfPublication'].astype(np.int32)

# Rows whose publisher is missing (the original author reported two such rows).
books[books['publisher'].isnull()]

# Look for other rows that could hint at the missing publishers.
# Same title as the first affected book:
books[books['bookTitle'] == 'Tyrant Moon']

# Same title as the second affected book
# (original author's observation: publishers and authors all differ):
books[books['bookTitle'] == 'Finders Keepers']

# Other books by the same authors
# (original author's observation: no common publisher to infer from):
books[books['bookAuthor'] == 'Elaine Corvidae']

books[books['bookAuthor'] == 'Linnea Sinclair']

# Nothing to infer the publisher from, so the two affected ISBNs get the
# placeholder publisher 'other'.
books.loc[books['ISBN'].isin(['193169656X', '1931696993']), 'publisher'] = 'other'

# Label-encode title, author and publisher into integer columns named
# '<column>Coded'. Values are cast with the builtin `str`; the `np.str` alias
# was removed in NumPy 1.24 and raises AttributeError there.
# `label_encoder` is refitted for every column, so after this block it holds
# the publisher classes only.
label_encoder = LabelEncoder()
for text_column in ('bookTitle', 'bookAuthor', 'publisher'):
    books[text_column + 'Coded'] = label_encoder.fit_transform(books[text_column].astype(str)).astype(np.int64)
books.head()

# Histograms of the numeric columns of the books table.
books.hist(bins = 50, figsize = (15,10))
plt.show()

# Correlation heatmap of the numeric book columns.
# `books` still contains string columns (ISBN, title, author, publisher);
# pandas 2.x raises on them in `DataFrame.corr()` instead of silently dropping
# them, so the numeric columns are selected explicitly first.
fig, ax = plt.subplots(figsize=(8, 8))
corr = books.select_dtypes(include='number').corr()
ax = sns.heatmap(corr, annot = True, cmap='Blues')

# First look at the users table: shape, sample rows, dtypes and the ID values.
print(users.shape)
users.head()

users.dtypes

users.userID.values

print(sorted(users.Age.unique()))

# Ages outside the 5-90 range are treated as invalid for book ratings:
# keep only values within [5, 90]; everything else (including existing NaNs)
# ends up as NaN.
users['Age'] = users['Age'].where(users['Age'].between(5, 90))

# Fill the gaps with the mean of the remaining ages, then store as a 32-bit
# integer (the cast truncates the fractional part of the mean).
users['Age'] = users['Age'].fillna(users['Age'].mean()).astype(np.int32)

# Re-check the distinct ages after cleaning.
print(sorted(users.Age.unique()))

# Distribution of the cleaned ages.
users['Age'].hist(bins = 50, figsize = (15,10))
plt.show()

# Shape of the raw ratings table.
ratings.shape

# A fully dense ratings table would hold one row per (user, book) pair;
# printing that product shows how sparse the real data is by comparison.
n_users = len(users)
n_books = len(books)
print(n_users * n_books)

# Sample rows and the distinct rating values.
ratings.head(5)

ratings.bookRating.unique()

# Keep only ratings whose ISBN is present in the books table.
known_isbn = ratings['ISBN'].isin(books['ISBN'])
ratings_new = ratings[known_isbn]

# Before / after the ISBN filter.
print(ratings.shape)
print(ratings_new.shape)

# Restrict `ratings` itself to users present in the users table. Note that
# this filter is applied to `ratings`, not to `ratings_new`; it serves as a
# check that no unknown users occur.
known_user = ratings['userID'].isin(users['userID'])
ratings = ratings[known_user]

print(ratings.shape)
print(ratings_new.shape)
# The analysis continues with `ratings_new`.

print(f"number of users: {n_users}")
print(f"number of books: {n_books}")

# Sparsity in percent: share of (user, book) pairs that have no rating.
sparsity = 1.0 - len(ratings_new) / float(n_users * n_books)
print(f'The sparsity level of Book Crossing dataset is {sparsity * 100} %')

# Per the original author: ratings are either explicit (scale 1-10, higher is
# better) or implicit (recorded as 0).
ratings.bookRating.unique()

# Split the ISBN-filtered ratings into implicit (== 0) and explicit (everything else).
is_implicit = ratings_new['bookRating'] == 0
ratings_implicit = ratings_new[is_implicit]
ratings_explicit = ratings_new[~is_implicit]

# Shapes of the full, explicit and implicit tables.
print(ratings_new.shape)
print(ratings_explicit.shape)
print(ratings_implicit.shape)

# Bar chart: number of explicit ratings per rating value.
sns.countplot(x='bookRating', data=ratings_explicit)
plt.show()

# Simple popularity-based recommendation: add up the explicit ratings per ISBN
# (a sum of rating values, not a count of ratings) and list the ten books with
# the largest totals.
rating_sum_per_isbn = ratings_explicit.groupby('ISBN')['bookRating'].sum()
ratings_count = rating_sum_per_isbn.to_frame()
ranked = ratings_count.sort_values('bookRating', ascending=False)
top10 = ranked.head(10)
print("Following books are recommended")
# Attach the book details to the top ten via their ISBN index.
top10.merge(books, left_index=True, right_on='ISBN')

# Split the users the same way: those with at least one explicit rating and
# those with at least one implicit rating (a user can appear in both).
has_explicit = users['userID'].isin(ratings_explicit['userID'])
has_implicit = users['userID'].isin(ratings_implicit['userID'])
users_exp_ratings = users[has_explicit]
users_imp_ratings = users[has_implicit]

# Shapes of all users, explicit raters and implicit raters.
print(users.shape)
print(users_exp_ratings.shape)
print(users_imp_ratings.shape)

# Shrink the explicit ratings to keep the computation manageable.
# Step 1: keep only users with at least 100 explicit ratings.
counts1 = ratings_explicit['userID'].value_counts()
frequent_users = counts1[counts1 >= 100].index
ratings_explicit = ratings_explicit[ratings_explicit['userID'].isin(frequent_users)]
# Step 2: keep only rating VALUES that occur at least 100 times.
# NOTE(review): the original comment describes this step as "books which have
# at least 100 ratings", but the counts are taken over 'bookRating' (the
# rating value), not over 'ISBN'. Confirm whether a per-ISBN filter was intended.
counts = ratings_explicit['bookRating'].value_counts()
frequent_values = counts[counts >= 100].index
ratings_explicit = ratings_explicit[ratings_explicit['bookRating'].isin(frequent_values)]

# Join the per-ISBN rating totals with the book attributes, then remove the
# text / identifier columns in place so that only the remaining columns stay.
allData = ratings_count.merge(books, left_index=True, right_on='ISBN')
allData.drop(columns=['bookTitle', 'bookAuthor', 'publisher', 'ISBN'], inplace=True)
allData.head()

# Build the modelling table: per-ISBN rating totals + book attributes +
# the reduced explicit ratings + the users who gave explicit ratings.
df = ratings_count.merge(books, left_index = True, right_on = 'ISBN')
df = df.merge(ratings_explicit, how='inner', on='ISBN')
df = df.merge(users_exp_ratings, how='inner', on='userID')

# Encode the ISBN as an integer. The builtin `str` is used for the cast
# because the `np.str` alias was removed in NumPy 1.24.
df['ISBNCoded'] = label_encoder.fit_transform(df["ISBN"].astype(str)).astype(np.int64)
# After the merges, 'bookRating_x' comes from `ratings_count` (the per-ISBN
# summed rating) and 'bookRating_y' from `ratings_explicit` (the individual
# rating). The summed rating is the one kept as 'bookRating'.
df['bookRating'] = df['bookRating_x']
# Drop text columns, identifiers and the two suffixed rating columns.
df.drop(['bookTitle', 'bookAuthor', 'publisher', 'bookRating_x', 'bookRating_y', 'Location', 'ISBN', 'userID'],axis=1,inplace=True)

df.head()

df.dtypes

# Correlation heatmap of the modelling table.
fig, ax = plt.subplots(figsize=(14, 14))
corr = df.corr()
ax = sns.heatmap(data=corr, cmap='Blues', annot=True)

# Rescale every column with a min-max scaler, keeping the original column
# names and row index on the result.
min_max_scaler = MinMaxScaler()
scaled_values = min_max_scaler.fit_transform(df)
df_scaled = pd.DataFrame(scaled_values, index=df.index, columns=df.columns)

**K-Means**

# Elbow method: fit K-Means for k = 1..7 and record the inertia of each fit.
K = range(1, 8)
distortions = []
for k in K:
    kmeanModel = KMeans(n_clusters=k)
    distortions.append(kmeanModel.fit(df_scaled).inertia_)

# Plot inertia against k; the bend in the curve suggests the number of clusters.
plt.figure(figsize=(16, 8))
plt.plot(K, distortions, 'bx-')
plt.title('The Elbow Method showing the optimal number of clusters')
plt.xlabel('Number of clusters')
plt.ylabel('Distortion')
plt.show()

# Fit the final K-Means model with 3 clusters (presumably the value read off
# the elbow plot - confirm) and keep the labels and centres.
# The `n_jobs` argument was deprecated in scikit-learn 0.23 and removed in 1.0,
# where passing it raises TypeError, so it is no longer supplied.
kmeanModel = KMeans(n_clusters = 3)
kmeanModel.fit(df_scaled)

# Cluster label for every row of the scaled table, and the cluster centres.
clust_labels = kmeanModel.predict(df_scaled)
centers = kmeanModel.cluster_centers_

# 2-D scatter plots of the scaled book rating against Age, title code and
# author code, coloured by K-Means cluster.
# Fixes relative to the earlier version:
#  * the plot labelled 'Age' plotted 'bookAuthorCoded' (duplicating the
#    'Author' plot); it now plots the 'Age' column it is labelled with;
#  * the cluster labels computed once in `clust_labels` are reused instead of
#    re-running `predict` for every figure.
fig = plt.figure()
ax = fig.add_subplot()
scatter = ax.scatter(df_scaled['bookRating'], df_scaled['Age'], c=clust_labels)
ax.set_title('K-Means Clustering')
ax.set_xlabel('Book Rating' )
ax.set_ylabel('Age')
ax.grid(True)

fig = plt.figure()
ax = fig.add_subplot()
scatter = ax.scatter(df_scaled['bookRating'], df_scaled['bookTitleCoded'], c=clust_labels)
ax.set_title('K-Means Clustering')
ax.set_xlabel('Book Rating')
ax.set_ylabel('Title')
ax.grid(True)

fig = plt.figure()
ax = fig.add_subplot()
scatter = ax.scatter(df_scaled['bookRating'], df_scaled['bookAuthorCoded'], c=clust_labels)
ax.set_title('K-Means Clustering')
ax.set_xlabel('Book Rating')
ax.set_ylabel('Author')
ax.grid(True)

# 3-D scatter of title code, age and book rating, coloured by K-Means cluster.
# The axes are created with `add_subplot(projection='3d')`: constructing
# `Axes3D(fig)` directly no longer attaches the axes to the figure on current
# matplotlib releases, which leaves the figure blank.
# The axis labels now name the columns that are actually plotted (they
# previously read 'Book Author' / 'Publisher').
fig = plt.figure(figsize=(10,8))
ax = fig.add_subplot(projection='3d')
ax.scatter(df_scaled['bookTitleCoded'], df_scaled['Age'], df_scaled['bookRating'], c=clust_labels)
ax.set_title('K-Means Clustering')
ax.set_xlabel('Book Title')
ax.set_ylabel('Age')
ax.set_zlabel('Book Rating')