In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for input_path in (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
):
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Data Science Problem 

Goodreads is a popular online platform where readers can discover new books, read reviews, and connect with other readers. However, with millions of books available on the platform, it can be overwhelming for users to find books that match their preferences. In this project, we aim to build a recommendation system for Goodreads users that suggests books based on their reading history, preferences, and ratings. Our goal is to provide a personalized and intuitive experience for users, helping them discover new books that they are likely to enjoy. Using supervised learning and NLP techniques, we will build a model that can predict the likelihood of a user liking a book based on features such as book title, author, genre, description, user reviews, and book ratings. The model will be trained on a subset of the Goodreads dataset and evaluated based on accuracy, precision, recall, and F1-score. The final output will be a recommendation engine that suggests books to users based on their input and history on the platform.

Data Extracted from UCSD Goodreads data: https://sites.google.com/eng.ucsd.edu/ucsdbookgraph/home

## Objectives

In the preprocessing step, we aim to clean and transform the raw data into a format that is suitable for machine learning models. 
Here are some of the questions we aim to answer during the preprocessing step in this notebook:

Data Cleaning:
- Are there any missing values in the dataset?
- Are there any duplicate entries in the dataset?
- Are there any irrelevant features in the dataset that can be removed?
- Are there any inconsistencies in the data that need to be corrected?

Data Transformation:
- How can we extract relevant features from the dataset, such as book title, author, genre, description, user reviews, and book ratings?
- How can we preprocess the text data to make it suitable for machine learning models, for example through tokenization, stop-word removal, stemming, and lemmatization?
- How can we convert the text data into numerical features that can be used in machine learning models, such as TF-IDF, Bag of Words, or Word2Vec?

Exploratory Data Analysis:
- What is the distribution of ratings in the dataset?
- What are the most popular genres and authors in the dataset?
- Are there any correlations between different features in the dataset?

Data Preparation:
- How can we split the dataset into training and testing sets?
- How can we balance the dataset to handle class imbalance?
- How can we encode categorical variables into numerical variables?

Answering these questions during the preprocessing step is crucial in building an accurate and robust recommendation system that can provide personalized recommendations to Goodreads users.

In [None]:
# Load the bz2-compressed Goodreads CSV from the Kaggle input directory.
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/goodreads-books/full_df.csv.bz2',
    compression='bz2',
)

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Preview the first five rows.
df.head()

In [None]:
# Column labels available in the dataset.
df.columns

In [None]:
# Per-column dtype and non-null count, plus overall memory usage.
df.info()

## Number Of Missing Values By Column

In [None]:
# Summarise missingness per column: absolute count and percentage of rows.
missing = df.isna().pipe(
    lambda na: pd.DataFrame({'count': na.sum(), '%': 100 * na.mean()})
)
# Ascending sort: the columns with the most missing values end up at the bottom.
missing.sort_values('count')

In [None]:
# Look closely at the "language_code" column: which codes occur and how often.
# value_counts() skips NaN, so missing codes are not part of this tally.
df['language_code'].value_counts()


In [None]:
# Number of rows that have no language code at all.
df['language_code'].isna().sum()

In [None]:
# Remove the "language_code" column: about 50% of its values are missing and,
# per the value counts inspected above, the only language represented is
# English, so it provides no insight for the recommender system.
# errors='ignore' keeps this cell idempotent: because `df` is reassigned, a
# second run would otherwise raise KeyError on the already-removed column.
df = df.drop(columns=['language_code'], errors='ignore')

In [None]:
# Remove the "isbn" column (it is not read by any later cell shown in this notebook).
# errors='ignore' keeps this cell idempotent: because `df` is reassigned, a
# second run would otherwise raise KeyError on the already-removed column.
df = df.drop(columns=['isbn'], errors='ignore')

## Categorical Features

In [None]:
# Show only the object-dtype (string-like) columns, i.e. the categorical/text features.
df.select_dtypes('object')

In [None]:
# Remove the "cover_image" column (it is not read by any later cell shown in this notebook).
# errors='ignore' keeps this cell idempotent: because `df` is reassigned, a
# second run would otherwise raise KeyError on the already-removed column.
df = df.drop(columns=['cover_image'], errors='ignore')

In [None]:
# Total number of rows with a non-null book_id (the per-id counts summed back up).
# NOTE(review): this is not the number of distinct ids; if a uniqueness check
# was intended, compare against df['book_id'].nunique() -- confirm intent.
df['book_id'].value_counts().sum()

In [None]:
# The five most frequent author_id values and how many rows each has.
df['author_id'].value_counts().head()

In [None]:
# The five most frequent titles; a count above 1 means the title appears on several rows.
df['title'].value_counts().head()

In [None]:
# Number of distinct non-null values in each of the two columns.
df[['title','authors']].nunique()

In [None]:
# Most common "title, authors" combinations; a count above 1 marks a repeated book.
# Rows where either field is NaN concatenate to NaN and are left out by value_counts().
(df['title'] + ', ' + df['authors']).value_counts().head()

In [None]:
# Rows whose (title, authors) pair already appeared on an earlier row.
df.loc[df.duplicated(subset=['title', 'authors'])]

In [None]:
# How many rows repeat an earlier (title, authors) pair (True) versus first occurrences (False).
df.duplicated(subset=['title','authors']).value_counts()

In [None]:
# Inspect every row recorded for one example of a repeated title.
df.loc[df['title'].eq('Second Glance')]

In [None]:
# Inspect every row recorded for a second example title.
df.loc[df['title'].eq('Three Days and a Life')]

In [None]:
 # Number of distinct values in "authors". Unlike nunique(), unique() keeps NaN
 # as a value, so this can be one higher than the nunique() figure.
 len(df['authors'].unique())


In [None]:
# Number of distinct values in "title". Unlike nunique(), unique() keeps NaN
# as a value, so this can be one higher than the nunique() figure.
len(df['title'].unique())

## Imputing the NaN Values and Removing the Duplicates

In [None]:
from sklearn.impute import SimpleImputer

In [None]:
# Fill every missing value with the most frequent value (mode) of its column.
# NOTE(review): the imputer is applied to all columns, including identifier
# and free-text columns such as title/authors, where a mode fill assigns the
# most common value to unrelated rows -- confirm this is intended.
imputer = SimpleImputer(strategy='most_frequent')
data_imputed = imputer.fit_transform(df)

# The imputer returns a plain NumPy array. For a frame mixing text and numeric
# columns that array is object-typed, so a DataFrame rebuilt from it would have
# every column as `object`. Cast each column back to its original dtype so the
# numeric columns stay numeric for sorting, statistics and modelling.
data_imputed_df = (
    pd.DataFrame(data_imputed, columns=df.columns)
    .astype(df.dtypes.to_dict())
)

# Preview the first five rows of the imputed frame.
data_imputed_df.head()



In [None]:
# Spot-check the imputation on one title inspected earlier ('Second Glance').
# This only shows the rows for that title; it does not verify every column.
# A frame-wide check would be data_imputed_df.isna().sum().
data_imputed_df[data_imputed_df['title']=='Second Glance']

In [None]:
# Order rows from the highest to the lowest ratings_count so that, within each
# group of duplicates, the most-rated row comes first.
data_sorted = data_imputed_df.sort_values(by='ratings_count', ascending=False)

# Keep only the first row seen for every (authors, title) pair, i.e. the one
# with the highest ratings_count; later repeats are filtered out.
data_unique = data_sorted[
    ~data_sorted.duplicated(subset=['authors', 'title'], keep='first')
]

# Show the first 5 rows of the de-duplicated frame.
print(data_unique.head())

In [None]:
# Spot-check the de-duplication on the same example title used before.
data_unique[data_unique['title']=='Second Glance']

# Duplicates are removed on the (authors, title) pair; for each pair only the
# row with the highest ratings_count is kept.

### Genres

In [None]:
# All remaining (de-duplicated) rows for one example author.
data_unique.loc[data_unique['authors'].eq('Jodi Picoult')]


In [None]:
# Persist the cleaned, de-duplicated frame as 'df.csv' in the current working
# directory; index=False leaves the row index out of the file.
data_unique.to_csv('df.csv', index = False)

In [None]:
# Show the working directory, i.e. where the relative path 'df.csv' was written.
# os.getcwd() replaces the `!pwd` shell escape so the cell is plain Python and
# does not depend on a POSIX shell being available.
os.getcwd()