## **1. Import Necessary Libraries**

In [1]:
import pandas as pd

## **2. Explore Data**

In [2]:
# Show full cell text (no column-width truncation) when frames are displayed.
pd.options.display.max_colwidth = None

In [3]:
# Absolute, machine-specific location of the books_summary.csv input;
# this has to be adjusted when the notebook is run on another machine.
RAW_SUMMARIES_CSV = "C:/Users/HP/Downloads/FourMinuteBooksSummaries/books_summary.csv"
df = pd.read_csv(filepath_or_buffer=RAW_SUMMARIES_CSV)

In [4]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,book_name,summaries,categories
0,0,The Highly Sensitive Person,"is a self-assessment guide and how-to-live template for people who feel, relate, process, and notice more deeply than others, and who frequently suffer from overstimulation as a result.",science
1,1,Why Has Nobody Told Me This Before?,"is a collection of a clinical psychologist’s best practical advice to combat anxiety and depression and improve our mental health in small increments, collected from over a decade of 1-on-1 work with patients.",science
2,2,The Midnight Library,"tells the story of Nora, a depressed woman in her 30s, who, on the day she decides to die, finds herself in a library full of lives she could have lived, where she discovers there’s a lot more to life, even her current one, than she had ever imagined.",science
3,3,Brave New World,"presents a futuristic society engineered perfectly around capitalism and scientific efficiency, in which everyone is happy, conform, and content — but only at first glance.",science
4,4,1984,is the story of a man questioning the system that keeps his futuristic but dystopian society afloat and the chaos that quickly ensues once he gives in to his natural curiosity and desire to be free.,science


In [5]:
# (rows, columns) of the frame as loaded, before any cleaning.
df.shape

(5201, 4)

### a. Check for null values

In [6]:
# Per-column count of missing values.
df.isnull().sum()

Unnamed: 0    0
book_name     0
summaries     7
categories    0
dtype: int64

### b. Check for duplicated values

In [7]:
# Tally rows that repeat an earlier (book_name, summaries, categories) triple.
dedup_keys = ['book_name', 'summaries', 'categories']
df.duplicated(subset=dedup_keys).value_counts()

False    4979
True      222
Name: count, dtype: int64

## **3. Preprocess Data**

In [8]:
# Discard the 'Unnamed: 0' column that came in with the CSV.
df = df.drop(columns=['Unnamed: 0'])

In [9]:
# Remove every row that has a missing value in any column.
df = df.dropna()

In [10]:
# Re-check: every column should now report zero missing values.
df.isnull().sum()

book_name     0
summaries     0
categories    0
dtype: int64

In [11]:
# Keep only the first occurrence of each (book_name, summaries, categories) triple.
df = df.drop_duplicates(subset=['book_name', 'summaries', 'categories'])

In [12]:
# Re-check: no row should be flagged as a duplicate any more.
dedup_keys = ['book_name', 'summaries', 'categories']
df.duplicated(subset=dedup_keys).value_counts()

False    4972
Name: count, dtype: int64

In [13]:
# Tally summaries that consist of exactly one space character
# (other whitespace-only values are not matched by this check).
df['summaries'].eq(' ').value_counts()

summaries
False    4964
True        8
Name: count, dtype: int64

In [14]:
# Same single-space check, applied to the category labels.
df['categories'].eq(' ').value_counts()

categories
False    4972
Name: count, dtype: int64

In [15]:
# Keep only the rows whose summary is not the single-space placeholder.
has_summary_text = df['summaries'].ne(' ')
df = df[has_summary_text]

In [16]:
# Re-check: no single-space summaries should remain.
df['summaries'].eq(' ').value_counts()

summaries
False    4964
Name: count, dtype: int64

In [17]:
# Drop the title column, leaving only the summaries and categories columns.
df = df.drop(columns=['book_name'])

In [18]:
# Renumber rows 0..n-1 after the removals above; the old index is discarded.
df = df.reset_index(drop=True)

## **4. Rename Columns and Genres**

- This dataset is used to supplement the `Goodreads` dataset, because the original dataset scraped from Goodreads is `highly imbalanced`.

- We therefore rename the columns and the genres to match the Goodreads dataset.

In [19]:
# Preview the first five rows of the cleaned frame before renaming.
df.head(n=5)

Unnamed: 0,summaries,categories
0,"is a self-assessment guide and how-to-live template for people who feel, relate, process, and notice more deeply than others, and who frequently suffer from overstimulation as a result.",science
1,"is a collection of a clinical psychologist’s best practical advice to combat anxiety and depression and improve our mental health in small increments, collected from over a decade of 1-on-1 work with patients.",science
2,"tells the story of Nora, a depressed woman in her 30s, who, on the day she decides to die, finds herself in a library full of lives she could have lived, where she discovers there’s a lot more to life, even her current one, than she had ever imagined.",science
3,"presents a futuristic society engineered perfectly around capitalism and scientific efficiency, in which everyone is happy, conform, and content — but only at first glance.",science
4,is the story of a man questioning the system that keeps his futuristic but dystopian society afloat and the chaos that quickly ensues once he gives in to his natural curiosity and desire to be free.,science


In [20]:
# Rename the two remaining columns (summaries -> Summary, categories -> Genres).
renamed_columns = {'summaries': 'Summary', 'categories': 'Genres'}
df = df.rename(columns=renamed_columns)

In [21]:
# Dtype of each column after the rename.
df.dtypes

Summary    object
Genres     object
dtype: object

In [22]:
# Number of rows per genre, most frequent first.
df['Genres'].value_counts()

Genres
psychology       582
productivity     474
motivation       423
happiness        413
work             362
business         354
mindfulness      310
relationships    263
communication    227
science          198
creativity       185
management       178
health           176
money            147
politics         117
marketing        116
education        108
technology        97
biography         79
economics         68
environment       48
religion          29
fiction           10
Name: count, dtype: int64

### **a. Remove Unnecessary Genres**

In [23]:
# Genres whose rows are dropped from the frame.
genres_to_remove = [
    'relationships', 'business', 'management', 'productivity', 'fiction',
    'communication', 'work', 'environment', 'education', 'health',
    'creativity', "economics", "marketing", "money", "politics",
    "motivation", "happiness",
]

# Keep the rows whose genre is not in the removal list, then renumber them.
is_kept_genre = ~df['Genres'].isin(genres_to_remove)
df = df[is_kept_genre].reset_index(drop=True)

### **b. Create Genre Mappings**

- We will create genre mappings such that they match the `Goodreads Dataset`

In [24]:
# Target genre label -> source genres folded into it.
_sources_by_target_genre = {
    "Psychology": ["psychology"],
    "Inspirational and Self-Help": ["mindfulness"],
    "Religion and Spirituality": ["religion"],
    "Biographies and Memoirs": ["biography"],
    "Science and Technology": ["technology", "science"],
}

# Flatten into the source -> target lookup used to relabel the genres.
genre_mappings = {
    source: target
    for target, sources in _sources_by_target_genre.items()
    for source in sources
}

In [25]:
# Add a GroupedGenres column: each genre is replaced by its mapped label;
# values without an entry in genre_mappings pass through unchanged.
df = df.assign(GroupedGenres=df['Genres'].replace(genre_mappings))

In [26]:
# The original genre column is no longer needed once GroupedGenres exists.
df = df.drop(columns=['Genres'])

In [27]:
# Number of rows per grouped genre, most frequent first.
df['GroupedGenres'].value_counts()

GroupedGenres
Psychology                     582
Inspirational and Self-Help    310
Science and Technology         295
Biographies and Memoirs         79
Religion and Spirituality       29
Name: count, dtype: int64

In [28]:
# (rows, columns) of the frame after filtering and regrouping the genres.
df.shape

(1295, 2)

In [29]:
# Preview the first five rows of the final Summary / GroupedGenres frame.
df.head(n=5)

Unnamed: 0,Summary,GroupedGenres
0,"is a self-assessment guide and how-to-live template for people who feel, relate, process, and notice more deeply than others, and who frequently suffer from overstimulation as a result.",Science and Technology
1,"is a collection of a clinical psychologist’s best practical advice to combat anxiety and depression and improve our mental health in small increments, collected from over a decade of 1-on-1 work with patients.",Science and Technology
2,"tells the story of Nora, a depressed woman in her 30s, who, on the day she decides to die, finds herself in a library full of lives she could have lived, where she discovers there’s a lot more to life, even her current one, than she had ever imagined.",Science and Technology
3,"presents a futuristic society engineered perfectly around capitalism and scientific efficiency, in which everyone is happy, conform, and content — but only at first glance.",Science and Technology
4,is the story of a man questioning the system that keeps his futuristic but dystopian society afloat and the chaos that quickly ensues once he gives in to his natural curiosity and desire to be free.,Science and Technology


In [32]:
# Persist the cleaned Summary / GroupedGenres frame.
# index=False keeps the row index out of the file: writing it would add an
# unnamed leading column that pandas loads back as "Unnamed: 0", which every
# reader of the file would then have to drop again.
df.to_csv("C:/Users/HP/Downloads/FourMinuteBookSummaries.csv", index=False)