In [None]:
import re
import csv
import json

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import Markdown

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch every NLTK resource the notebook relies on so it runs on a fresh
# environment: stopwords.words("english") needs the "stopwords" corpus and
# WordNetLemmatizer needs "wordnet". "punkt" is kept from the original cell.
for nltk_resource in ("punkt", "stopwords", "wordnet"):
    nltk.download(nltk_resource)

## Load data

Link to data: https://www.kaggle.com/datasets/ymaricar/cmu-book-summary-dataset/data

In [None]:
# Raw book-summary dump; it is read below with the csv "excel-tab" dialect.
DATA_PATH = "../data/booksummaries.txt"
# Destination of the processed frame written by the final cell of the notebook.
PROCESSED_DATA_PATH = "../data/processed_data.csv"

In [None]:
# Read the raw tab-separated dump into a list of rows.
# - encoding is pinned so the result does not depend on the platform default
#   (assumes the dump is UTF-8 encoded -- TODO confirm against the download).
# - newline="" is what the csv module asks for, so line endings inside quoted
#   fields are interpreted by the reader rather than by the file object.
with open(DATA_PATH, 'r', encoding="utf-8", newline="") as f:
    data = list(csv.reader(f, dialect='excel-tab'))

# The file is loaded without a header row, so column names are supplied here.
columns = ['book_id', 'freebase_id', 'book_title', 'author', 'publication_date', 'genre', 'summary']
# Only the first 2000 records are kept for the rest of the notebook.
df = pd.DataFrame.from_records(data, columns=columns)[:2000]
df.head()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Number of distinct values in each column.
df.nunique()

## Process data

### Columns with IDs

The IDs will not be used to build the book recommendation system, so they can be dropped from the dataset.

In [None]:
# Reassign instead of mutating in place, and ignore columns that are already
# gone, so re-running this cell does not raise KeyError.
df = df.drop(columns=['book_id', 'freebase_id'], errors="ignore")

In [None]:
# Preview the first rows after the ID columns were dropped.
df.head()

### Book title

Some titles occur more than once.

In [None]:
# Count how often each title appears and show only the repeated ones.
title_counts = df["book_title"].value_counts()
title_counts[title_counts > 1]

These are fairly common, short titles. We need to check whether any pairs with the same author and title are duplicated.

In [None]:
# Count occurrences of every (title, author) pair and show the repeated ones.
pair_counts = df.loc[:, ["book_title", "author"]].value_counts()
pair_counts[pair_counts > 1]

There are 8 cases where a book with the same title and author appears more than once in the dataset. The duplicated rows are dropped, keeping the first occurrence.

In [None]:
# Keep only the first row of every (title, author) pair, then show the new size.
df = df.drop_duplicates(subset=["book_title", "author"], keep="first")
df.shape

### Author

In [None]:
# Empty author strings become NaN so that pandas treats them as missing.
df["author"] = df["author"].replace('', np.nan)
missing_authors = df["author"].isna().sum()
Markdown(f"There are {missing_authors} books without the author in the dataset.")

The most popular authors

In [None]:
# Horizontal bar chart of the ten authors with the most rows in the frame.
top_authors = df["author"].value_counts().head(10)
top_authors.plot.barh(
    title="TOP 10 most popular authors in the dataset",
    xlabel="Number of books",
)
plt.show()

### Publication date

In [None]:
def get_year_from_date(dates):
    """Extract the publication year from raw date values.

    The year is taken as the text before the first "-", so "1999",
    "1999-05" and "1999-05-17" all yield 1999.

    Args:
        dates: Iterable of date values. Missing entries may be given as an
            empty or whitespace-only string, None, or a float NaN.

    Returns:
        A list with one item per input, in the same order: an int year for
        parsable entries and np.nan for missing ones.

    Raises:
        ValueError: If a non-missing entry does not start with an integer year.
    """
    years = []
    for date in dates:
        # None and float NaN are treated as missing; NaN is the only float
        # that compares unequal to itself.
        if date is None or (isinstance(date, float) and date != date):
            years.append(np.nan)
            continue
        date_text = str(date).strip()
        if not date_text:
            years.append(np.nan)
        else:
            # A bare "YYYY" has no "-", so split() returns it unchanged; one
            # branch therefore covers both the year-only and full-date forms.
            years.append(int(date_text.split("-")[0]))
    return years

In [None]:
# Derive a year column from the raw publication_date strings.
raw_dates = df["publication_date"].to_list()
df["publication_year"] = get_year_from_date(raw_dates)

In [None]:
# Summary statistics of the known publication years.
known_years = df["publication_year"].dropna()
known_years.describe()

In [None]:
# Box plot of publication years after 1900 (title typo "ditribution" fixed).
years_after_1900 = df.query("publication_year > 1900").publication_year.dropna()
years_after_1900.plot.box(title="Publication year distribution after 1900")
plt.show()

### Genre

In [None]:
def parse_genre_entry(genre_info):
    """Turn one raw genre cell into a list of genre names.

    Args:
        genre_info: A JSON object encoded as a string whose values are the
            genre names. Also accepted: an empty or whitespace-only string,
            None or float NaN (all mean "no genres"), and a list that was
            already parsed.

    Returns:
        A list with the values of the JSON object, an empty list for missing
        input, or a copy of the list that was passed in.

    Raises:
        json.JSONDecodeError: If a non-empty string is not valid JSON.
    """
    # Pass already-parsed values through so that applying this function to
    # the same column twice (e.g. re-running the cell) does not fail.
    if isinstance(genre_info, list):
        return list(genre_info)
    # None and float NaN are treated as "no genres"; NaN is the only float
    # that compares unequal to itself.
    if genre_info is None or (isinstance(genre_info, float) and genre_info != genre_info):
        return []
    if isinstance(genre_info, str) and genre_info.strip() == '':
        return []
    genre_dict = json.loads(genre_info)
    return list(genre_dict.values())

In [None]:
# Replace the raw genre strings with lists of genre names.
genre_lists = df["genre"].apply(parse_genre_entry)
df["genre"] = genre_lists

In [None]:
# One row per (book, genre), then a bar chart of the 20 most frequent genres.
genre_counts = df["genre"].explode().dropna().value_counts()
genre_counts.head(20).plot.barh(xlabel="Number of titles")
plt.show()

In [None]:
# Number of distinct genre names across all books.
n_genres = df["genre"].explode().nunique()
Markdown(f"There are {n_genres} unique genre.")

### Summary

In [None]:
# The rows were read with csv.reader, so a missing summary shows up as an
# empty string rather than NaN; check for both, as is done for the author,
# publication date and genre columns.
summary_missing = df.summary.isna() | (df.summary.str.strip() == "")
bool(summary_missing.any())

In [None]:
# Box plot of summary lengths measured in characters.
summary_lengths = df["summary"].apply(len)
summary_lengths.plot.box()
plt.show()

In [None]:
# Number of "."-separated pieces per summary: splitting on "." yields one
# more piece than there are dots.
piece_counts = df["summary"].apply(lambda summary_text: summary_text.count(".") + 1)
piece_counts.describe()

In [None]:
def clean_data(list_of_texts: list[str]) -> list[str]:
    """Normalise raw texts into lowercase, lemmatised, stopword-free strings.

    Each text is lowercased and stripped, every character that is not an
    ASCII letter is replaced by a space, English stopwords are removed, and
    the remaining tokens are lemmatised with WordNetLemmatizer (called with
    its default arguments) and joined with single spaces.

    Args:
        list_of_texts: Texts to clean.

    Returns:
        A list with one cleaned string per input text, in the same order.
    """
    # A set gives constant-time membership tests; the stopword list would
    # otherwise be scanned once for every token of every text.
    stopword_set = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()
    non_letters = re.compile('[^a-zA-Z]')

    clean_text = []
    for text in list_of_texts:
        lowered = text.lower().strip()
        # Digits, punctuation and any other non-letter become token separators.
        letters_only = non_letters.sub(' ', lowered)
        # str.split() with no argument already collapses runs of whitespace,
        # so no separate space-squeezing pass is needed.
        tokens = (
            lemmatizer.lemmatize(token)
            for token in letters_only.split()
            if token not in stopword_set
        )
        clean_text.append(' '.join(tokens))
    return clean_text

In [None]:
# Add the cleaned version of every summary as a new column and preview it.
raw_summaries = df["summary"].to_list()
df["clean_summary"] = clean_data(raw_summaries)
df.head()

## Save processed dataset

In [None]:
# Write the processed frame as a semicolon-separated file without the index.
df.to_csv(
    PROCESSED_DATA_PATH,
    index=False,
    sep=";",
)