In [1]:
import pandas as pd
import numpy as np
import os
from datasets import load_dataset
from tqdm.auto import tqdm
from fuzzywuzzy import fuzz
from datetime import datetime
import re
tqdm.pandas()

The plan is to take the user data from [3], the book details from [2], and the genres and authors from [1].

# User

In [2]:
# Load the cleaned user table (source [3]) and print its columns, dtypes and
# non-null counts.
users = pd.read_csv("3_clean/Users.csv")
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278858 entries, 0 to 278857
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   user_id   278858 non-null  int64  
 1   location  278858 non-null  object 
 2   age       168096 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 6.4+ MB


In [3]:
users.head()

Unnamed: 0,user_id,location,age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


# Book Details

## How to Merge Books?

One way is to use ISBNs. The problem with this is that the same book (by the same author) can have different ISBNs depending on the publisher. Another approach is to match on the book title together with the author name, although this introduces some problems of its own.

The steps are:

1. Using title and author to merge
2. Separate the title, series name and book number
3. Get the author names from author id
4. Do preprocessing to both author name and title
5. Use fuzzy matching with a very high threshold

We will start by decoding the author IDs into names. If a book has more than one author, the names are joined with a colon followed by a space (`: `).

In [4]:
# Load the book catalogue (source [1]) from parquet; the file is registered as
# the "train" split of the returned dataset dictionary.
books = load_dataset("parquet", data_files={"train": os.path.join("1_clean", "dataset.parquet")})

In [5]:
books["train"][0:5]

{'authors': ['[1]', '[2, 3]', '[4]', '[5, 6, 7, 8]', '[9]'],
 'categories': ['[214, 220, 237, 2646, 2647, 2659, 2660, 2679]',
  '[235, 3386]',
  '[358, 2630, 360, 2632]',
  '[377, 2978, 2980]',
  '[2813, 2980]'],
 'format': [1.0, 1.0, 1.0, 1.0, 2.0],
 'isbn10': ['184018907X', '184454737X', '8416327866', '571308996', '8352518'],
 'lang': ['en', 'en', 'es', 'en', 'en'],
 'publication-date': ['2004-10-14 00:00:00',
  '2009-03-13 00:00:00',
  '2020-04-30 00:00:00',
  '2015-10-01 00:00:00',
  '2019-06-18 00:00:00'],
 'title': ['Soldier Five : The Real Truth About The Bravo Two Zero Mission',
  'Underbelly : The Gangland War',
  'A Sir Phillip, Con Amor',
  'QI: The Third Book of General Ignorance',
  'The Hidden Power of F*cking Up']}

## Get Author Name

In [6]:
# Author lookup table for source [1]: maps each author_id to an author_name.
authors = pd.read_csv('1_clean/authors.csv')
authors.head()

Unnamed: 0,author_id,author_name
0,1,Mike Coburn
1,2,John Silvester
2,3,Andrew Rule
3,4,Julia Quinn
4,5,Andrew Hunter Murray


In [7]:
# Replace missing values with the placeholder 'Unknown' (modifies `authors` in
# place). Presumably this is so every author_name is a string before the names
# are joined into the `authors_` column -- confirm.
authors.fillna('Unknown', inplace=True)

In [8]:
# A class is used so the mapping can run in parallel (num_proc > 1): the bound
# method carries its lookup data with it, whereas a plain function that relies on
# a notebook-level variable fails with an error like "authors is not defined".
class AddAuthor:
    """Add a readable ``authors_`` column to a book dataset.

    Each row's ``authors`` field is a string holding a list of author ids
    (for example ``"[2, 3]"``). The ids are translated to names and joined
    with ``": "``.

    Parameters
    ----------
    books : dataset object exposing ``.map`` (the notebook passes the loaded
        book dataset).
    authors : pandas.DataFrame with ``author_id`` and ``author_name`` columns.
    """

    def __init__(self, books, authors):
        self.authors = authors
        self.books = books
        # Resolve names by author_id rather than by row position, so the result
        # does not depend on the table being sorted and gap-free, and so an id
        # of 0 cannot silently wrap around to the last row.
        self._name_by_id = dict(
            zip(authors["author_id"].tolist(), authors["author_name"].tolist())
        )

    def _decode_author(self, row):
        """Return ``row`` with ``authors_`` set to the joined author names.

        Raises
        ------
        ValueError / SyntaxError
            If ``row["authors"]`` is not a plain Python literal.
        KeyError
            If an id is not present in the authors table.
        """
        # Imported here so the method is self-contained when it runs in a
        # worker process.
        import ast

        # literal_eval only accepts literals, so the data string can never
        # execute arbitrary code the way eval() could.
        ids = ast.literal_eval(row["authors"])
        names = []
        for id_ in ids:
            if id_ not in self._name_by_id:
                raise KeyError(f"author id {id_!r} not found in authors table")
            names.append(self._name_by_id[id_])
        row["authors_"] = ": ".join(names)
        return row

    def add_author(self):
        """Map ``_decode_author`` over the dataset, store and return the result.

        NOTE(review): ``batch_size`` is passed without ``batched=True``; it
        presumably has no effect in that mode -- confirm against the
        ``datasets`` documentation.
        """
        self.books = self.books.map(self._decode_author, batch_size=10000, num_proc=8)
        return self.books

In [9]:
# Standalone version of the decoding logic in AddAuthor._decode_author. It reads
# the notebook-level `authors` table, so it only works where that name is visible.
def _decode_author(row):
    """Return ``row`` with ``authors_`` set to the ``": "``-joined author names.

    ``row["authors"]`` is a string holding a list of author ids such as
    ``"[2, 3]"``. Each id is looked up by position (``id - 1``) in the global
    ``authors`` table.

    NOTE(review): the positional lookup assumes ``authors`` is ordered by
    ``author_id`` starting at 1 with no gaps -- confirm.
    """
    # Imported locally so the function does not depend on a top-level import.
    import ast

    # literal_eval parses the list without executing arbitrary code, unlike eval().
    ids = ast.literal_eval(row["authors"])
    authors_ = [authors.iloc[id_ - 1]["author_name"] for id_ in ids]
    row["authors_"] = ": ".join(authors_)
    return row

In [10]:
# Add the decoded `authors_` column; `books` is rebound to the mapped dataset.
add_author = AddAuthor(books, authors)
books = add_author.add_author()

In [11]:
books["train"][:5]

{'authors': ['[1]', '[2, 3]', '[4]', '[5, 6, 7, 8]', '[9]'],
 'categories': ['[214, 220, 237, 2646, 2647, 2659, 2660, 2679]',
  '[235, 3386]',
  '[358, 2630, 360, 2632]',
  '[377, 2978, 2980]',
  '[2813, 2980]'],
 'format': [1.0, 1.0, 1.0, 1.0, 2.0],
 'isbn10': ['184018907X', '184454737X', '8416327866', '571308996', '8352518'],
 'lang': ['en', 'en', 'es', 'en', 'en'],
 'publication-date': ['2004-10-14 00:00:00',
  '2009-03-13 00:00:00',
  '2020-04-30 00:00:00',
  '2015-10-01 00:00:00',
  '2019-06-18 00:00:00'],
 'title': ['Soldier Five : The Real Truth About The Bravo Two Zero Mission',
  'Underbelly : The Gangland War',
  'A Sir Phillip, Con Amor',
  'QI: The Third Book of General Ignorance',
  'The Hidden Power of F*cking Up'],
 'authors_': ['Mike Coburn',
  'John Silvester: Andrew Rule',
  'Julia Quinn',
  'Andrew Hunter Murray: James Harkin: John Lloyd: John Mitchinson',
  'The Try Guys']}

We have authors now. Let's load the other book data from [2] and have a look at the first 5 items:

In [12]:
# Load the first CSV of source [2] and preview its first five rows.
books2 = load_dataset("csv", data_files="2_clean/book1-100k.csv", delimiter=",")
books2["train"][:5]

{'id': [1, 2, 3, 4, 5],
 'title': ['Harry Potter and the Half-Blood Prince (Harry Potter, #6)',
  'Harry Potter and the Order of the Phoenix (Harry Potter, #5)',
  "Harry Potter and the Sorcerer's Stone (Harry Potter, #1)",
  'Harry Potter and the Chamber of Secrets (Harry Potter, #2)',
  'Harry Potter and the Prisoner of Azkaban (Harry Potter, #3)'],
 'authors': ['J.K. Rowling',
  'J.K. Rowling',
  'J.K. Rowling',
  'J.K. Rowling',
  'J.K. Rowling'],
 'pages': [652, 870, 309, 352, 435],
 'language': ['eng', 'eng', 'eng', 'eng', 'eng'],
 'star_1': [9896, 12455, 108202, 11896, 10128],
 'star_2': [25317, 37005, 130310, 49353, 24849],
 'star_3': [159960, 211781, 567458, 288821, 194848],
 'star_4': [556485, 604283, 1513191, 706082, 630534],
 'star_5': [1546466, 1493113, 4268227, 1504505, 1749958],
 'rating': [4.57, 4.5, 4.47, 4.42, 4.57],
 'total_ratings': [2298124, 2358637, 6587388, 2560657, 2610317],
 'total_reviews': [28062, 29770, 75911, 244, 37093],
 'isbn': [None, '0439358078', None,

## Get Book Name and Series Name

The first five books are from the Harry Potter series. Let's see what details are available for these books in dataset [1]:

In [13]:
# Keep the rows of [1] whose decoded author string contains "J.K. Rowling"
# (substring test, so co-authored books are included) and preview five of them.
JK = books["train"].filter(lambda row: "J.K. Rowling" in row["authors_"])
JK[:5]

{'authors': ['[191489, 191490]',
  '[191489]',
  '[191489]',
  '[191489]',
  '[191489]'],
 'categories': ['[218, 222, 292, 821, 3366, 2804, 2812, 2813]',
  '[355, 2629, 2496]',
  '[355, 2629, 2496]',
  '[355, 2629, 2496]',
  '[355, 2629, 2496]'],
 'format': [2.0, 3.0, 3.0, 3.0, 3.0],
 'isbn10': ['1408706784',
  '1408824094',
  '1408824132',
  '1408821583',
  '1408821516'],
 'lang': ['en', 'en', 'en', 'en', 'en'],
 'publication-date': ['2015-04-14 00:00:00',
  '2011-04-04 00:00:00',
  '2011-05-03 00:00:00',
  '2011-05-03 00:00:00',
  '2011-05-19 00:00:00'],
 'title': ['Very Good Lives : The Fringe Benefits of Failure and the Importance of Imagination',
  'Harry Potter and the Chamber of Secrets',
  'Harry Potter and the Prisoner of Azkaban',
  'Harry Potter and the Goblet of Fire',
  'Harry Potter and the Chamber of Secrets'],
 'authors_': ['J.K. Rowling: Joel Holland',
  'J.K. Rowling',
  'J.K. Rowling',
  'J.K. Rowling',
  'J.K. Rowling']}

We can see that the book titles in [1] do not include the series name, whereas those in [2] do. This means we cannot directly use the title to merge the two dataframes; we first have to extract the book name from the titles in [2], which is what we do next. Fortunately, the book name, the series name and the book number follow a consistent pattern, so we can extract them with a regular expression.

In [14]:
# The pattern is compiled once in __init__ and stored on the instance, so the
# mapped method reuses it instead of importing `re` and recompiling for every row.
class GetBookInfo:
    """Split titles of the form ``"Title (Series, #N)"`` into their parts.

    For every row three fields are added:

    * ``title_``   - the title without the trailing series annotation,
    * ``series``   - the series name, or ``"Standalone"`` when there is none,
    * ``book_num`` - the position in the series as an int (``1`` for
      standalone books; for a range such as ``#1-3`` the last number).
    """
    import re
    def __init__(self, books):
        self.books = books
        self.regex = re.compile(r"(.*?)\s\((.*?),?\s#(\d{1,3}-?\d{0,3})\)")

    def get_book_info(self, row):
        """Return ``row`` with ``title_``, ``series`` and ``book_num`` filled in."""
        title = row["title"]
        found = self.regex.search(title)
        if found is None:
            # No "(Series, #N)" annotation: treat the book as standalone.
            row["title_"] = title
            row["series"] = "Standalone"
            row["book_num"] = 1
            return row
        title_, series, book_num = found.groups()
        # The pattern also accepts a dangling dash ("#1-"), which leaves an empty
        # piece after splitting; drop empties so int() always gets digits.
        numbers = [part for part in book_num.split("-") if part]
        row["title_"] = title_
        row["series"] = series
        row["book_num"] = int(numbers[-1])
        return row

    def map_book_info(self):
        """Map ``get_book_info`` over the dataset, store and return the result.

        NOTE(review): ``batch_size`` is passed without ``batched=True``; it
        presumably has no effect in that mode -- confirm against the
        ``datasets`` documentation.
        """
        self.books = self.books.map(self.get_book_info, batch_size=5000, num_proc=8)
        return self.books

In [15]:
# Standalone equivalent of GetBookInfo.get_book_info, usable directly with `.map`.
def get_series(row):
    """Split a title of the form ``"Title (Series, #N)"`` into its parts.

    Adds ``title_`` (title without the series annotation), ``series`` (series
    name, or ``"Standalone"``) and ``book_num`` (int; ``1`` for standalone
    books, the last number for a range such as ``#1-3``) to ``row`` and
    returns it.
    """
    # Imported inside the function, presumably so it stays self-contained when
    # it is executed in worker processes.
    import re
    title = row["title"]
    regex = re.compile(r"(.*?)\s\((.*?),?\s#(\d{1,3}-?\d{0,3})\)")
    found = regex.search(title)
    if found is None:
        # No "(Series, #N)" annotation: treat the book as standalone.
        row["title_"] = title
        row["series"] = "Standalone"
        row["book_num"] = 1
        return row
    title_, series, book_num = found.groups()
    # The pattern also accepts a dangling dash ("#1-"), which leaves an empty
    # piece after splitting; drop empties so int() always gets digits.
    numbers = [part for part in book_num.split("-") if part]
    row["title_"] = title_
    row["series"] = series
    row["book_num"] = int(numbers[-1])
    return row

In [16]:
# Add `title_`, `series` and `book_num` to [2]; `books2` is rebound to the
# mapped dataset.
book_info_getter = GetBookInfo(books2)
books2 = book_info_getter.map_book_info()

We will have to run this for all the book dataframes in [2]. Let's see the output:

In [17]:
books2["train"][:5]

{'id': [1, 2, 3, 4, 5],
 'title': ['Harry Potter and the Half-Blood Prince (Harry Potter, #6)',
  'Harry Potter and the Order of the Phoenix (Harry Potter, #5)',
  "Harry Potter and the Sorcerer's Stone (Harry Potter, #1)",
  'Harry Potter and the Chamber of Secrets (Harry Potter, #2)',
  'Harry Potter and the Prisoner of Azkaban (Harry Potter, #3)'],
 'authors': ['J.K. Rowling',
  'J.K. Rowling',
  'J.K. Rowling',
  'J.K. Rowling',
  'J.K. Rowling'],
 'pages': [652, 870, 309, 352, 435],
 'language': ['eng', 'eng', 'eng', 'eng', 'eng'],
 'star_1': [9896, 12455, 108202, 11896, 10128],
 'star_2': [25317, 37005, 130310, 49353, 24849],
 'star_3': [159960, 211781, 567458, 288821, 194848],
 'star_4': [556485, 604283, 1513191, 706082, 630534],
 'star_5': [1546466, 1493113, 4268227, 1504505, 1749958],
 'rating': [4.57, 4.5, 4.47, 4.42, 4.57],
 'total_ratings': [2298124, 2358637, 6587388, 2560657, 2610317],
 'total_reviews': [28062, 29770, 75911, 244, 37093],
 'isbn': [None, '0439358078', None,

It is working as intended. Now, the next step is to use these to merge the dataframes.

## Merging the Dataframes

To merge, we will measure the similarity of the title and author of each pair of books. We use fuzzy matching for now; later, we might try embeddings. Here is the algorithm:

- Clean the author name and title by removing any special characters and converting to lower case.
- Give different weight to author matching and the title matching, say 0.7 and 0.3 respectively.
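- Use a similarity function from `fuzzywuzzy` (the implementation below uses `token_set_ratio`) to score the author as well as the title. Set a threshold that the weighted score must meet for two books to be considered the same, say 90%.
- The function must return two values for each row: the ID (row index) of the matched book and the score. The map function runs over the books to be matched (below, a filtered subset of [1]) and compares each of them against the books in [2].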

In [25]:
class BookMatching:
    """Match each book of one dataset to its most similar book in another.

    Similarity is a weighted sum of an author score and a title score (each on
    a 0-100 scale). Names are first reduced to lower-case ASCII letters by
    ``clean_name``; identical cleaned strings score 100, anything else is
    scored by ``fuzz_function``.

    NOTE(review): ``clean_name`` removes every character outside ``a-zA-Z``,
    including spaces, digits and accented or non-Latin letters. Token-based
    scorers therefore see a single token per name, and names made only of such
    characters clean to ``""`` and compare equal to each other -- confirm this
    is intended.
    """

    def __init__(self, all_books, books, author_weight=0.7, title_weight=0.3,
                 threshold=90, fuzz_function=None):
        """all_books: list of all books in the database
           books: list of books to be matched
           author_weight / title_weight: weights of the two partial scores
           threshold: minimum weighted score for a pair to count as a match
           fuzz_function: callable(str, str) -> score in 0..100; defaults to
               fuzzywuzzy's ``fuzz.token_set_ratio``"""
        # Imported here so the names are really in scope where they are used;
        # an import at class level would only create a class attribute.
        import re
        if fuzz_function is None:
            from fuzzywuzzy import fuzz
            fuzz_function = fuzz.token_set_ratio
        self.all_books = all_books
        self.books = books
        # Materialise the candidate columns once instead of once per column.
        columns = self.all_books[:]
        self.books_all_titles = columns["title_"]
        self.books_all_authors = columns["authors"]
        self.author_weight = author_weight  # by default the author counts more
        self.title_weight = title_weight
        self.threshold = threshold
        self.final_results = []
        self.regex = re.compile('[^a-zA-Z]')
        self.fuzz_function = fuzz_function
        # Clean every candidate once up front; doing it inside the comparison
        # loop repeats the same regex work for every row being matched.
        self._clean_titles = [self.clean_name(t) for t in self.books_all_titles]
        self._clean_authors = [self.clean_name(a) for a in self.books_all_authors]

    def clean_name(self, name):
        """Return ``name`` lower-cased with everything but ASCII letters removed.

        ``None`` (a missing value) is treated as the empty string.
        """
        if name is None:
            return ""
        return self.regex.sub('', name).lower()

    def _score_cleaned(self, author1, author2, title1, title2):
        """Weighted similarity of two (author, title) pairs that are already cleaned."""
        if author1 == author2:
            author_score = 100
        else:
            author_score = self.fuzz_function(author1, author2)
        if title1 == title2:
            title_score = 100
        else:
            title_score = self.fuzz_function(title1, title2)
        return author_score*self.author_weight + title_score*self.title_weight

    def calculate_score(self, author1, author2, title1, title2):
        """Weighted similarity of two raw (author, title) pairs."""
        return self._score_cleaned(
            self.clean_name(author1),
            self.clean_name(author2),
            self.clean_name(title1),
            self.clean_name(title2),
        )

    def map_function(self, row):
        """Add ``book_id`` and ``score`` for the best candidate to ``row``.

        ``book_id`` is the position of the best candidate within ``all_books``
        (not a value of its ``id`` column). Both fields are ``-1`` when no
        candidate reaches ``threshold``.
        """
        # The row's own strings do not change while scanning the candidates.
        title = self.clean_name(row["title"])
        author = self.clean_name(row["authors_"])
        max_score = 0
        max_idx = 0
        candidates = zip(self._clean_authors, self._clean_titles)
        for i, (cand_author, cand_title) in enumerate(candidates):
            score = self._score_cleaned(author, cand_author, title, cand_title)
            if score > max_score:
                max_score = score
                max_idx = i
            if max_score == 100:
                # A perfect score cannot be beaten, so stop scanning.
                break
        if max_score >= self.threshold:
            row["book_id"] = max_idx
            row["score"] = float(max_score)
        else:
            row["book_id"] = -1
            # Float sentinel so the score field has the same type in every row.
            row["score"] = -1.0
        return row

    def match(self):
        """Run ``map_function`` over ``self.books``; store and return the result.

        NOTE(review): ``batch_size`` is passed without ``batched=True``; it
        presumably has no effect in that mode -- confirm against the
        ``datasets`` documentation.
        """
        self.books_u = self.books.map(self.map_function, batch_size=10, num_proc=8)
        return self.books_u

In [19]:
books["train"][:5]

{'authors': ['[1]', '[2, 3]', '[4]', '[5, 6, 7, 8]', '[9]'],
 'categories': ['[214, 220, 237, 2646, 2647, 2659, 2660, 2679]',
  '[235, 3386]',
  '[358, 2630, 360, 2632]',
  '[377, 2978, 2980]',
  '[2813, 2980]'],
 'format': [1.0, 1.0, 1.0, 1.0, 2.0],
 'isbn10': ['184018907X', '184454737X', '8416327866', '571308996', '8352518'],
 'lang': ['en', 'en', 'es', 'en', 'en'],
 'publication-date': ['2004-10-14 00:00:00',
  '2009-03-13 00:00:00',
  '2020-04-30 00:00:00',
  '2015-10-01 00:00:00',
  '2019-06-18 00:00:00'],
 'title': ['Soldier Five : The Real Truth About The Bravo Two Zero Mission',
  'Underbelly : The Gangland War',
  'A Sir Phillip, Con Amor',
  'QI: The Third Book of General Ignorance',
  'The Hidden Power of F*cking Up'],
 'authors_': ['Mike Coburn',
  'John Silvester: Andrew Rule',
  'Julia Quinn',
  'Andrew Hunter Murray: James Harkin: John Lloyd: John Mitchinson',
  'The Try Guys']}

In [20]:
# Keep only the books that list category 355. `categories` is stored as a string
# such as "[355, 2629, 2496]", so the id is matched as a whole number: a plain
# substring test would also accept ids like 1355 or 3550.
# NOTE(review): the row count recorded below was produced with the substring
# test and may shrink after this change.
filtered_books = books["train"].filter(
    lambda row: re.search(r"\b355\b", row["categories"]) is not None
)
filtered_books

Dataset({
    features: ['authors', 'categories', 'format', 'isbn10', 'lang', 'publication-date', 'title', 'authors_'],
    num_rows: 21755
})

In [26]:
# Candidates come from [2] (`books2`); the rows to be matched are the filtered
# subset of [1].
book_matcher = BookMatching(books2["train"], filtered_books)

In [27]:
len(book_matcher.books_all_titles)

58292

In [28]:
# Run the matching. The result holds the rows of `filtered_books` (from [1])
# with `book_id` and `score` added, despite the `books2_` name.
books2_ = book_matcher.match()

Map (num_proc=8):   0%|          | 0/21755 [00:00<?, ? examples/s]

In [45]:
books2_[:5]

{'authors': ['[191489, 191490]',
  '[191489]',
  '[191489]',
  '[191489]',
  '[191489]'],
 'categories': ['[218, 222, 292, 821, 3366, 2804, 2812, 2813]',
  '[355, 2629, 2496]',
  '[355, 2629, 2496]',
  '[355, 2629, 2496]',
  '[355, 2629, 2496]'],
 'format': [2.0, 3.0, 3.0, 3.0, 3.0],
 'isbn10': ['1408706784',
  '1408824094',
  '1408824132',
  '1408821583',
  '1408821516'],
 'lang': ['en', 'en', 'en', 'en', 'en'],
 'publication-date': ['2015-04-14 00:00:00',
  '2011-04-04 00:00:00',
  '2011-05-03 00:00:00',
  '2011-05-03 00:00:00',
  '2011-05-19 00:00:00'],
 'title': ['Very Good Lives : The Fringe Benefits of Failure and the Importance of Imagination',
  'Harry Potter and the Chamber of Secrets',
  'Harry Potter and the Prisoner of Azkaban',
  'Harry Potter and the Goblet of Fire',
  'Harry Potter and the Chamber of Secrets'],
 'authors_': ['J.K. Rowling: Joel Holland',
  'J.K. Rowling',
  'J.K. Rowling',
  'J.K. Rowling',
  'J.K. Rowling'],
 'book_id': [-1, 3, 4, 5, 3],
 'score': [-1.0

In [46]:
books2["train"][:5]["title_"]

['Harry Potter and the Half-Blood Prince',
 'Harry Potter and the Order of the Phoenix',
 "Harry Potter and the Sorcerer's Stone",
 'Harry Potter and the Chamber of Secrets',
 'Harry Potter and the Prisoner of Azkaban']

In [33]:
JK[:5]["authors_"]

['J.K. Rowling: Joel Holland',
 'J.K. Rowling',
 'J.K. Rowling',
 'J.K. Rowling',
 'J.K. Rowling']

In [32]:
JK[:5]["title"]

['Very Good Lives : The Fringe Benefits of Failure and the Importance of Imagination',
 'Harry Potter and the Chamber of Secrets',
 'Harry Potter and the Prisoner of Azkaban',
 'Harry Potter and the Goblet of Fire',
 'Harry Potter and the Chamber of Secrets']

In [38]:
books2["train"][:5]["title_"]

['Harry Potter and the Half-Blood Prince',
 'Harry Potter and the Order of the Phoenix',
 "Harry Potter and the Sorcerer's Stone",
 'Harry Potter and the Chamber of Secrets',
 'Harry Potter and the Prisoner of Azkaban']

In [35]:
books2["train"][:5]["authors"]

['J.K. Rowling',
 'J.K. Rowling',
 'J.K. Rowling',
 'J.K. Rowling',
 'J.K. Rowling']

In [46]:
# Spot check of the author similarity for the first pair of rows.
# NOTE(review): this uses partial_ratio, whereas BookMatching is configured
# with fuzz.token_set_ratio, so the number is not the score the matcher sees.
fuzz.partial_ratio(books2["train"][:5]["authors"][0], JK[:5]["authors_"][0])

100

In [13]:
# from transformers import AutoTokenizer, AutoModel
# import torch

# model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
# tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
# model = AutoModel.from_pretrained(model_ckpt)

# device = torch.device("cpu")
# model.to(device)

In [14]:
# def cls_pooling(model_output):
#     return model_output.last_hidden_state[:, 0]

In [15]:
# def get_embeddings(text_list):
#     encoded_input = tokenizer(
#         text_list, padding=True, truncation=True, return_tensors="pt"
#     )
#     encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
#     model_output = model(**encoded_input)
#     return cls_pooling(model_output)

In [16]:
# em1 = get_embeddings("Harry Potter and the Sorcerer's Stone (Harry Potter  #1)")
# em2 = get_embeddings("Harry Potter and the Chamber of Secrets (Harry Potter  #2)")

In [80]:
# NOTE(review): `books_` is not assigned in any cell visible in this notebook;
# it presumably survives from an earlier kernel session. Confirm, and replace it
# with a name that is defined above so the notebook runs from a fresh kernel.
books_

DatasetDict({
    train: Dataset({
        features: ['authors', 'categories', 'format', 'isbn10', 'lang', 'publication-date', 'title', 'authors_'],
        num_rows: 1109383
    })
})

In [17]:
# embeddings_dataset = books_.map(
#     lambda x: {"title_embeddings": get_embeddings(x["title"]).detach().cpu().numpy()[0]},
#     batch_size=10000,
# )

In [4]:
def get_series(row):
    """Split a title of the form ``"Title (Series, #N)"`` into its parts.

    Adds ``title_`` (title without the series annotation), ``series`` (series
    name, or ``"Standalone"``) and ``book_num`` (int; ``1`` for standalone
    books, the last number for a range such as ``#1-3``) to ``row`` and
    returns it.
    """
    # Imported inside the function, presumably so it stays self-contained when
    # it is executed in worker processes.
    import re
    title = row["title"]
    regex = re.compile(r"(.*?)\s\((.*?),?\s#(\d{1,3}-?\d{0,3})\)")
    found = regex.search(title)
    if found is None:
        # No "(Series, #N)" annotation: treat the book as standalone.
        row["title_"] = title
        row["series"] = "Standalone"
        row["book_num"] = 1
        return row
    title_, series, book_num = found.groups()
    # The pattern also accepts a dangling dash ("#1-"), which leaves an empty
    # piece after splitting; drop empties so int() always gets digits.
    numbers = [part for part in book_num.split("-") if part]
    row["title_"] = title_
    row["series"] = series
    row["book_num"] = int(numbers[-1])
    return row

In [5]:
# Reload the first CSV of source [2]; this rebinds `books2` to the raw data,
# without the derived `title_` / `series` / `book_num` fields.
books2 = load_dataset("csv", data_files="2_clean/book1-100k.csv", delimiter=",")

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [6]:
# Add `title_`, `series` and `book_num` to every row of [2].
# NOTE(review): `batch_size` is passed without `batched=True`; it presumably has
# no effect in that mode -- confirm against the `datasets` documentation.
books2_ = books2.map(get_series, batch_size=5000, num_proc=5)

Map (num_proc=5):   0%|          | 0/58292 [00:00<?, ? examples/s]

In [81]:
# def get_series(row):
#     title = row["title"]
#     regex = re.compile(r"(.*?)\s\((.*?),?\s#(\d{1,3}-?\d{0,3})\)")
#     matches = regex.findall(title)
#     if len(matches) == 0:
#         row["title_"] = title
#         row["series"] = "Standalone"
#         row["book_num"] = 1
#         # return title, "Standalone", 1
#         return row
#     matches = matches[0]
#     series = matches[1]
#     title_ = matches[0]
#     book_num = matches[2]
#     if "-" in book_num:
#         book_num = int(book_num.split("-")[-1])
#     row["title_"] = title_
#     row["series"] = series
#     row["book_num"] = int(book_num)
#     # print(row)
#     return row
#     return title_, series, int(book_num)

# books2_df = pd.read_csv("2_clean/book1-100k.csv")
# books2_df.progress_apply(get_series, axis = 1)

In [7]:
# Load the book catalogue of source [1] from parquet as the "train" split.
dataset = load_dataset("parquet", data_files={"train": os.path.join("1_clean", "dataset.parquet")})

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [8]:
dataset

DatasetDict({
    train: Dataset({
        features: ['authors', 'categories', 'format', 'isbn10', 'lang', 'publication-date', 'title'],
        num_rows: 1109383
    })
})

In [9]:
# Apply the same title / series split to [1], so both datasets expose `title_`.
# NOTE(review): `batch_size` is passed without `batched=True`; it presumably has
# no effect in that mode -- confirm against the `datasets` documentation.
dataset_ = dataset.map(get_series, batch_size=5000, num_proc=5)

Map (num_proc=5):   0%|          | 0/1109383 [00:00<?, ? examples/s]

In [10]:
dataset_

DatasetDict({
    train: Dataset({
        features: ['authors', 'categories', 'format', 'isbn10', 'lang', 'publication-date', 'title', 'title_', 'series', 'book_num'],
        num_rows: 1109383
    })
})

In [11]:
# Read the author lookup table of source [1] and print its schema and
# non-null counts.
authors1 = pd.read_csv("1_clean/authors.csv")
authors1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 654021 entries, 0 to 654020
Data columns (total 2 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   author_id    654021 non-null  int64 
 1   author_name  654019 non-null  object
dtypes: int64(1), object(1)
memory usage: 10.0+ MB


In [12]:
# Order the table by author_id so that position (id - 1) can be used as a lookup
# -- this presumes the ids run from 1 without gaps; TODO confirm.
# NOTE(review): this cell is not re-runnable as written: after the first run
# `author_id` is the index, so `set_index("author_id")` fails on a second run.
authors1 = authors1.set_index("author_id").sort_index()
# Missing names become the placeholder "NA".
authors1 = authors1.fillna("NA")
# Plain array of names. This rebinds the notebook-level name `authors` to the
# array, so code written against an `authors` DataFrame no longer applies.
authors = authors1["author_name"].values
len(authors)

654021

In [13]:
def match_author_name(row, author_names=None):
    """Return ``row`` with ``authors_`` set to the ``": "``-joined author names.

    Parameters
    ----------
    row : mapping whose ``"authors"`` value is a string holding a list of
        author ids, e.g. ``"[1384, 1385]"``.
    author_names : optional sequence of names in which the name for id ``k``
        sits at position ``k - 1``. When omitted, the notebook-level
        ``authors`` array is used. Passing it explicitly (for example through
        the ``fn_kwargs`` argument of ``.map``) should avoid the
        "name 'authors' is not defined" error seen when mapping with several
        processes.

    NOTE(review): the positional lookup assumes ids start at 1 and have no
    gaps -- confirm.
    """
    # Imported locally so the function does not depend on a top-level import.
    import ast

    if author_names is None:
        author_names = authors
    # literal_eval parses the list without executing arbitrary code, unlike eval().
    ids = ast.literal_eval(row["authors"])
    row["authors_"] = ": ".join(author_names[id_ - 1] for id_ in ids)
    return row

In [15]:
# Decode the author ids of every row into names.
# NOTE(review): the recorded run of this cell failed with
# "NameError: name 'authors' is not defined" -- presumably the worker processes
# started for num_proc=6 do not see the notebook-level `authors`. Confirm how
# the `authors_` column shown further down was eventually produced.
dataset_ = dataset_.map(match_author_name, batch_size=10000, num_proc=6)

Map (num_proc=6):   0%|          | 0/1109383 [00:00<?, ? examples/s]

NameError: name 'authors' is not defined

In [164]:
dataset_["train"][1000]

{'authors': '[1384, 1385]',
 'categories': '[253, 272, 314, 787, 3332, 833, 3378, 834, 3379]',
 'format': 2.0,
 'isbn10': '1847697909',
 'lang': 'en',
 'publication-date': '2012-09-15 00:00:00',
 'title': 'Researching Language Teacher Cognition and Practice : International Case Studies',
 'title_': 'Researching Language Teacher Cognition and Practice : International Case Studies',
 'series': 'Standalone',
 'book_num': 1,
 'authors_': 'Roger Barnard: Anne Burns'}

In [174]:
# NOTE(review): the recorded output below is a single name, which is what the
# first element of this column would give rather than the whole column -- the
# cell may have been edited after it was run; confirm.
dataset_["train"]["authors_"]

'Mike Coburn'

In [177]:
books2_["train"][100]

{'id': 147,
 'title': 'Rails Cookbook: Recipes for Rapid Web Development with Ruby',
 'authors': 'Rob Orsini',
 'pages': 514,
 'language': 'eng',
 'star_1': 3,
 'star_2': 10,
 'star_3': 20,
 'star_4': 23,
 'star_5': 12,
 'rating': 3.46,
 'total_ratings': 68,
 'total_reviews': 1,
 'isbn': '0596527314',
 'publication_date': '2007-01-01 00:00:00',
 'title_': 'Rails Cookbook: Recipes for Rapid Web Development with Ruby',
 'series': 'Standalone',
 'book_num': 1}