In [2]:
import pandas as pd
import pprint
import requests
import json

# Getting book IDs

In [3]:
def get_book_id(isbn, timeout=10):
    """Look up the Google Books volume ID for an ISBN.

    Sends an ``isbn:<value>`` search to the Google Books ``volumes`` endpoint
    and returns the ID of the first volume in the response.

    Parameters
    ----------
    isbn : str or int
        ISBN to search for; it is formatted into the ``isbn:`` query term.
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up (default 10).

    Returns
    -------
    str or None
        The ``id`` of the first returned item, or ``None`` when the request
        fails, the status code is not 200, the body is not valid JSON, or the
        search returns no items.
    """
    url = "https://www.googleapis.com/books/v1/volumes"
    try:
        # Let requests build and URL-encode the query string, and always set a
        # timeout so one stalled connection cannot hang a bulk ``apply``.
        response = requests.get(url, params={"q": f"isbn:{isbn}"}, timeout=timeout)
        if response.status_code != 200:
            # NOTE(review): a non-200 reply (e.g. throttling) is reported the
            # same way as "no match" -- confirm whether that inflates the
            # missing counts seen in bulk runs.
            return None
        book_data = response.json()
    except (requests.RequestException, ValueError):
        # Network failure or unparsable body: treat as "not found", matching
        # the function's existing None-on-failure contract, so a single bad
        # request does not abort thousands of lookups.
        return None

    items = book_data.get('items') if isinstance(book_data, dict) else None
    if not items:
        # Covers both a missing 'items' key and an empty list.
        return None
    return items[0].get('id')

# Retrieving book ID values with ISBN13

In [45]:
# Load the Kaggle book table and look up one Google Books volume ID per
# ISBN-13 value; get_book_id returns None for rows where no ID was obtained.
kaggle_data = pd.read_csv("books.csv")
isbn_13 = kaggle_data['isbn13']
book_ids = isbn_13.map(get_book_id)
print(book_ids)

0                None
1                None
2        h2Y-PgAACAAJ
3        FBXRzgEACAAJ
4        DAAAAAAACAAJ
             ...     
11122            None
11123    d0buAAAAMAAJ
11124            None
11125            None
11126            None
Name: isbn13, Length: 11127, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isbn_13['book_id'] = book_ids


In [55]:
# Shape of the ISBN-13 lookup results (one entry per value that was looked up).
book_ids.shape

(11127,)

In [56]:
# Tally the ISBN-13 lookups that returned nothing (None counts as null).
num_missing = book_ids.isnull().sum()
print(f"Number of missing book IDs: {num_missing}")

Number of missing book IDs: 8437


In [73]:
# Keep only the lookups that produced a volume ID and renumber them from 0.
# The original row positions are discarded by reset_index(drop=True).
not_none = book_ids.notna()
Final_book_ids = book_ids.loc[not_none].reset_index(drop=True)
Final_book_ids

0       h2Y-PgAACAAJ
1       FBXRzgEACAAJ
2       DAAAAAAACAAJ
3       LH5C9q83T6wC
4       62CEzQEACAAJ
            ...     
2685    tcWMPAAACAAJ
2686    O2JfAAAAMAAJ
2687    y4kgSgAACAAJ
2688    TaQZzgEACAAJ
2689    d0buAAAAMAAJ
Name: isbn13, Length: 2690, dtype: object

In [69]:
# Save Final_book_ids to disk so the lookup results survive a kernel restart
# and do not have to be fetched again. The Series is still named after its
# source column, so the CSV header is 'isbn13' even though the values are
# book IDs.
Final_book_ids.to_csv('isbn13_book_ids.csv', index=False)

# Testing regular ISBN numbers instead of ISBN13

In [4]:
# Load the Kaggle book table and look up one Google Books volume ID per value
# in the 'isbn' column; get_book_id returns None where no ID was obtained.
# The column is read explicitly as text: if pandas inferred a numeric dtype
# for any part of it, leading zeros would be dropped and a different ISBN
# would be sent to the API.
kaggle_data = pd.read_csv("books.csv", dtype={'isbn': str})
isbn = kaggle_data['isbn']
book_ids_isbn = isbn.apply(get_book_id)

print(book_ids_isbn)

0                None
1                None
2                None
3        FBXRzgEACAAJ
4                None
             ...     
11122            None
11123            None
11124            None
11125            None
11126            None
Name: isbn, Length: 11127, dtype: object


In [5]:
# Shape of the ISBN lookup results (one entry per value that was looked up).
book_ids_isbn.shape

(11127,)

In [6]:
# Tally the ISBN lookups that returned nothing (None counts as null).
num_missing_isbn = book_ids_isbn.isnull().sum()
print(f"Number of missing book IDs: {num_missing_isbn}")

Number of missing book IDs: 10328


In [8]:
# Keep only the ISBN lookups that produced a volume ID and renumber them
# from 0. The original row positions are discarded by reset_index(drop=True).
not_none = book_ids_isbn.notna()
Final_book_isbn_ids = book_ids_isbn.loc[not_none].reset_index(drop=True)
Final_book_isbn_ids

0      FBXRzgEACAAJ
1      yyxXzQEACAAJ
2      YjAnfhsAQ8wC
3      xb4wSmJLnhAC
4      Qq9nQgAACAAJ
           ...     
794    98-cPQAACAAJ
795    RZbQPAAACAAJ
796    4tuEuAAACAAJ
797    dmqguAAACAAJ
798    DiJVzwEACAAJ
Name: isbn, Length: 799, dtype: object

In [9]:
# Save Final_book_isbn_ids to disk so the lookup results survive a kernel
# restart and do not have to be fetched again. The Series is still named
# after its source column, so the CSV header is 'isbn' even though the values
# are book IDs.
Final_book_isbn_ids.to_csv('isbn_book_ids.csv', index=False)

# Merging both results

In [15]:
# Reload both saved ID lists from disk: ISBN results first, then ISBN-13.
isbn_csv, isbn13_csv = (
    pd.read_csv(csv_path)
    for csv_path in ("isbn_book_ids.csv", "isbn13_book_ids.csv")
)


In [21]:
# Show each reloaded frame followed by its column index, one item per line.
print("------- isbn -------", isbn_csv, isbn_csv.columns, sep="\n")
print("\n------- isbn13 -------", isbn13_csv, isbn13_csv.columns, sep="\n")

------- isbn -------
             isbn
0    FBXRzgEACAAJ
1    yyxXzQEACAAJ
2    YjAnfhsAQ8wC
3    xb4wSmJLnhAC
4    Qq9nQgAACAAJ
..            ...
794  98-cPQAACAAJ
795  RZbQPAAACAAJ
796  4tuEuAAACAAJ
797  dmqguAAACAAJ
798  DiJVzwEACAAJ

[799 rows x 1 columns]
Index(['isbn'], dtype='object')

------- isbn13 -------
            isbn13
0     h2Y-PgAACAAJ
1     FBXRzgEACAAJ
2     DAAAAAAACAAJ
3     LH5C9q83T6wC
4     62CEzQEACAAJ
...            ...
2685  tcWMPAAACAAJ
2686  O2JfAAAAMAAJ
2687  y4kgSgAACAAJ
2688  TaQZzgEACAAJ
2689  d0buAAAAMAAJ

[2690 rows x 1 columns]
Index(['isbn13'], dtype='object')


In [60]:
# Both CSVs hold Google Books volume IDs under the name of the column they
# were looked up from ('isbn' / 'isbn13'). Give them the shared column name
# 'book_id' so the two frames can be stacked and de-duplicated together.
# read_csv already returned DataFrames, so no pd.DataFrame(...) re-wrap is
# needed; rename returns a new frame and leaves isbn_csv / isbn13_csv as is.
isbn_df = isbn_csv.rename(columns={'isbn': 'book_id'})
isbn13_df = isbn13_csv.rename(columns={'isbn13': 'book_id'})


In [68]:
# Stack the ISBN and ISBN-13 results, keep the first occurrence of each
# book_id, and renumber the rows from 0.
merged_df = (
    pd.concat([isbn_df, isbn13_df])
    .drop_duplicates(subset=['book_id'])
    .reset_index(drop=True)
)
merged_df

Unnamed: 0,book_id
0,FBXRzgEACAAJ
1,yyxXzQEACAAJ
2,YjAnfhsAQ8wC
3,xb4wSmJLnhAC
4,Qq9nQgAACAAJ
...,...
3282,tcWMPAAACAAJ
3283,O2JfAAAAMAAJ
3284,y4kgSgAACAAJ
3285,TaQZzgEACAAJ


In [None]:
# Save the combined, de-duplicated book IDs to disk so the merged result
# survives a kernel restart; the row index is not written.
merged_df.to_csv('all_book_ids.csv', index=False)