In [1]:
import pandas as pd
import isbnlib
import sys
import os

# Extend the import path so the project-local helper modules imported right
# after this (isbn_utils, duplicates_utils, lang_utils) can be resolved;
# presumably they live in the sibling "functions" directory.
# os.path.abspath('') resolves to the kernel's current working directory.
current_dir = os.path.abspath('')
# NOTE: despite its name, parent_dir is <cwd>/../functions, not the parent
# directory of the working directory itself.
parent_dir = os.path.join(current_dir, '..', 'functions')
sys.path.append(parent_dir)

from isbn_utils import contain_isbn13
from duplicates_utils import number_of_duplicates
from lang_utils import contain_lang

IMPORT DATA

In [2]:
# Location of the raw dataset, relative to the kernel's working directory.
path = "../../dataset/origin/dataset.csv"

# Read the file as cp1250 (undecodable bytes are replaced instead of raising)
# and delete Unicode line/paragraph separators wherever they match.
df = pd.read_csv(
    path,
    sep=",",
    encoding="cp1250",
    encoding_errors="replace",
).replace(to_replace={"[\u2028\u2029]": ""}, regex=True)
df


Unnamed: 0,authors,bestsellers-rank,categories,description,dimension-x,dimension-y,dimension-z,edition,edition-statement,for-ages,...,isbn10,isbn13,lang,publication-date,publication-place,rating-avg,rating-count,title,url,weight
0,[1],49848.0,"[214, 220, 237, 2646, 2647, 2659, 2660, 2679]",SOLDIER FIVE is an elite soldier's explosive m...,129.00,198.00,20.00,,,,...,184018907X,9781840189070,en,2004-10-14 00:00:00,,4.03,292.0,Soldier Five : The Real Truth About The Bravo ...,/Soldier-Five-Mike-Coburn/9781840189070,224.00
1,"[2, 3]",115215.0,"[235, 3386]",John Moran and Carl Williams were the two bigg...,127.00,203.20,25.40,,,,...,184454737X,9781844547371,en,2009-03-13 00:00:00,,3.60,335.0,Underbelly : The Gangland War,/Underbelly-Andrew-Rule/9781844547371,285.76
2,[4],11732.0,"[358, 2630, 360, 2632]",Sir Phillip knew that Eloise Bridgerton was a ...,150.00,224.00,28.00,New edition,,,...,8416327866,9788416327867,es,2020-04-30 00:00:00,,3.88,37211.0,"A Sir Phillip, Con Amor",/Sir-Phillip-Con-Amor-Julia-Quinn/9788416327867,386.00
3,"[5, 6, 7, 8]",114379.0,"[377, 2978, 2980]",The Third Book of General Ignorance gathers t...,153.00,234.00,24.00,,Export - Airside ed,,...,571308996,9780571308996,en,2015-10-01 00:00:00,,4.17,384.0,QI: The Third Book of General Ignorance,/QI-Third-Book-General-Ignorance-John-Lloyd/97...,436.00
4,[9],98413.0,"[2813, 2980]",The Try Guys deliver their first book-an inspi...,191.00,240.00,29.00,,,,...,8352518,9780008352516,en,2019-06-18 00:00:00,,3.90,5095.0,The Hidden Power of F*cking Up,/Hidden-Power-F-cking-Up-Try-Guys/9780008352516,980.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1109378,[120629],,[3055],,140.00,216.00,6.00,,,,...,879802685,9780879802684,en,1980-12-01 00:00:00,,,,Making Money at the Races,/Making-Money-at-Races-David-Barr/9780879802684,227.00
1109379,[336369],,"[3045, 3054, 3068, 3086]",This 2020 Diary/Planner has one week per 2 pag...,151.89,229.11,7.11,,,,...,1711791962,9781711791968,en,2019-11-25 00:00:00,,,,"2020 : Diary, Weekly Planner, Organiser, Year ...",/2020-Annie-Mac-Journals/9781711791968,185.97
1109380,"[29792, 654019]",,[3096],This special re-print edition of James A. Crui...,216.00,279.00,8.00,,,,...,1792858019,9781792858017,en,2018-12-28 00:00:00,,,,Canoeing and Camping,/Canoeing-Camping-James-Cruikshank/9781792858017,372.00
1109381,[654020],,"[3054, 3056]",Good western riding doesn't just happen. J.P. ...,222.00,287.00,19.05,,,,...,876059825,9780876059821,en,1995-04-12 00:00:00,,,,The Complete Guide to Western Horsemanship (Cl...,/Complete-Guide-Western-Horsemanship-Cloth-For...,916.00


ADD ID COLUMN

In [3]:
# Give every row a 1-based sequential identifier that follows the current row order.
df['id'] = range(1, len(df) + 1)

CHECK FOR DUPLICATES

In [4]:
# Helper imported from duplicates_utils; presumably returns the number of
# duplicated rows in the frame (confirm in the helper's source).
number_of_duplicates(df)

0

DROP COLUMNS

In [5]:
# Columns that are not carried forward into the cleaned dataset.
drop_columns = [
    'index-date',
    'illustrations-note',
    'image-checksum',
    'image-path',
    'image-url',
    'publication-place',
    'url',
    'edition',
    'imprint',
    'edition-statement',
]
df = df.drop(columns=drop_columns)

CLEAN DATA

In [6]:
# Select the rows that have an ISBN-10 but no ISBN-13. An empty result means
# every row with an ISBN-10 also has an ISBN-13.
filtered_df = df.loc[df['isbn13'].isna() & df['isbn10'].notna()]
filtered_df

Unnamed: 0,authors,bestsellers-rank,categories,description,dimension-x,dimension-y,dimension-z,for-ages,format,id,isbn10,isbn13,lang,publication-date,rating-avg,rating-count,title,weight


In [7]:
# Flag each row with the result of contain_isbn13 for its isbn13 value (helper
# imported from isbn_utils; the exact validation rules live there), then list
# the distinct flag values to see whether any row failed.
df['isbn13_valid'] = contain_isbn13(df['isbn13'])
df['isbn13_valid'].unique()

array([ True])

In [8]:
# Flag each row with the result of contain_lang for its lang value (helper
# imported from lang_utils; the exact rules live there), then list the distinct
# flag values to see whether any row failed.
df['lang_valid'] = contain_lang(df['lang'])
df['lang_valid'].unique()

array([ True, False])

In [9]:
# Rows whose language code is present but did not pass validation.
# NOTE: despite its name, valid_lang_rows holds the *invalid* entries; an
# empty result means every False flag comes from a missing language value.
valid_lang_rows = df.loc[~df['lang_valid'] & df['lang'].notna()]
valid_lang_rows


Unnamed: 0,authors,bestsellers-rank,categories,description,dimension-x,dimension-y,dimension-z,for-ages,format,id,isbn10,isbn13,lang,publication-date,rating-avg,rating-count,title,weight,isbn13_valid,lang_valid


In [10]:
# Drop the two helper flag columns created by the validation cells, together
# with isbn10, which is not kept in the cleaned dataset.
df = df.drop(columns = ['isbn10','isbn13_valid','lang_valid'])

DUPLICATES AFTER DATA CLEANING

In [11]:
# Same duplicates_utils helper, here given only the isbn13 column
# (presumably counting repeated ISBN-13 values; confirm in the helper's source).
number_of_duplicates(df['isbn13'])

0

In [12]:
# Display the frame as it stands after the cleaning steps.
df

Unnamed: 0,authors,bestsellers-rank,categories,description,dimension-x,dimension-y,dimension-z,for-ages,format,id,isbn13,lang,publication-date,rating-avg,rating-count,title,weight
0,[1],49848.0,"[214, 220, 237, 2646, 2647, 2659, 2660, 2679]",SOLDIER FIVE is an elite soldier's explosive m...,129.00,198.00,20.00,,1.0,1,9781840189070,en,2004-10-14 00:00:00,4.03,292.0,Soldier Five : The Real Truth About The Bravo ...,224.00
1,"[2, 3]",115215.0,"[235, 3386]",John Moran and Carl Williams were the two bigg...,127.00,203.20,25.40,,1.0,2,9781844547371,en,2009-03-13 00:00:00,3.60,335.0,Underbelly : The Gangland War,285.76
2,[4],11732.0,"[358, 2630, 360, 2632]",Sir Phillip knew that Eloise Bridgerton was a ...,150.00,224.00,28.00,,1.0,3,9788416327867,es,2020-04-30 00:00:00,3.88,37211.0,"A Sir Phillip, Con Amor",386.00
3,"[5, 6, 7, 8]",114379.0,"[377, 2978, 2980]",The Third Book of General Ignorance gathers t...,153.00,234.00,24.00,,1.0,4,9780571308996,en,2015-10-01 00:00:00,4.17,384.0,QI: The Third Book of General Ignorance,436.00
4,[9],98413.0,"[2813, 2980]",The Try Guys deliver their first book-an inspi...,191.00,240.00,29.00,,2.0,5,9780008352516,en,2019-06-18 00:00:00,3.90,5095.0,The Hidden Power of F*cking Up,980.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1109378,[120629],,[3055],,140.00,216.00,6.00,,1.0,1109379,9780879802684,en,1980-12-01 00:00:00,,,Making Money at the Races,227.00
1109379,[336369],,"[3045, 3054, 3068, 3086]",This 2020 Diary/Planner has one week per 2 pag...,151.89,229.11,7.11,,1.0,1109380,9781711791968,en,2019-11-25 00:00:00,,,"2020 : Diary, Weekly Planner, Organiser, Year ...",185.97
1109380,"[29792, 654019]",,[3096],This special re-print edition of James A. Crui...,216.00,279.00,8.00,,1.0,1109381,9781792858017,en,2018-12-28 00:00:00,,,Canoeing and Camping,372.00
1109381,[654020],,"[3054, 3056]",Good western riding doesn't just happen. J.P. ...,222.00,287.00,19.05,,2.0,1109382,9780876059821,en,1995-04-12 00:00:00,,,The Complete Guide to Western Horsemanship (Cl...,916.00


DATA SPLIT

In [13]:
# Split the numeric "additional" attributes out of the main frame.
# addition_data receives the id plus the attribute columns; df keeps the id
# and loses the attribute columns.
id_column = ['id']  # deliberately not named `id`, which would shadow the builtin id()

addition_columns = ['dimension-x', 'dimension-y', 'dimension-z', 'rating-avg',
                    'rating-count', 'weight', 'bestsellers-rank', 'for-ages']

columns = id_column + addition_columns

# .copy() gives addition_data its own data, independent of df.
addition_data = df[columns].copy()
df = df.drop(columns=addition_columns)


RENAME A FEW ADDITION_DATA COLUMNS AND CHANGE DATA TYPE

In [None]:
# Switch the column names to snake_case ('id' becomes 'book_id') and store the
# two count-like columns as nullable integers so missing values survive the cast.
addition_data = (
    addition_data
    .rename(columns={
        'id': 'book_id',
        'rating-count': 'rating_count',
        'dimension-x': 'dimension_x',
        'dimension-y': 'dimension_y',
        'dimension-z': 'dimension_z',
        'rating-avg': 'rating_avg',
        'bestsellers-rank': 'bestsellers_rank',
        'for-ages': 'for_ages',
    })
    .astype({"rating_count": "Int64", "bestsellers_rank": "Int64"})
)


Unnamed: 0,book_id,dimension_x,dimension_y,dimension_z,rating_avg,rating_count,weight,bestsellers_rank,for_ages
0,1,129.00,198.00,20.00,4.03,292,224.00,49848,
1,2,127.00,203.20,25.40,3.60,335,285.76,115215,
2,3,150.00,224.00,28.00,3.88,37211,386.00,11732,
3,4,153.00,234.00,24.00,4.17,384,436.00,114379,
4,5,191.00,240.00,29.00,3.90,5095,980.00,98413,
...,...,...,...,...,...,...,...,...,...
1109378,1109379,140.00,216.00,6.00,,,227.00,,
1109379,1109380,151.89,229.11,7.11,,,185.97,,
1109380,1109381,216.00,279.00,8.00,,,372.00,,
1109381,1109382,222.00,287.00,19.05,,,916.00,,


EXPORT DATA

In [16]:
def clean_text(text):
    """Reduce a string to plain ASCII; pass non-string values through untouched.

    The Unicode line separator (U+2028) and paragraph separator (U+2029) are
    turned into a single space each, and every other character outside the
    ASCII range is removed.

    Args:
        text: Any value. Only ``str`` instances are modified.

    Returns:
        The ASCII-only string for ``str`` input; otherwise ``text`` itself,
        unchanged (e.g. ``None`` or a float NaN from a missing cell).
    """
    if not isinstance(text, str):
        return text
    # Map the separators to spaces first so they are not simply dropped by
    # the ASCII filter below.
    spaced = text.translate({0x2028: ' ', 0x2029: ' '})
    # Encoding with errors='ignore' discards every non-ASCII character
    # (including lone surrogates), which is the same result as filtering on
    # ord(char) < 128 but without a Python-level loop per character.
    return spaced.encode('ascii', errors='ignore').decode('ascii')

# Strip non-ASCII characters from the free-text columns before export.
text_columns = ['title', 'description']
for col in text_columns:
    df[col] = df[col].apply(clean_text)

file_name = "dataset_cleaned.csv"
folder_path = "../../dataset/cleaned"
# to_csv does not create missing directories, so make sure the target folder
# exists (no-op when it is already there).
os.makedirs(folder_path, exist_ok=True)
final_path = os.path.join(folder_path, file_name)
# quoting=0 is csv.QUOTE_MINIMAL: only fields containing special characters
# are quoted. Missing values are written as empty fields.
df.to_csv(final_path, encoding="utf-8", quoting=0, index=False, na_rep="",
          escapechar='\\')

In [17]:
# Write the additional numeric attributes to their own CSV file.
# folder_path is reused from the previous export cell, so that cell must have
# been run first.
file_name = "addition_cleaned.csv"
final_path = os.path.join(folder_path, file_name)
addition_data.to_csv(final_path, sep=",", encoding="utf-8", index=False)