In [1]:
import os
import shutil
import sys

import numpy as np
import pandas as pd

# Put the sibling "functions" folder on the import path so the project helper
# modules imported right below can be resolved from this notebook.
current_dir = os.path.abspath(os.curdir)
parent_dir = os.path.join(current_dir, os.pardir, 'functions')
sys.path.extend([parent_dir])

from duplicates_utils import normalize_id
from mn_table_utils import create_mn_table
from one_to_one_table_utils import create_one_to_one_table

DROP DUPLICATE ROWS FROM AUTHORS AND CATEGORIES

In [2]:
# Read the cleaned authors table and report its size before normalization.
authors_path = "../../dataset/cleaned/authors_cleaned.csv"
df_authors = pd.read_csv(authors_path)
print(df_authors.shape)

# normalize_id hands back the processed frame together with a mapping object.
# Per the recorded output the frame loses rows while the mapping keeps one
# entry per original row (presumably old id -> surviving id; verify in
# duplicates_utils).
df_authors, df_authors_map = normalize_id(
    df_authors,
    key_column='author_name',
    id_column='author_id',
)

# Size after normalization, followed by the number of mapping entries.
print(df_authors.shape, len(df_authors_map), sep="\n")

(654021, 2)
(653783, 2)
654021


In [4]:
# Read the cleaned categories table and report its size before normalization.
categories_path = "../../dataset/cleaned/categories_cleaned.csv"
df_categories = pd.read_csv(categories_path)
print(df_categories.shape)

# normalize_id hands back the processed frame together with a mapping object.
# Per the recorded output the frame loses rows while the mapping keeps one
# entry per original row (presumably old id -> surviving id; verify in
# duplicates_utils).
df_categories, df_categories_map = normalize_id(
    df_categories,
    key_column='category_name',
    id_column='category_id',
)

# Size after normalization, followed by the number of mapping entries.
print(df_categories.shape, len(df_categories_map), sep="\n")

(2775, 2)
(2585, 2)
2775


CREATE M:N TABLES FROM MAIN DATASET

In [5]:
# Main cleaned dataset; the M:N and 1:1 tables below are all derived from it.
dataset_path = "../../dataset/cleaned/dataset_cleaned.csv"
df_dataset = pd.read_csv(filepath_or_buffer=dataset_path)


In [6]:
#AUTHOR MN TABLE
authors_mn_table = create_mn_table(df_dataset, id_column = 'id', key_column = 'authors')

# Trim whitespace while every value is still a string. pandas' .str methods
# return NaN for non-string elements, so stripping AFTER the Unknown id has been
# injected would silently erase it again whenever that id is not a str
# (author_id is presumably numeric after read_csv -- TODO confirm).
# Stripping first also lets whitespace-only entries be treated as missing.
authors_mn_table['authors_id'] = authors_mn_table['authors_id'].str.strip()

# add unknown id instead of missing id in mn table
# (.iloc[0] raises IndexError if df_authors has no row named 'Unknown')
unknown_author_id = df_authors.loc[df_authors['author_name'] == 'Unknown', 'author_id'].iloc[0]
authors_mn_table['authors_id'] = authors_mn_table['authors_id'].replace('', unknown_author_id)

# Translate ids through df_authors_map, keep the current value wherever the map
# has no entry, then store the column as nullable integers.
authors_mn_table['authors_id'] = (authors_mn_table['authors_id']
    .map(df_authors_map).fillna(authors_mn_table['authors_id'])
    .infer_objects(copy=False).astype('Int64'))

In [7]:

#CATEGORIES MN TABLE
categories_mn_table = create_mn_table(df_dataset, id_column = 'id', key_column = 'categories')

# Show the rows whose category id is an empty string before they are blanked.
print(categories_mn_table.loc[categories_mn_table['categories_id'] == ''])

# Persist '' -> NaN first. The fallback in fillna() below reads the column
# again, so if the blanking only lived inside the chain the raw column would
# put the empty strings straight back and the replace would have no effect.
categories_mn_table['categories_id'] = categories_mn_table['categories_id'].replace('', np.nan)

# Translate ids through df_categories_map and keep the current value wherever
# the map has no entry.
# NOTE(review): unlike the authors table this column is not cast to 'Int64' --
# confirm which dtype the SQL import expects.
categories_mn_table['categories_id'] = (categories_mn_table['categories_id']
    .map(df_categories_map)
    .fillna(categories_mn_table['categories_id']))

              id  main_id categories_id
1390099  1390100   328665              
2459394  2459395   636324              
2888189  2888190   774007              
3893958  3893959  1054359              
4040637  4040638  1103243              
4050848  4050849  1106382              


CREATE 1:1 TABLE FROM MAIN DATASET

In [8]:
# Build the language lookup table together with the mapping used to re-key the
# main dataset (exact shape of both is defined in one_to_one_table_utils).
lang_dataset, lang_mapping = create_one_to_one_table(df_dataset, 'lang')

# Blank strings count as missing; every other value is looked up in the mapping.
df_dataset['lang'] = df_dataset['lang'].replace({'': np.nan}).map(lang_mapping)

In [9]:
# Preview the main dataset after the 'lang' column was passed through lang_mapping.
df_dataset

Unnamed: 0,authors,categories,description,format,id,isbn13,lang,publication-date,title
0,[1],"[214, 220, 237, 2646, 2647, 2659, 2660, 2679]",SOLDIER FIVE is an elite soldier's explosive m...,1.0,1,9781840189070,1.0,2004-10-14 00:00:00,Soldier Five : The Real Truth About The Bravo ...
1,"[2, 3]","[235, 3386]",John Moran and Carl Williams were the two bigg...,1.0,2,9781844547371,1.0,2009-03-13 00:00:00,Underbelly : The Gangland War
2,[4],"[358, 2630, 360, 2632]",Sir Phillip knew that Eloise Bridgerton was a ...,1.0,3,9788416327867,2.0,2020-04-30 00:00:00,"A Sir Phillip, Con Amor"
3,"[5, 6, 7, 8]","[377, 2978, 2980]",The Third Book of General Ignorance gathers t...,1.0,4,9780571308996,1.0,2015-10-01 00:00:00,QI: The Third Book of General Ignorance
4,[9],"[2813, 2980]",The Try Guys deliver their first book-an inspi...,2.0,5,9780008352516,1.0,2019-06-18 00:00:00,The Hidden Power of F*cking Up
...,...,...,...,...,...,...,...,...,...
1109378,[120629],[3055],,1.0,1109379,9780879802684,1.0,1980-12-01 00:00:00,Making Money at the Races
1109379,[336369],"[3045, 3054, 3068, 3086]",This 2020 Diary/Planner has one week per 2 pag...,1.0,1109380,9781711791968,1.0,2019-11-25 00:00:00,"2020 : Diary, Weekly Planner, Organiser, Year ..."
1109380,"[29792, 654019]",[3096],This special re-print edition of James A. Crui...,1.0,1109381,9781792858017,1.0,2018-12-28 00:00:00,Canoeing and Camping
1109381,[654020],"[3054, 3056]",Good western riding doesn't just happen. J.P. ...,2.0,1109382,9780876059821,1.0,1995-04-12 00:00:00,The Complete Guide to Western Horsemanship (Cl...


DROP OLD COLUMNS

In [10]:
# Author and category links are exported separately as M:N tables, so the
# original list columns are removed from the main dataset.
df_dataset = df_dataset.drop(['authors', 'categories'], axis='columns')

In [11]:
# Preview the main dataset again now that 'authors' and 'categories' are gone.
df_dataset

Unnamed: 0,description,format,id,isbn13,lang,publication-date,title
0,SOLDIER FIVE is an elite soldier's explosive m...,1.0,1,9781840189070,1.0,2004-10-14 00:00:00,Soldier Five : The Real Truth About The Bravo ...
1,John Moran and Carl Williams were the two bigg...,1.0,2,9781844547371,1.0,2009-03-13 00:00:00,Underbelly : The Gangland War
2,Sir Phillip knew that Eloise Bridgerton was a ...,1.0,3,9788416327867,2.0,2020-04-30 00:00:00,"A Sir Phillip, Con Amor"
3,The Third Book of General Ignorance gathers t...,1.0,4,9780571308996,1.0,2015-10-01 00:00:00,QI: The Third Book of General Ignorance
4,The Try Guys deliver their first book-an inspi...,2.0,5,9780008352516,1.0,2019-06-18 00:00:00,The Hidden Power of F*cking Up
...,...,...,...,...,...,...,...
1109378,,1.0,1109379,9780879802684,1.0,1980-12-01 00:00:00,Making Money at the Races
1109379,This 2020 Diary/Planner has one week per 2 pag...,1.0,1109380,9781711791968,1.0,2019-11-25 00:00:00,"2020 : Diary, Weekly Planner, Organiser, Year ..."
1109380,This special re-print edition of James A. Crui...,1.0,1109381,9781792858017,1.0,2018-12-28 00:00:00,Canoeing and Camping
1109381,Good western riding doesn't just happen. J.P. ...,2.0,1109382,9780876059821,1.0,1995-04-12 00:00:00,The Complete Guide to Western Horsemanship (Cl...


SAVE FILES

In [12]:

def save_dataframes(df_dict, folder_path, file_format='csv'):
    """Write every DataFrame in ``df_dict`` to ``folder_path``.

    Each entry is written to ``<folder_path>/<key>.<file_format>`` without the
    index, and one confirmation line per file is printed.

    Parameters
    ----------
    df_dict : dict
        Maps the output file stem to the DataFrame to write.
    folder_path : str
        Existing directory that receives the files; it is not created here.
    file_format : str, default 'csv'
        Either 'csv' (UTF-8) or 'xlsx' (written through the openpyxl engine).

    Raises
    ------
    FileNotFoundError
        If ``folder_path`` does not exist or is not a directory.
    ValueError
        If ``file_format`` is neither 'csv' nor 'xlsx'. This is checked before
        anything is written, so it is raised even for an empty ``df_dict``.
    """
    # isdir() is already False for a missing path, so one check covers both cases.
    if not os.path.isdir(folder_path):
        raise FileNotFoundError(f"Directory '{folder_path}' does not exist.")

    # The format does not change between files: validate it once, up front.
    if file_format not in ('csv', 'xlsx'):
        raise ValueError(
            f"Unsupported file format: {file_format!r} (expected 'csv' or 'xlsx')."
        )

    for file_name, df in df_dict.items():
        file_path = os.path.join(folder_path, f"{file_name}.{file_format}")

        if file_format == 'csv':
            df.to_csv(file_path, index=False, encoding='utf-8')
        else:
            df.to_excel(file_path, index=False, engine='openpyxl')

        print(f"✅ File was save: {file_path}")



In [13]:
# Output file stem -> frame to export; save_dataframes appends the extension.
dfs = dict(
    final_dataset=df_dataset,
    final_categories=df_categories,
    final_authors=df_authors,
    mn_author=authors_mn_table,
    mn_categories=categories_mn_table,
    final_lang=lang_dataset,
)
folder = "../../dataset/to_sql"

save_dataframes(df_dict=dfs, folder_path=folder)


✅ File was save: ../../dataset/to_sql\final_dataset.csv
✅ File was save: ../../dataset/to_sql\final_categories.csv
✅ File was save: ../../dataset/to_sql\final_authors.csv
✅ File was save: ../../dataset/to_sql\mn_author.csv
✅ File was save: ../../dataset/to_sql\mn_categories.csv
✅ File was save: ../../dataset/to_sql\final_lang.csv


In [13]:
def move_files(file_names, source_folder, destination_folder):
    """Copy the named files from ``source_folder`` into ``destination_folder``.

    Despite the name, the source files are left in place. Failures are reported
    on stdout and never raised, so one bad file does not stop the others.

    Parameters
    ----------
    file_names : iterable of str
        File names, relative to both folders.
    source_folder : str
        Folder the files are read from.
    destination_folder : str
        Existing folder the copies are written to; it is not created here.

    Returns
    -------
    None
    """
    for file_name in file_names:
        source_path = os.path.join(source_folder, file_name)
        destination_path = os.path.join(destination_folder, file_name)

        try:
            # copyfile streams the content instead of loading the whole file
            # into memory, and raises SameFileError when both paths point at
            # the same file rather than truncating the source before reading it.
            shutil.copyfile(source_path, destination_path)
            print(f"✅ {file_name} was saved successfully.")
        except FileNotFoundError as e:
            # The same exception is raised for a missing destination folder;
            # only blame the source when it really is absent.
            if not os.path.exists(source_path):
                print(f"❌ File {file_name} not found in {source_folder}.")
            else:
                print(f"❌ Error processing {file_name}: {e}")
        except Exception as e:
            print(f"❌ Error processing {file_name}: {e}")

In [14]:
# These cleaned tables need no further transformation and are copied as they
# are next to the generated files.
dt_without_trans = ['formats_cleaned.csv', 'addition_cleaned.csv']
source_folder = "../../dataset/cleaned"
final_path = "../../dataset/to_sql"
move_files(
    file_names=dt_without_trans,
    source_folder=source_folder,
    destination_folder=final_path,
)

✅ formats_cleaned.csv was saved successfully.
✅ addition_cleaned.csv was saved successfully.
