# KBART metadata reshaping script
This script takes a title list and generates two derivative KBART files (NISO KBART and OCLC KBART).

Written using Python 3.7.10

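To do: incorporate filters to exclude rows from the original title list that are still in process. (Both sections below already drop rows whose status contains "in progress" or "delete"; review whether any further filtering is needed.)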

In [1]:
#import modules and libraries
import pandas as pd
import numpy as np
from datetime import datetime, date, time
#today's date as YYYY-MM-DD, used further down to date-stamp the output file names
filetime = f"{datetime.now():%Y-%m-%d}"

In [2]:
#enter the name of the title list to reshape into KBART files
#surrounding whitespace is stripped so a pasted path with a stray space or newline still opens
title_list = input("enter file name and if appropriate filepath of title list csv: ").strip()

enter file name and if appropriate filepath of title list csv:  AWDL_Title_List - New KBART_2022-03-22.csv


In [3]:
#enter the name of the digital collection so that we can use it in the file name later
#surrounding whitespace is stripped so it does not end up inside the output file names
collection_name = input("enter the collection name for this title list: ").strip()

enter the collection name for this title list:  awdl


In [4]:
# load title list dataframe from csv input above
# na_filter=False keeps blank cells as empty strings instead of converting them to NaN
# dtype=str reads every column as text so values are carried over exactly as typed in the csv
# (without it pandas may turn a fully numeric column into numbers and change how it is written out)
df_title_list = pd.read_csv(title_list, na_filter=False, quotechar='"', dtype=str)
# info() prints its own summary and returns None, so wrapping it in print() adds a stray "None" line
df_title_list.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 484 entries, 0 to 483
Data columns (total 41 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   note                               484 non-null    object
 1   date updated                       484 non-null    object
 2   status                             484 non-null    object
 3   publication_title_original         484 non-null    object
 4   publication_title_transliteration  484 non-null    object
 5   print_identifier                   484 non-null    object
 6   online_identifier                  484 non-null    object
 7   date_first_issue_online            484 non-null    object
 8   num_first_vol_online               484 non-null    object
 9   num_first_issue_online             484 non-null    object
 10  date_last_issue_online             484 non-null    object
 11  num_last_vol_online                484 non-null    object
 12  num_last

## NISO KBART
First, let's create a KBART file compliant with the NISO standard.

In [None]:
#work on an independent copy of the title list so that building the NISO KBART file
#leaves df_title_list itself unchanged
df_niso_kbart = df_title_list.copy(deep=True)

In [None]:
#swap the value "ebook" for "fulltext" in the coverage_depth column; other values are left alone
#from https://datatofish.com/replace-values-pandas-dataframe/
df_niso_kbart['coverage_depth'] = df_niso_kbart['coverage_depth'].replace({'ebook': 'fulltext'})
print(df_niso_kbart['coverage_depth'])

In [None]:
#concatenate original script and transliterated title columns into a single publication_title column
#based on https://stackoverflow.com/questions/60724940/concatenate-strings-across-columns-that-are-not-null
#the title list is read with na_filter=False, so a blank title is an empty string rather than NaN
#and dropna() never removes it; joining the two parts and stripping the result avoids the stray
#leading/trailing space that a blank original or transliterated title would otherwise leave behind
title_original = df_niso_kbart['publication_title_original'].fillna('').astype(str).str.strip()
title_transliteration = df_niso_kbart['publication_title_transliteration'].fillna('').astype(str).str.strip()
df_niso_kbart['publication_title'] = (title_original + ' ' + title_transliteration).str.strip()
print(df_niso_kbart['publication_title'])
#spot check a single row; skipped when the title list has no row labelled 21
#NOTE(review): when both title columns hold the same text the title appears twice - confirm this is intended
if 21 in df_niso_kbart.index:
    print(df_niso_kbart.loc[21,'publication_title'])

In [None]:
#drop rows that contain a status value of "in progress" or "delete"
#based on https://www.statology.org/pandas-drop-rows-that-contain-string/
#the pattern is a regular expression, so "|" matches either phrase anywhere in the status text
#na=False treats a missing status as "no match" so the mask is always plain True/False
in_process_or_deleted = df_niso_kbart['status'].str.contains("in progress|delete", na=False)
#~ inverts the mask (keep the rows that did NOT match); .copy() makes the result an independent
#dataframe so the in-place edits in later cells act on their own data, not on a slice
df_niso_kbart = df_niso_kbart[~in_process_or_deleted].copy()
print(df_niso_kbart.loc[95:120,'publication_title'])

In [None]:
#drop unnecessary columns from the NISO KBART
#(the two source title columns are no longer needed once publication_title has been built)
df_niso_kbart = df_niso_kbart.drop(columns=['note','date updated','status','publication_title_original','publication_title_transliteration','ACTION','Aleph BSN','previous_url'])
#info() prints its own summary and returns None, so wrapping it in print() adds a stray "None" line
df_niso_kbart.info()

In [None]:
#move publication_title to be the first column
#based on https://www.geeksforgeeks.org/how-to-move-a-column-to-first-position-in-pandas-dataframe/
#and https://stackoverflow.com/questions/25122099/move-column-by-name-to-front-of-table-in-pandas
#pop() removes the column from the dataframe and returns it; insert() puts it back at position 0
col = df_niso_kbart.pop('publication_title')
df_niso_kbart.insert(0, col.name, col)
#info() prints its own summary and returns None, so wrapping it in print() adds a stray "None" line
df_niso_kbart.info()

In [None]:
#write the NISO KBART out as a tab-separated text file;
#the file name carries the collection name and today's date
niso_file_name = f"nyu_global_{collection_name}_{filetime}.txt"
df_niso_kbart.to_csv(niso_file_name, sep="\t", index=False)

## OCLC KBART
Next, let's create a KBART file that can be uploaded to OCLC's WorldShare platform.

In [5]:
#work on an independent copy of the title list so that building the OCLC KBART file
#leaves df_title_list itself unchanged
df_oclc_kbart = df_title_list.copy(deep=True)

In [6]:
#rename the notes column to conform to OCLC's expectations
#rename() leaves the dataframe unchanged (no error) if there is no column called notes
df_oclc_kbart = df_oclc_kbart.rename(columns={'notes':'coverage_notes'})
#info() prints its own summary and returns None, so wrapping it in print() adds a stray "None" line
df_oclc_kbart.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 484 entries, 0 to 483
Data columns (total 41 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   note                               484 non-null    object
 1   date updated                       484 non-null    object
 2   status                             484 non-null    object
 3   publication_title_original         484 non-null    object
 4   publication_title_transliteration  484 non-null    object
 5   print_identifier                   484 non-null    object
 6   online_identifier                  484 non-null    object
 7   date_first_issue_online            484 non-null    object
 8   num_first_vol_online               484 non-null    object
 9   num_first_issue_online             484 non-null    object
 10  date_last_issue_online             484 non-null    object
 11  num_last_vol_online                484 non-null    object
 12  num_last

In [7]:
#concatenate original script and transliterated title columns into a single publication_title column
#based on https://stackoverflow.com/questions/60724940/concatenate-strings-across-columns-that-are-not-null
#the title list is read with na_filter=False, so a blank title is an empty string rather than NaN
#and dropna() never removes it; joining the two parts and stripping the result avoids the stray
#leading/trailing space that a blank original or transliterated title would otherwise leave behind
title_original = df_oclc_kbart['publication_title_original'].fillna('').astype(str).str.strip()
title_transliteration = df_oclc_kbart['publication_title_transliteration'].fillna('').astype(str).str.strip()
df_oclc_kbart['publication_title'] = (title_original + ' ' + title_transliteration).str.strip()
print(df_oclc_kbart['publication_title'])
#spot check a single row; skipped when the title list has no row labelled 21
#NOTE(review): when both title columns hold the same text the title appears twice - confirm this is intended
if 21 in df_oclc_kbart.index:
    print(df_oclc_kbart.loc[21,'publication_title'])

0      "Wenn Du mein Bruder bist,…" : Interaktion und...
1      A catalogue of the Egyptian antiquities in the...
2      A catalogue of the Egyptian antiquities in the...
3      A collection of hieroglyphs a contribution to ...
4                        A companion to Demotic studies 
                             ...                        
479    Tools and weapons illustrated by the Egyptian ...
480                                  Two Theban princes 
481    Spätbabylonische Texte zum lokalen und regiona...
482                               The arrows of the sun 
483    The archive of Mušēzib-Marduk, son of Kiribtu ...
Name: publication_title, Length: 484, dtype: object
Abū Sa‘īd-i Abū l-Hayr (357-440/967-1049) : Wirklichkeit und Legende. (Textes et Mémoires, 4). Abū Sa‘īd-i Abū l-Hayr (357-440/967-1049) : Wirklichkeit und Legende. (Textes et Mémoires, 4).


In [8]:
#drop rows that contain a status value of "in progress" or "delete"
#based on https://www.statology.org/pandas-drop-rows-that-contain-string/
#the pattern is a regular expression, so "|" matches either phrase anywhere in the status text
#na=False treats a missing status as "no match" so the mask is always plain True/False
in_process_or_deleted = df_oclc_kbart['status'].str.contains("in progress|delete", na=False)
#~ inverts the mask (keep the rows that did NOT match); .copy() makes the result an independent
#dataframe so the in-place edits in later cells act on their own data, not on a slice
df_oclc_kbart = df_oclc_kbart[~in_process_or_deleted].copy()
print(df_oclc_kbart.loc[95:120,'publication_title'])

95              Chester Beatty Biblical papyri IV and V 
96     Christianity, Judaism and Other Greco-Roman Cu...
97     Circle of Inner Asian Art and Archaeology News...
99                 Circle of Inner Asian Art Newsletter 
119          Collection d'anciennes étoffes égyptiennes 
120            Collection du Docteur Fouquet, du Caire. 
Name: publication_title, dtype: object


In [9]:
#drop unnecessary columns
#(the two source title columns are no longer needed once publication_title has been built)
oclc_columns_to_drop = [
    "note",
    "date updated",
    "status",
    "Aleph BSN",
    "previous_url",
    "publication_title_original",
    "publication_title_transliteration",
    "publication_type",
    "date_monograph_published_print",
    "date_monograph_published_online",
    "monograph_volume",
    "monograph_edition",
    "first_editor",
    "parent_publication_title_id",
    "preceding_publication_title_id",
    "access_type",
]
df_oclc_kbart = df_oclc_kbart.drop(columns=oclc_columns_to_drop)
#info() prints its own summary and returns None, so wrapping it in print() adds a stray "None" line
df_oclc_kbart.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 461 entries, 0 to 483
Data columns (total 26 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   print_identifier         461 non-null    object
 1   online_identifier        461 non-null    object
 2   date_first_issue_online  461 non-null    object
 3   num_first_vol_online     461 non-null    object
 4   num_first_issue_online   461 non-null    object
 5   date_last_issue_online   461 non-null    object
 6   num_last_vol_online      461 non-null    object
 7   num_last_issue_online    461 non-null    object
 8   title_url                461 non-null    object
 9   first_author             461 non-null    object
 10  title_id                 461 non-null    object
 11  embargo_info             461 non-null    object
 12  coverage_depth           461 non-null    object
 13  coverage_notes           461 non-null    object
 14  publisher_name           461 non-null    o

In [10]:
#put the columns into their final order for the OCLC file, with publication_title first
#NOTE(review): the earlier comment described the trailing columns (location through ACTION) as
#NISO KBART columns not found in OCLC's KBART - confirm which standard those columns belong to
oclc_column_order = [
    "publication_title",
    "print_identifier",
    "online_identifier",
    "date_first_issue_online",
    "num_first_vol_online",
    "num_first_issue_online",
    "date_last_issue_online",
    "num_last_vol_online",
    "num_last_issue_online",
    "title_url",
    "first_author",
    "title_id",
    "embargo_info",
    "coverage_depth",
    "coverage_notes",
    "publisher_name",
    "location",
    "title_notes",
    "staff_notes",
    "vendor_id",
    "oclc_collection_name",
    "oclc_collection_id",
    "oclc_entry_id",
    "oclc_linkscheme",
    "oclc_number",
    "ACTION",
]
#selecting with a list of names returns the columns in exactly that order;
#a name missing from the dataframe raises KeyError
df_oclc_kbart = df_oclc_kbart[oclc_column_order]
#info() prints its own summary and returns None, so wrapping it in print() adds a stray "None" line
df_oclc_kbart.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 461 entries, 0 to 483
Data columns (total 26 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   publication_title        461 non-null    object
 1   print_identifier         461 non-null    object
 2   online_identifier        461 non-null    object
 3   date_first_issue_online  461 non-null    object
 4   num_first_vol_online     461 non-null    object
 5   num_first_issue_online   461 non-null    object
 6   date_last_issue_online   461 non-null    object
 7   num_last_vol_online      461 non-null    object
 8   num_last_issue_online    461 non-null    object
 9   title_url                461 non-null    object
 10  first_author             461 non-null    object
 11  title_id                 461 non-null    object
 12  embargo_info             461 non-null    object
 13  coverage_depth           461 non-null    object
 14  coverage_notes           461 non-null    o

In [11]:
#write the OCLC KBART out as a tab-separated text file;
#the file name carries the collection name and today's date
oclc_file_name = f"nyu_oclc_{collection_name}_{filetime}.txt"
df_oclc_kbart.to_csv(oclc_file_name, sep="\t", index=False)