# HIDVL metadata spreadsheet reshape script
Sometimes, an HIDVL batch will only include new records. As part of the process to generate draft HIDVL MARC records by batch, we need to reshape the spreadsheet exported from the Airtable metadata submission form.

In [1]:
#import modules and libraries
import pandas as pd
import numpy as np
from datetime import datetime, date, time
filetime = datetime.now()
filetime = filetime.strftime("%Y-%m-%d_%I-%M_%p")

In [2]:
post_2019_dmd = input("enter file name and if appropriate filepath of airtable metadata csv: ")

enter file name and if appropriate filepath of airtable metadata csv:  Metadata-December_2022.csv


In [3]:
# load new metadata dataframe from csv
df_post_2019_dmd = pd.read_csv(post_2019_dmd,na_filter=False,quotechar = '"')

In [4]:
df_post_2019_dmd = df_post_2019_dmd.replace(r'^\s*$', np.nan, regex=True)

In [5]:
df_post_2019_dmd = df_post_2019_dmd.fillna(np.nan)

In [6]:
# specify new column names for the incoming metadata column headers
# for august 2020 batch, mapped Run time rounded to Run_Time
post_2019_dmd_newcols = {
    "HI #" : "HI",
    "Inventory": "NOID",
    "Publication cycle" : "Publication_Cycle",
    "Date of event" : "Date_of_Production",
    "Location information": "Location_Venue",
    "Language note": "Language_Note",
    "Language": "Language_List",
    "Main production credits": "Main_Production_Credits",
    "Event type" : "Worktypes",
    "Subject": "Subjects_653",
    "Copyright holder": "Rights_Holder",
    "Artist bio": "Artist_Bio",
    "Run time rounded":"Run_Time_Submitted",
    "Collection": "Series_Title",
    "Conference":"Meeting_Information"
}

In [7]:
#rename the column headers
df_post_2019_dmd.rename(columns=post_2019_dmd_newcols, inplace=True)
#see what columns we have in the dataframe now:
#print("new df",df_post_2019_dmd)
print("new df",df_post_2019_dmd.info())
#Alternate titles have imported as non-null float64 values, and I'm not sure why!

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 36 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   HI                                 30 non-null     object 
 1   NOID                               30 non-null     object 
 2   DMD Finalized                      30 non-null     object 
 3   Title                              30 non-null     object 
 4   Alternate title 1                  6 non-null      object 
 5   Alternate title 2                  0 non-null      float64
 6   Alternate title 3                  0 non-null      float64
 7   Alternate title 4                  0 non-null      float64
 8   Alternate title 5                  0 non-null      float64
 9   Series_Title                       30 non-null     object 
 10  Meeting_Information                0 non-null      float64
 11  Worktypes                          30 non-null     object 
 

In [8]:
#add an empty column for 650 subjects
#based on https://stackoverflow.com/questions/16327055/how-to-add-an-empty-column-to-a-dataframe
df_post_2019_dmd["Subjects_650"] = np.nan

In [9]:
#do some concatenation to populate the format field
df_post_2019_dmd ["Format"] = df_post_2019_dmd["How many source media form items?"].astype(str) + " " + df_post_2019_dmd["Source media format"]

In [10]:
#combine copyright holder contact info into a single field
#based on https://stackoverflow.com/questions/60724940/concatenate-strings-across-columns-that-are-not-null
df_post_2019_dmd["Copyright_Contact"] = df_post_2019_dmd[["Copyright contact designation","Copyright address","Copyright business phone","Copyright mobile phone","Copyright fax","Copyright email 1","Copyright email 2","Copyright email 3","Copyright website"]].apply(lambda x: ', '.join(x.dropna()), axis=1)
df_post_2019_dmd["Copyright_Contact"] = df_post_2019_dmd["Copyright_Contact"].replace('\\n', ', ', regex=True)
print(df_post_2019_dmd["Copyright_Contact"])


0     Valeria Macías Rodríguez, tel. (52-55) 4122802...
1     Valeria Macías Rodríguez, tel. (52-55) 4122802...
2     Valeria Macías Rodríguez, tel. (52-55) 4122802...
3     Valeria Macías Rodríguez, tel. (52-55) 4122802...
4     Valeria Macías Rodríguez, tel. (52-55) 4122802...
5     Valeria Macías Rodríguez, tel. (52-55) 4122802...
6     Valeria Macías Rodríguez, tel. (52-55) 4122802...
7     Valeria Macías Rodríguez, tel. (52-55) 4122802...
8     Valeria Macías Rodríguez, tel. (52-55) 4122802...
9     Valeria Macías Rodríguez, tel. (52-55) 4122802...
10    Valeria Macías Rodríguez, tel. (52-55) 4122802...
11    Valeria Macías Rodríguez, tel. (52-55) 4122802...
12    Valeria Macías Rodríguez, tel. (52-55) 4122802...
13    Valeria Macías Rodríguez, tel. (52-55) 4122802...
14    Valeria Macías Rodríguez, tel. (52-55) 4122802...
15    Carlos Martiel, 407 Kosciuszko St., Brooklyn, ...
16    Carlos Martiel, 407 Kosciuszko St., Brooklyn, ...
17    Carlos Martiel, 407 Kosciuszko St., Brookl

In [11]:
#combine alternate titles into a single cell
#based on https://stackoverflow.com/questions/60724940/concatenate-strings-across-columns-that-are-not-null
#this may not actually concatenate any alternate titles, because there is usually only ever one
df_post_2019_dmd["Alternate_Titles"] = df_post_2019_dmd[["Alternate title 1","Alternate title 2","Alternate title 3","Alternate title 4","Alternate title 5"]].apply(lambda x: '|'.join(x.dropna()), axis=1)

In [12]:
#tried filling the blank cells in this field with np.nan but it didn't work...
df_post_2019_dmd["Alternate_Titles"].fillna(np.nan)

0                                                      
1                                                      
2                                                      
3                                                      
4                                                      
5                                                      
6                                                      
7                                                      
8                                                      
9                                                      
10                                                     
11                                                     
12                                                     
13                                                     
14                                                     
15                            Maze, Enacting Stillness.
16    Death to Oblivion. Produced as part of the sol...
17                                              

In [13]:
#I still wanted to check to see if the values had become null values!
print(df_post_2019_dmd ["Alternate_Titles"].isnull())

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15    False
16    False
17    False
18    False
19    False
20    False
21    False
22    False
23    False
24    False
25    False
26    False
27    False
28    False
29    False
Name: Alternate_Titles, dtype: bool


In [14]:
#get rid of any newline characters
df_post_2019_dmd["Alternate_Titles"] = df_post_2019_dmd["Alternate_Titles"].replace('\\n', '', regex=True)
df_post_2019_dmd["Main_Production_Credits"] = df_post_2019_dmd["Main_Production_Credits"].replace('\\n', '', regex=True)
df_post_2019_dmd["Summary"] = df_post_2019_dmd["Summary"].replace('\\n', '', regex=True)
df_post_2019_dmd["Artist_Bio"] = df_post_2019_dmd["Artist_Bio"].replace('\\n', '', regex=True)
df_post_2019_dmd["Participants"] = df_post_2019_dmd["Participants"].replace('\\n', '', regex=True)


In [15]:
#print a sample record that had newlines
print(df_post_2019_dmd.loc[8,"Artist_Bio"])

El Museo Ex Teresa Arte Actual is an artistic space located in the Centro Histórico neighborhood of Mexico City. Opened in 1993, Ex Teresa was named for its location in what was formerly the temple of Santa Teresa la Antigua. For more than 25 years, the museum and cultural space has dedicated itself to the promotion and creation of contemporary art, as well as exhibitions and research into creative processes around the creation of performance, sound, and video art, among other modalities.


In [16]:
#drop unwanted columns and see what remains
#for august 2020, dropped Run time instead of Run time rounded
df_post_2019_dmd.drop(["Run time","DMD Finalized","How many source media form items?","Source media format","Alternate title 1","Alternate title 2","Alternate title 3","Alternate title 4","Alternate title 5","Copyright contact designation","Copyright address","Copyright business phone","Copyright mobile phone","Copyright fax","Copyright email 1","Copyright email 2","Copyright email 3","Copyright website"], axis=1, inplace=True)

In [17]:
print(df_post_2019_dmd.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 22 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   HI                       30 non-null     object 
 1   NOID                     30 non-null     object 
 2   Title                    30 non-null     object 
 3   Series_Title             30 non-null     object 
 4   Meeting_Information      0 non-null      float64
 5   Worktypes                30 non-null     object 
 6   Date_of_Production       30 non-null     object 
 7   Location_Venue           30 non-null     object 
 8   Subjects_653             30 non-null     object 
 9   Summary                  30 non-null     object 
 10  Artist_Bio               30 non-null     object 
 11  Rights_Holder            30 non-null     object 
 12  Main_Production_Credits  25 non-null     object 
 13  Participants             30 non-null     object 
 14  Run_Time_Submitted       30 

In [18]:
df_post_2019_dmd = df_post_2019_dmd.sort_index(axis=1)
#df_post_2019_dmd = df_post_2019_dmd[["NOID","Alternate_Titles","Artist_Bio","Copyright_Contact","Date_of_Production","Format","HI","Language_List","Location_Venue","Main_Production_Credits","Meeting_Information","Participants","Publication_Cycle","Rights_Holder","Run_Time","Series_Title","Subjects_650","Subjects_653","Summary","Title","Worktypes"]]

In [19]:
#df_combined_dmd = pd.concat([df_pre_2019_dmd,df_post_2019_dmd],ignore_index=True,keys=['pre', 'post'])
#print(df_combined_dmd)
print(df_post_2019_dmd.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 22 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Alternate_Titles         30 non-null     object 
 1   Artist_Bio               30 non-null     object 
 2   Copyright_Contact        30 non-null     object 
 3   Date_of_Production       30 non-null     object 
 4   Format                   30 non-null     object 
 5   HI                       30 non-null     object 
 6   Language_List            15 non-null     object 
 7   Language_Note            0 non-null      float64
 8   Location_Venue           30 non-null     object 
 9   Main_Production_Credits  25 non-null     object 
 10  Meeting_Information      0 non-null      float64
 11  NOID                     30 non-null     object 
 12  Participants             30 non-null     object 
 13  Publication_Cycle        30 non-null     object 
 14  Rights_Holder            30 

In [20]:
df_post_2019_dmd.to_csv("hidvl_metadata_reshaped_%s.csv"%filetime, index=False)

The end!