#### Import modules, initialize variables, etc.

In [37]:
from datetime import datetime
from os.path import exists as file_exists
import pandas as pd
from pprint import pprint as pp
import re
import requests
from time import sleep
import xmltodict
import Credentials      # Get API keys, etc.

apikey = Credentials.prod_api
baseurl = 'https://api-na.hosted.exlibrisgroup.com'
item_query = '/almaws/v1/bibs/{mms_id}/holdings/{holding_id}/items/{item_pid}?apikey={apikey}'

exported_csv = "FullItemList.csv"
filled_csv = "FilledEnumChron.csv"
err_log_txt = "log.txt"

### Read CSV into Pandas dataframe

In [49]:
# Depending on how the file was exported, column names may or may have either spaces or underscores
df = pd.read_csv(exported_csv, dtype=str)

#### Clean up, tweak & format df

In [50]:
# Remove spaces from column names
df.columns = [c.replace(' ', '_') for c in df.columns]
# Rename certain columns
df = df.rename(columns={'Permanent_Location':'Location',
                        'Item_Policy': 'Policy',
                        'Material_Type': 'Material'})
# Strip leading/trailing space from Description
df.Description = df.Description.str.strip()
# Collapse multiple spaces within the Description
df.Description.replace(' +', ' ', regex=True, inplace=True)

In [51]:
# Exclude special cases—GovDocs of certain material types, with weird numbering
df = df[~((df['Material'].isin(['Issue','Microform','Other'])) & (df['Location'].isin(['gdmo','gdrf','gdrfi','govdo'])))]

#### Add empty columns for the Enum/Chron fields

In [52]:
# Add empty columns for the Enum/Chron fields
EC_fields = ['Enum_A', 'Enum_B', 'Chron_I', 'Chron_J']
df[EC_fields] = None

In [53]:
df

Unnamed: 0,Title,Location,Policy,Material,Description,Item_ID,Holdings_ID,MMS_ID,Enum_A,Enum_B,Chron_I,Chron_J
0,Nebraska life,LL Periodicals,Periodicals,Issue,2022 Nebraska Traveler,23128725750006388,2271324180006388,991004431099706388,,,,
1,Kazoo,LL Juvenile Periodicals,Periodicals,Issue,no.14 Fal 2019,23128314540006388,2263486450006388,991001229859706388,,,,
2,Nebraska life,LL Periodicals,Periodicals,Issue,2021 Nebraska Traveler,23117539840006388,2271324180006388,991004431099706388,,,,
3,Ladybug,LL Juvenile Periodicals,Periodicals,Issue,v.31 no.6 Mar 2021,23115249638806388,2272848990006388,991004669599706388,,,,
4,Cricket,LL Juvenile Periodicals,Periodicals,Issue,v.48 no.6 Mar 2021,23115249600006388,2265645400006388,991004670019706388,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
2680,Canadian journal of research,LL Periodicals,Periodicals,Issue,v.8 1933,2364540230006388,2264540310006388,991003811559706388,,,,
2681,Canadian journal of research,LL Periodicals,Periodicals,Issue,v.9 1933,2364540220006388,2264540310006388,991003811559706388,,,,
2682,Canadian journal of research,LL Periodicals,Periodicals,Issue,v.10 1934,2364540210006388,2264540310006388,991003811559706388,,,,
2683,Canadian journal of research,LL Periodicals,Periodicals,Issue,v.11 1934,2364540200006388,2264540310006388,991003811559706388,,,,


### The function for getting info from Description to Enum/Chron fields:

In [43]:

def extract_and_fill(regex, these_fields):
    exp = re.compile(regex, re.IGNORECASE)
    for i, f in enumerate(these_fields):
        df[f] = df['Description'].str.extract(exp, expand=True)[i].fillna(df[f])


**[Test your Regex here](https://regex101.com/)**

#### Examples of usage:

In [44]:
# Records with just a volume number:
extract_and_fill(r'^v\.(\d+)$', ['Enum_A'])

# Records with just a volume number and issue number:
extract_and_fill(r'^v\.(\d+) no\.(\d+)$', ['Enum_A', 'Enum_B'])

# Records with just a volume number and year:
extract_and_fill(r'^v\.(\d+) (\d{4})$', ['Enum_A', 'Chron_I'])

In [48]:
df

Unnamed: 0,Title,Location,Policy,Material,Description,Item_ID,Holdings_ID,MMS_ID,Enum_A,Enum_B,Chron_I,Chron_J
0,Nebraska life,LL Periodicals,Periodicals,Issue,2022 Nebraska Traveler,23128725750006388,2271324180006388,991004431099706388,,,,
1,Kazoo,LL Juvenile Periodicals,Periodicals,Issue,no.14 Fal 2019,23128314540006388,2263486450006388,991001229859706388,,,,
2,Nebraska life,LL Periodicals,Periodicals,Issue,2021 Nebraska Traveler,23117539840006388,2271324180006388,991004431099706388,,,,
3,Ladybug,LL Juvenile Periodicals,Periodicals,Issue,v.31 no.6 Mar 2021,23115249638806388,2272848990006388,991004669599706388,,,,
4,Cricket,LL Juvenile Periodicals,Periodicals,Issue,v.48 no.6 Mar 2021,23115249600006388,2265645400006388,991004670019706388,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
2321,Publishers weekly,LL Periodicals,Periodicals,Issue,v.267 no.41 Oct 12 2020,2364607520006388,2264607990006388,991000982779706388,,,,
2322,Publishers weekly,LL Periodicals,Periodicals,Issue,v.267 no.42 Oct 19 2020,2364607510006388,2264607990006388,991000982779706388,,,,
2323,Publishers weekly,LL Periodicals,Periodicals,Issue,v.267 no.43 Oct 26 2020,2364607500006388,2264607990006388,991000982779706388,,,,
2324,Publishers weekly,LL Periodicals,Periodicals,Issue,v.267 no.45 Nov 9 2020,2364607490006388,2264607990006388,991000982779706388,,,,


#### Set some common expressions that can be used in a modular way

In [46]:
# Single volume/book - CAPTURE
vvvRE = r'(?:v|bk)\. ?(\d+[a-z]?)'

##### 

In [6]:
# Volume(s)/book(s) - CAPTURE
vvv_vvRE = vvvRE[:-1] + r'(?:[\&\-]\d+)?)'
# Index/Supp/etc
iiiiRE = r'(?:abstracts?|addendum|brief|Directory|exec(?:utive)? summ(?:ary)?|guide|handbook|(?:author |cum |master |subj )?Index(?:es)?|revisions?|spec(?:ial(?:edition|issue|rep|report)?)?|Suppl?\.?(?: \d+)? ?)|title sheet|updates?'
# Month/season
mmmRE = r'(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|June?|July?|Aug(?:ust)?|Sept?(?:ember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?|Spr(?:ing)?|Sum(?:mer)?|Fall?|Aut(?:umn)?|Win(?:ter)?)'
# Month + date(s)
mmm_ddRE = mmmRE + r'(?: \d{1,2}(?:[\-\/]\d{1,2})?)?'
# 4-digit year/range of years (post–18th-century)
yyyyRE = r'(?:1[89]|20)\d{2}'
# 4-digit year leading to a range
#   E.g., 1976-89
yyyy_yyRE = r'(?:1[89]|20)\d{2}(?:-\d{2}|-\d{4})?'
# 4-digit year leading to a range, possibly using a SLASH
#   E.g., 1976/89
#   Beware false positives
#   Remember to replace the slash with a hyphen in the Chron_I
yyyy_SyyRE = r'(?:1[89]|20)\d{2}(?:[\-\/]\d{2}|[\-\/]\d{4})?'

#### Call it repeatedly for each regex that you come up with:

In [70]:
# Just volume(s) & nothing else
extract_and_fill(r'^' + vvv_vvRE + '$',
                 ['Enum_A'])

In [71]:
# Volume + issue
extract_and_fill(r'^' + vvvRE + '[ \/]no\. ?(\d+)$',
                 ['Enum_A', 'Enum_B'])

In [None]:
# Vol + issue + year
extract_and_fill(r'^' + vvvRE + r'[ \/]no\. ?(\d+) (' + yyyyRE + r')$',
    ['Enum_A', 'Enum_B', 'Chron_I'])

In [72]:
# Vol + issue + date + year
extract_and_fill(r'^' + vvvRE + '[ \/]no\. ?(\d+) (' + mmm_ddRE + r'),? (' + yyyyRE + ')$',
                 ['Enum_A', 'Enum_B', 'Chron_J', 'Chron_I'])

In [73]:
# Volume(s) + "Index"
extract_and_fill(r'^' + vvv_vvRE + ' (' + iiiiRE + r')$',
                 ['Enum_A', 'Enum_C'])

In [74]:
# Volume + year(s)
extract_and_fill(r'^' + vvv_vvRE + ' +\(?(' + yyyy_yyRE + r')\)?$',
                 ['Enum_A', 'Chron_I'])

In [None]:
# Volume + part(s)
extract_and_fill(r'^' + vvvRE + r' pt\. ?(\d+[a-z]?(?:[\&\-]\d+)?)$',
    ['Enum_A', 'Enum_C'])

In [75]:
# Just issue & nothing else
extract_and_fill(r'^no\. ?(\d+)$',
                 ['Enum_B'])

In [422]:
# Issue + date + year
extract_and_fill(r'^no\. ?(\d+) (' + mmm_ddRE + ') (' + yyyy_yyRE + r')$',
                 ['Enum_B', 'Chron_J', 'Chron_I'])

In [76]:
# Issue + year(s)
extract_and_fill(r'^no\. ?(\d+) (' + yyyy_yyRE + r')$',
                 ['Enum_B', 'Chron_I'])

In [77]:
# Just part(s) & nothing else
extract_and_fill(r'^pt\. ?(\d+[a-z]?(?:[\&\-]\d+)?)$',
                 ['Enum_C'])

In [78]:
# Just year/range of years
extract_and_fill(r'^(' + yyyy_yyRE + r')$',
                 ['Chron_I'])

In [79]:
# Year(s) + volume(s)
extract_and_fill(r'^(' + yyyy_yyRE + r') ' + vvv_vvRE + '$',
                 ['Chron_I', 'Enum_A'])

In [80]:
# Year(s) + part(s)
extract_and_fill(r'^(' + yyyy_yyRE + r') pt\. ?(\d+(?:[\-\/]\d+)?)$',
                 ['Chron_I', 'Enum_C'])

In [81]:
# Year(s) + "Index"
extract_and_fill(r'^(' + yyyy_yyRE + r') (' + iiiiRE + r')$',
                 ['Chron_I', 'Enum_C'])

In [82]:
# Year + month/season/date
extract_and_fill(r'^(' + yyyyRE + r') (' + mmm_ddRE + r')$',
                 ['Chron_I', 'Chron_J'])

In [83]:
# Range of dates within one calendar year
extract_and_fill(r'^(' + mmm_ddRE + r'[\-\/]' + mmm_ddRE + '), (' + yyyyRE + ')$',
                 ['Chron_J', 'Chron_I'])

In [441]:
# Range of dates and range of years (only capture years)
extract_and_fill(r'^' + mmm_ddRE + r'[\-\/]' + mmm_ddRE + ', (' + yyyyRE + r'[\-\/]\d\d(?:\d\d)?)$',
    ['Chron_I'])

In [84]:
# Date + year
extract_and_fill(r'^(' + mmm_ddRE + '),? (' + yyyyRE + ')$',
                 ['Chron_J', 'Chron_I'])

In [450]:
# SPECIAL CASES: Range of dates spanning across years
#   Only capture the year range, not the dates
# ex. "April 1976-February 1980" ⇒ 1976-1980
exp = re.compile(r'^' + mmm_ddRE + ',? (' + yyyyRE + ') ?- ?' + mmm_ddRE + ',? (' + yyyyRE + ')$')
for row, years in df['Description'].str.extract(exp, expand=True).dropna().apply('-'.join, axis=1).items():
    df.at[row, 'Chron_I'] = years
# ex. "Nov 11-May 16, 1977-1978"
exp = re.compile(r'^' + mmm_ddRE + ' ?- ?' + mmm_ddRE + ' (' + yyyy_SyyRE + ')$')
for row, years in df['Description'].str.extract(exp, expand=True).dropna().apply('-'.join, axis=1).items():
    df.at[row, 'Chron_I'] = years.replace('/', '-')
# ex. "Nov 11-May 16, 1977-1978"
exp = re.compile(r'^' + mmm_ddRE + ' ?- ?' + mmm_ddRE + ' (' + yyyy_SyyRE + ')$')
for row, years in df['Description'].str.extract(exp, expand=True).dropna().apply('-'.join, axis=1).items():
    df.at[row, 'Chron_I'] = years.replace('/', '-')

In [459]:
# SPECIAL CASES: Range of volumes spanning across years
# ex. "v.16-20 1976-1980"
exp = re.compile(r'^' + vvv_vvRE + ' (' + yyyy_yyRE + ')$')
for i, field in enumerate(['Enum_A', 'Chron_I']):
    for row, x in df['Description'].str.extract(exp, expand=True).dropna()[i].items():
        df.at[row, field] = x
# ex. "v.76 Jan 16, 1986-v.80 Dec 1989"
#   Ignore dates, capture vols & years
exp = re.compile(r'^' + vvvRE + ' ' + mmm_ddRE + r',? (' + yyyyRE + ') ?- ?' + vvvRE + ' ' + mmm_ddRE + r',? (' + yyyyRE + ')$')
for item, field in enumerate(['Enum_A', 'Chron_I']):
    for row, x in df['Description'].str.extract(exp, expand=True).dropna()[[item, item + 2]].apply('-'.join, axis=1).items():
        df.at[row, field] = x
# ex. "v.43-45 Jun 21-Jan 30, 1928/30"
#   Ignore dates, capture vols & years
exp = re.compile(r'^' + vvv_vvRE + ' ' + mmm_ddRE + ' ?- ?' + mmm_ddRE + ',? (' + yyyy_SyyRE + ')$')
for i, field in enumerate(['Enum_A', 'Chron_I']):
    for row, x in df['Description'].str.extract(exp, expand=True).dropna()[i].items():
        df.at[row, field] = x.replace('/', '-')

### Once all those replacements are done, pull filled-in records out to a new dataframe

In [29]:
# Create a dataframe to hold JUST records that get filled
filled = pd.DataFrame()
# Populate the new dataframe with any records that now have at least one Enum/Chron field filled
filled = df.dropna(subset=EC_fields, thresh=1)

#### View df & filled

In [47]:
df

Unnamed: 0,Title,Location,Policy,Material,Description,Item_ID,Holdings_ID,MMS_ID,Enum_A,Enum_B,Chron_I,Chron_J
0,Nebraska life,LL Periodicals,Periodicals,Issue,2022 Nebraska Traveler,23128725750006388,2271324180006388,991004431099706388,,,,
1,Kazoo,LL Juvenile Periodicals,Periodicals,Issue,no.14 Fal 2019,23128314540006388,2263486450006388,991001229859706388,,,,
2,Nebraska life,LL Periodicals,Periodicals,Issue,2021 Nebraska Traveler,23117539840006388,2271324180006388,991004431099706388,,,,
3,Ladybug,LL Juvenile Periodicals,Periodicals,Issue,v.31 no.6 Mar 2021,23115249638806388,2272848990006388,991004669599706388,,,,
4,Cricket,LL Juvenile Periodicals,Periodicals,Issue,v.48 no.6 Mar 2021,23115249600006388,2265645400006388,991004670019706388,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
2321,Publishers weekly,LL Periodicals,Periodicals,Issue,v.267 no.41 Oct 12 2020,2364607520006388,2264607990006388,991000982779706388,,,,
2322,Publishers weekly,LL Periodicals,Periodicals,Issue,v.267 no.42 Oct 19 2020,2364607510006388,2264607990006388,991000982779706388,,,,
2323,Publishers weekly,LL Periodicals,Periodicals,Issue,v.267 no.43 Oct 26 2020,2364607500006388,2264607990006388,991000982779706388,,,,
2324,Publishers weekly,LL Periodicals,Periodicals,Issue,v.267 no.45 Nov 9 2020,2364607490006388,2264607990006388,991000982779706388,,,,


___________________________________

In [None]:
filled

## Apply the changes via the API and log filled items to the Filled CSV

#### Run this bit to see what got "filled" before hitting the API

In [None]:
c = 0
for index, row in filled.fillna('').iterrows():
    c += 1
    print(c, ' / '.join([row['MMS_ID'], row['Holdings_ID'], row['Item_ID']]),
          str(row['Description']),
          ' | '.join(x or '' for x in [row['Enum_A'], row['Enum_B'], row['Enum_C'], row['Chron_I'], row['Chron_J']]),
          sep="\t")

#### This actually applies the changes

In [None]:
records = len(filled)
# If the "filled" file doesn't exist, note that we'll need a header
needs_header=not file_exists(filled_csv)

with open(err_log_txt, 'a') as err_log:
    for index, row in filled.fillna('').iterrows():
        c += 1
        r = requests.get(''.join([baseurl,
                                  item_query.format(mms_id=str(row['MMS_ID']),
                                                    holding_id=str(row['Holdings_ID']),
                                                    item_pid=str(row['Item_ID']),
                                                    apikey=apikey)]))
        rdict = xmltodict.parse(r.text)
        if r.status_code == 429:  # Too many requests--daily limit
            print()
            print('Reached API request limit for today. Stopping execution.')
            print()            
            ## Drop this record & everything after from "filled"
            filled = filled.iloc[:c-1]
            break
        if r.status_code != 200:
            e = xmltodict.parse(r._content)
            # Log the error
            print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                  ' Error FETCHING item ', row['Item_ID'], ': (', r.status_code, ') ',
                  e['web_service_result']['errorList']['error']['errorMessage'],
                  sep='',
                  file=err_log)
            # Remove this item from the "filled" df
            filled = filled.drop([index])
            continue
        if (c % (records/100) < 1):
            print(int(100*c/records), '% complete', sep='')#, end='\r')
            sleep(5)
            
        # Merge derived values into the retrieved data (rdict)
        
        rdict['item']['item_data']['description'] = str(row['Description'])
        
        rdict['item']['item_data']['enumeration_a'] = str(row['Enum_A'])
        rdict['item']['item_data']['enumeration_b'] = str(row['Enum_B'])
        rdict['item']['item_data']['enumeration_c'] = str(row['Enum_C'])
        rdict['item']['item_data']['chronology_i'] = str(row['Chron_I'])
        rdict['item']['item_data']['chronology_j'] = str(row['Chron_J'])
        # Set an internal note, if there's an empty one available
        if ('Enum/Chron derived from Description' not in rdict['item']['item_data'].values()):
            if (not rdict['item']['item_data']['internal_note_1']):
                rdict['item']['item_data']['internal_note_1'] = 'Enum/Chron derived from Description'
            elif (not rdict['item']['item_data']['internal_note_2']):
                rdict['item']['item_data']['internal_note_2'] = 'Enum/Chron derived from Description'
            elif (not rdict['item']['item_data']['internal_note_3']):
                rdict['item']['item_data']['internal_note_3'] = 'Enum/Chron derived from Description'
            else: # Nbd, just log it
                print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                      ' No internal note available for item MMS ID ',
                      str(row['MMS_ID']), sep="", file=err_log)
 
        # Push the altered record back into Alma
        pxml = xmltodict.unparse(rdict)
        p = requests.put(''.join([baseurl,
                                  item_query.format(mms_id=row['MMS_ID'],
                                                    holding_id=row['Holdings_ID'],
                                                    item_pid=row['Item_ID'],
                                                    apikey=apikey)]),
                         data=pxml.encode('utf-8'), headers={'Content-Type': 'application/xml'})
        if r.status_code == 429:  # Too many requests--daily limit
            print()
            print('Reached API request limit for today. Stopping execution.')
            print()            
            ## Drop this record & everything after from "filled"
            filled = filled.iloc[:c-1]
            break
        if p.status_code != 200:
            e = xmltodict.parse(p._content)
            # Log the error
            print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), ' Error UPDATING item ', row['Item_ID'], ': (', p.status_code, ') ',
                e['web_service_result']['errorList']['error']['errorMessage'],
                sep='', file=err_log)
            # Remove this item from the "filled" df
            filled = filled.drop([index])
            continue
        print(c, ' / '.join([row['MMS_ID'], row['Holdings_ID'], row['Item_ID']]),
              row['Description'],
              ' | '.join(x or '' for x in [row['Enum_A'], row['Enum_B'], row['Chron_I'], row['Chron_J']]),
              sep="\t")

        # Log it to the CSV
        #    Btw, the 'to_frame().T' transposes it, so it all goes in as a single comma-separated row
        row.to_frame().T.to_csv(filled_csv, mode='a', index=False, header=needs_header)
        needs_header = False # Henceforth

#### Purge filled rows from the original CSV

In [31]:
# Purge filled records from the original df
df = df.loc[~df['Item_ID'].isin(filled['Item_ID'])]

# Re-create the original CSV from that df
df.to_csv(exported_csv, index=False) # By default, will overwrite

## Testing stuff

In [249]:
### SPECIAL CASE: 3-digit Description, which is a year with first digit removed

exp = re.compile(r'^(9\d{2}(?:\-19\d{2}|\-20\d{2})?)$')
df['Chron_I'] = df['Description'] = '1' + df['Description'].str.extract(exp, expand=True).dropna()

exp = re.compile(r'^' + vvv_vvRE + ' ' + mmm_ddRE + ' ?- ?' + mmm_ddRE + ',? (' + yyyyRE + '[\-\/]\d\d(?:\d\d)?)$')
for i, field in enumerate(['Enum_A', 'Chron_I']):
    for row, x in df['Description'].str.extract(exp, expand=True).dropna()[i].items():
#         df.at[row, field] = x.replace('/', '-')
        print(row, field, x.replace('/', '-'))

df['Chron_I'] = df['Description'] = '1' + df['Description'].str.extract(exp, expand=True).fillna(df['Description'])