In [1]:
import os
import pandas as pd
import altair as alt
import warnings
warnings.filterwarnings('ignore')
import sys
sys.path.append("..")
from compute_magazines.load_datasets import *

In [2]:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

In [3]:
htrc_records = pd.read_csv("../datasets/original_files/htrc_third_world_records.csv")

In [4]:
htrc_records.Record_URL = htrc_records.Record_URL.str.replace(
    ';', ',').str.split(',')


In [5]:
cleaned_htrc_records = htrc_records.explode('Record_URL')


In [6]:
cleaned_htrc_records.Periodical_Name.unique()

array(['La Documentation Arabe ', 'Afrique Documents', 'OAU Bulletin',
       'IM information bulletin /Mozambique Information Agency',
       'News from Xinhua News Agency, China.Daily bulletin',
       "Bulletin d’information  Bureau d'information du Gouvernement révolutionnaire provisoire de la République du Sud Viet-Nam à Paris.",
       'Révolution africaine', 'Arab Observer and The Scribe',
       'Afro Asian Bulletin', 'al-Nashrah al-Ifrīqīyah al-Āsyawīyah',
       'Afro Asian Peoples', 'Afro Asian and World Affairs',
       'Solidarity AAPSO', 'Tricontinental', 'Afro Asian Women',
       'Solidarity SWAPO', 'Asia and Africa today', 'Spearhead',
       'Mozambican Revolution', 'Black News', 'Black World',
       'Negro Digest', 'Direct from Cuba', 'Lotus',
       'Nahdat Ifriqiyah/African Renaissance', 'Liberator', 'Freedomways',
       'LSM news /Liberation Support Movement, Information Center',
       'Presence Africaine'], dtype=object)

In [7]:
# url = cleaned_htrc_records.Record_URL[0]
# result = requests.get(url)
# ht_page = result.content
# soup = BeautifulSoup(ht_page, 'html.parser')

# rows = soup.find_all('tr')
# for row in rows:

#     if row.find(class_="IndItem"):
#         print('links', row.find('a').get('href'))
#         print('dates', row.find(class_="IndItem").text)
#         print(row.find_all('td')[-1].get_text())


In [8]:
if os.path.exists("../datasets/original_files/htrc_third_world_records_links_with_metadata.csv"):
    links_df = pd.read_csv("../datasets/original_files/htrc_third_world_records_links_with_metadata.csv")
else:
    dfs = []
    progress_bar = tqdm(total=len(cleaned_htrc_records), desc="Getting HathiTrust links")
    for index, row in cleaned_htrc_records.iterrows():
        progress_bar.update(1)
        url = row['Record_URL']
        result = requests.get(url)
        ht_page = result.content
        soup = BeautifulSoup(ht_page, 'html.parser')

        table_rows = soup.find_all('tr')

        for table_row in table_rows:
            if table_row.find('a', class_="rights-Array searchonly"):
                link = table_row.find('a', class_="rights-Array searchonly").get('href')
                htid = link.split('/')[-1]
                date = table_row.find(class_="IndItem").text
                source = table_row.find_all('td')[-1].get_text()
                new_df = {}
                new_df['link'] = link
                new_df['htid'] = htid
                new_df['date'] = date
                new_df['original_source'] = source
                new_df['record_url'] = url
                new_df['periodical_name'] = row['Periodical_Name']
                new_df['publication_type'] = row['Type']
                dfs.append(new_df)
    progress_bar.close()
    links_df = pd.DataFrame(dfs)
    links_df.to_csv("../original_files/htrc_third_world_records_links.csv", index=False)


In [10]:
output_path = '../datasets/ht_ef_datasets/full_hathitrust_annotated_magazines_with_htids.csv'
output_directory = "../datasets/ht_ef_datasets/"
df = get_full_combined_dataset(output_path, output_directory)
output_path = "../datasets/ht_ef_datasets/combined_full_hathitrust_annotated_magazines_with_htids.csv"
issues_df = get_combined_issues(output_path, df)
if os.path.exists("../datasets/original_files/htrc_third_world_records_links_with_metadata.csv"):
    full_links_df = pd.read_csv("../datasets/original_files/htrc_third_world_records_links_with_metadata.csv")
else:
    htrc_periodicals_full = pd.read_csv("../datasets/original_files/htrc_periodicals_se.csv")
    full_links_df = pd.merge(links_df, htrc_periodicals_full, on='htid', how='left')
    full_links_df.to_csv("../datasets/original_files/htrc_third_world_records_links_with_metadata.csv", index=False)



In [19]:
issues_df.cleaned_magazine_title.unique()

array(['afro-asian_and_world_affairs', 'afro-asian_bulletin_',
       'afro-asian_peoples', 'arab_observer',
       'arab_observer_and_the_scribe', 'liberator', 'lotus', 'solidarity',
       'the_scribe', 'tricontinental'], dtype=object)

In [20]:
issues_df.start_issue.nunique()

435

In [24]:

from htrc_features import FeatureReader



In [40]:
htid = full_links_df.htid[0]
htid

'inu.30000093395964'

In [25]:
full_links_df = links_df.copy()

In [26]:
full_links_df["keep_periodical"] = True
list_of_existing_periodicals = [
    "Afro Asian and World Affairs", "Afro Asian Bulletin", "Afro Asian Peoples", "Arab Observer and The Scribe", "Freedomways", "Liberator", "Lotus", "Solidarity AAPSO", "Tricontinental"]
full_links_df.loc[full_links_df.periodical_name.isin(list_of_existing_periodicals), "keep_periodical"] = False




In [27]:
periodicals = full_links_df[full_links_df.keep_periodical == True].periodical_name.unique().tolist()

periodicals = [periodical.strip().replace(" ", "_").replace('.', '').replace('/', '') for periodical in periodicals]


In [28]:
full_links_df['directory_name'] = "../ht_ef_datasets/" +full_links_df.periodical_name.str.strip().str.replace(" ", "_").str.replace('.', '').str.replace('/', '') + "_HathiTrust/"

In [29]:
import os

# for periodical in periodicals:
#     os.mkdir(f"../ht_ef_datasets/{periodical}_HathiTrust")

In [30]:
full_links_df['directory_exists'] = full_links_df.directory_name.apply(lambda x: os.path.exists(x))

In [68]:
full_links_df.to_csv("../original_files/htrc_third_world_records_links_with_metadata.csv", index=False)

In [67]:
dfs = []
progress_bar = tqdm(total=len(full_links_df[full_links_df.keep_periodical == True]), desc="Getting HathiTrust volumes")
for _, row in full_links_df[full_links_df.keep_periodical == True].iterrows():
    progress_bar.update(1)
    dir_name = row['directory_name']
    for dir, subdir, files in os.walk(dir_name):
        for file in files:
            vol_df = pd.read_csv(dir_name + file)
            
            row_df = pd.DataFrame(row).T
            row_df = row_df.rename(columns={'title': 'metadata_title', 'pub_place': 'metadata_pub_place', 'author': 'metadata_author', 'pub_date': 'metadata_pub_date', 'language': 'metadata_language', 'genre': 'metadata_genre',})
            cols = list(set(vol_df.columns)& set(row_df.columns))
            merged_df = pd.merge(vol_df, row_df, on=cols, how='left')
            dfs.append(merged_df)

progress_bar.close()

Getting HathiTrust volumes:   0%|          | 0/353 [00:09<?, ?it/s]


KeyboardInterrupt: 

In [57]:
final_df = pd.concat(dfs)

In [69]:
final_df.columns


Index(['sequence', 'section', 'token', 'pos', 'count', 'htid', 'title',
       'author', 'pub_date', 'pub_place', 'language', 'publisher', 'genre',
       'source_institution', 'link', 'date', 'original_source', 'record_url',
       'periodical_name', 'publication_type', 'access', 'rights', 'ht_bib_key',
       'description', 'source', 'source_bib_num', 'oclc_num', 'isbn', 'issn',
       'lccn', 'metadata_title', 'imprint', 'rights_reason_code',
       'rights_timestamp', 'us_gov_doc_flag', 'rights_date_used',
       'metadata_pub_place', 'lang', 'bib_fmt', 'collection_code',
       'content_provider_code', 'responsible_entity_code',
       'digitization_agent_code', 'access_profile_code', 'metadata_author',
       'keep_periodical', 'directory_name', 'directory_exists'],
      dtype='object')

In [102]:
# progress_bar = tqdm(total=len(full_links_df[full_links_df.keep_periodical == True]), desc="Getting HathiTrust links")
# for index, row in full_links_df[full_links_df.keep_periodical == True].iterrows():
#     progress_bar.update(1)
#     dir_path = row.directory_name
#     htid = row.htid
#     file_name = row.periodical_name.replace(" ", "_").replace(
#         '.', '').replace('/', '') + '_' + htid.replace('.', '_')
#     if os.path.exists(dir_path + file_name + '.csv'):
#         continue
#     try: 
#         vols = FeatureReader(ids=[htid])
#         for vol in vols:
            
#             volume_df = vol.tokenlist(section='all')
#             volume_df = volume_df.reset_index()
#             volume_df = volume_df.rename(
#                 columns={'lowercase': 'token', 'page': 'sequence'})
#             volume_df['htid'] = htid
#             volume_df['title'] = vol.title
#             volume_df['author'] =','.join(vol.author)
#             volume_df['pub_date'] = vol.pub_date
#             volume_df['pub_place'] = vol.pub_place
#             volume_df['language'] = vol.language
#             volume_df['publisher'] = vol.publisher
#             volume_df['genre'] = ','.join(vol.genre)
#             volume_df['source_institution'] = vol.source_institution
#             volume_df.to_csv(f"{dir_path}{file_name}.csv", index=False)
#     except:
#         print(f"Error getting {htid} for {row.periodical_name}")
# progress_bar.close()