# Civil War Book Review scratchpad

https://www.reportlab.com/docs/reportlab-userguide.pdf



In [1]:
from collections import namedtuple
import csv
import re
import os

In [2]:
# Every loaded record, keyed by the value of its ID column.
# csv_to_dictionary() adds to this dict each time it is called.
cwbr_dict = {}


def csv_to_dictionary(filename):
    """Load one tab-delimited file into the module-level ``cwbr_dict``.

    The first row of the file supplies the field names of a ``CWBR``
    namedtuple; every following row becomes one ``CWBR`` instance stored
    under its ``ID`` value.  A row whose ID is already present replaces
    the earlier entry.

    filename: path of the file to examine [Split_Into_Types/---------.csv]
    """
    with open(filename, newline='') as handle:
        rows = csv.reader(handle, delimiter="\t", quotechar='"')
        CWBR = namedtuple('CWBR', next(rows))
        for fields in rows:
            record = CWBR(*fields)
            cwbr_dict[record.ID] = record
            
# Feed each listed file to csv_to_dictionary(); the commented-out paths are not loaded.
for filename in [
    # 'Split_Into_Types/Annotations_and_cousins.csv',
    'Split_Into_Types/audio_supplemental.csv',
    'Split_Into_Types/Needs_Unpacking.csv',
    'Split_Into_Types/Main_set.csv',
    'Split_Into_Types/Reference_and_audio.csv',
    'Split_Into_Types/Review_no_text.csv',
    'Split_Into_Types/Simple_text.csv',
    # 'Split_Into_Types/Review_Just_URL.csv',
]:
    csv_to_dictionary(filename)

In [3]:
# Count the records whose Record_type is anything other than "Interview".
count = 0
for record in cwbr_dict.values():
    if record.Record_type == "Interview":
        continue
    # print(record.ID, record.Record_type)
    count += 1
print(count)

1735


In [4]:
# Prints the ID of the record with the longest Review.
# Ties keep the first record seen; if every Review is empty, '' is printed.

longest_item = ('', '')
for uid, record in cwbr_dict.items():
    review = record.Review
    if len(longest_item[1]) < len(review):
        longest_item = (uid, review)
print(longest_item[0])

3454


In [5]:
def strip_and_replace(text):
    """Reduce `text` to a loose comparison key.

    Trims surrounding whitespace, deletes every ':', '.', ',', '"' and
    literal "<p>" (in that order), lower-cases the result, and finally
    removes all space characters.  Only the lower-case "<p>" is deleted,
    because that step happens before lower-casing.
    """
    cleaned = text.strip()
    for token in (":", ".", ",", '"', "<p>"):
        cleaned = cleaned.replace(token, "")
    return cleaned.lower().replace(' ', '')

# Is the Teaser a subset of the Title?  Tally the records where it is (inside)
# and where it is not (not_inside).
inside, not_inside = 0, 0
for issue in cwbr_dict.values():
    # First try an exact substring match; failing that, compare the normalised
    # forms with the last five characters of the Teaser dropped
    # (presumably a truncation marker -- confirm against the data).
    teaser_in_title = (
        issue.Teaser in issue.Title
        or strip_and_replace(issue.Teaser[:-5]) in strip_and_replace(issue.Title)
    )
    if teaser_in_title:
        inside += 1
    else:
        not_inside += 1
        # print("Teaser: {}\nTitle: {}\n\n".format(issue.Teaser, issue.Title))
# print(inside, not_inside)

In [6]:
# How many records have a Sub_headline?  How do the Sub_headlines compare to Titles and Headlines?

count = 0
for uri, issue in cwbr_dict.items():
    if not issue.Sub_headline:
        continue
    count += 1
    # print("{}\nS: {}\nH: {}\nT: {}\n\n".format(uri, issue.Sub_headline, issue.Headline, issue.Title))
# print(count)

In [7]:
# How many records have a Headline?  Are Headlines a shortened form of the Title?
# `count` is the number of non-empty Headlines; `similar` is how many of those
# appear inside the Title once both are normalised with strip_and_replace().

count = 0
similar = 0
for uri, issue in cwbr_dict.items():
    if not issue.Headline:
        continue
    count += 1
    if strip_and_replace(issue.Headline) in strip_and_replace(issue.Title):
        similar += 1
        # print("{}\nS: {}\nH: {}\nT: {}\n\n".format(uri, issue.Sub_headline, issue.Headline, issue.Title))
# print(count, similar)

In [8]:
# A record counts as "with metadata" when either SubCategories or Metadata is non-empty.
items_with_metadata = sum(
    1 for item in cwbr_dict.values() if item.SubCategories or item.Metadata
)
items_without_metadata = len(cwbr_dict) - items_with_metadata
print(items_with_metadata, items_without_metadata)

109 1697


In [9]:
# what is our datastructure?  give an example

# item = cwbr_dict['6115']
# print(dir(item))
# for field in item._fields:
#     print("{}\t\t{}".format(field, getattr(item, field)[:100]))

In [10]:
# How many records have a Teaser?  (Uncomment the prints to dump every Teaser to the screen.)

count = 0
for record in cwbr_dict.values():
    if not record.Teaser:
        continue
    count += 1
    # print(record.Teaser)
# print(count)

In [11]:
from reportlab.lib.enums import TA_JUSTIFY, TA_CENTER
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.units import inch
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
from reportlab.lib.units import inch

In [12]:
# Paragraph styles used by make_pdf(), registered on top of ReportLab's sample stylesheet:
#   Justify     - justified body text with a 25-point first-line indent
#   TitleBig    - centred bold headline
#   SubTitleBig - centred bold sub-headline
#   AuthorBig   - centred bold author / reviewer line
#   Citation    - justified text with no first-line indent

styles = getSampleStyleSheet()

for style_name, attributes in (
    ('Justify', {'fontName': 'Times-Roman',
                 'alignment': TA_JUSTIFY,
                 'firstLineIndent': 25,
                 'fontSize': 14,
                 'leading': 17}),
    ('TitleBig', {'fontName': 'Times-Bold',
                  'alignment': TA_CENTER,
                  'fontSize': 14,
                  'leading': 22}),
    ('SubTitleBig', {'fontName': 'Times-Bold',
                     'alignment': TA_CENTER,
                     'fontSize': 14,
                     'leading': 18}),
    ('AuthorBig', {'fontName': 'Times-Bold',
                   'alignment': TA_CENTER,
                   'fontSize': 12,
                   'leading': 14}),
    ('Citation', {'fontName': 'Times-Roman',
                  'alignment': TA_JUSTIFY,
                  'firstLineIndent': 0,
                  'fontSize': 14,
                  'leading': 17}),
):
    styles.add(ParagraphStyle(name=style_name, **attributes))


In [13]:
# print(cwbr_dict['6048'])

In [14]:
# script to turn the html into a pdf.  
# several different variations are included, to challenge our dataset -- to see which have errors.

# This one crashes on the first failure.


def do_thing(uri):
    """Build the PDF for the record stored in ``cwbr_dict`` under `uri`.

    Nothing is caught here: a missing key raises KeyError and any error
    raised inside make_pdf() propagates to the caller.
    """
    make_pdf(cwbr_dict[uri])

def strip_breaks(text):
    """Return `text` with the break/paragraph tags removed.

    The tags are deleted one after another, in this order:
    "<br>", "</br>", "<p>", "</p>", "<BR>", "</BR>".
    Other spellings (e.g. "<P>" or "<br/>") are left untouched.
    """
    return (
        text.replace("<br>", "")
        .replace("</br>", "")
        .replace("<p>", "")
        .replace("</p>", "")
        .replace("<BR>", "")
        .replace("</BR>", "")
    )
        
        
def _citation_text(issue):
    """Return the citation line for `issue` as ReportLab paragraph markup.

    Layout: bold author list, italic title, then publisher, publication
    date and price, followed by " ISBN ..." when an ISBN is present.
    """
    authors_list = [name for name in (issue.Auth_1, issue.Auth_2, issue.Auth_3) if name]
    if len(authors_list) == 1:
        # NOTE(review): the single-author form has no trailing period, unlike
        # the two- and three-author forms below -- confirm this is intended.
        authors = '<b>{}</b>'.format(authors_list[0])
    elif len(authors_list) == 2:
        authors = '<b>{} and {}.</b>'.format(authors_list[0], authors_list[1])
    elif len(authors_list) == 3:
        authors = '<b>{}, {}, and {}.</b>'.format(authors_list[0], authors_list[1], authors_list[2])
    else:
        authors = ''

    book_title = "<i>{}.</i>".format(strip_breaks(issue.Title))
    publisher = '{}, '.format(issue.Publisher) if issue.Publisher else ''
    pub_date = '{}. '.format(issue.Pub_date) if issue.Pub_date else ''

    citation = '{} {} {}{}{}'.format(authors,
                                     strip_breaks(book_title),
                                     publisher,
                                     pub_date,
                                     strip_breaks(issue.Price),
                                     )
    if issue.ISBN:
        citation += ' ISBN {}'.format(strip_breaks(issue.ISBN))
    return citation


def make_pdf(issue):
    """Render one record to ``output/<ID>.pdf``.

    The page is built top to bottom from:
      1. the Headline (upper-cased) and Sub_headline, one paragraph per
         "<p>"-separated piece;
      2. unless the record is an 'Editorial': the reviewer (for 'Review' and
         'Classics' records) or the authors (every other type), followed by
         the citation line;
      3. the Review text, one justified paragraph per "<p>"/"<br>" piece.

    issue: a CWBR record (namedtuple row) as stored in ``cwbr_dict``.

    The ``output`` directory is created if it does not exist yet.  Errors
    raised by ReportLab while parsing the markup or building the document
    are not caught here.
    """
    # Without this, a missing directory makes every record fail at build time.
    os.makedirs('output', exist_ok=True)

    doc = SimpleDocTemplate('output/{}.pdf'.format(issue.ID),
                            pagesize=letter,
                            rightMargin=72,
                            leftMargin=72,
                            topMargin=72,
                            bottomMargin=50)

    template = []

    for item in issue.Headline.split('<p>'):
        if item:
            template.append(Paragraph(item.upper(), styles["TitleBig"]))
    # template.append(Spacer(1,0.1*inch))

    for item in issue.Sub_headline.split('<p>'):
        if item:
            template.append(Paragraph(item, styles["SubTitleBig"]))
    template.append(Spacer(1, 0.3*inch))

    if issue.Record_type != 'Editorial':
        if issue.Record_type not in ('Review', 'Classics'):
            # Non-review records show the book authors under the headline.
            for author in (issue.Auth_1, issue.Auth_2, issue.Auth_3):
                author = author.replace('<br>', '<p>').replace('</br>', '</p>')
                if author:
                    template.append(Paragraph(author, styles["AuthorBig"]))
        elif issue.Reviewer:
            # Reviews and Classics show the reviewer instead.
            template.append(Paragraph(issue.Reviewer, styles["AuthorBig"]))
        template.append(Spacer(1, 0.5*inch))

        template.append(Paragraph(_citation_text(issue), styles["Citation"]))
        template.append(Spacer(1, 0.2*inch))

    # Treat <br> like <p> so both act as paragraph separators in the body text.
    list_of_paragraphs = issue.Review.replace('<br>', '<p>').replace('</br>', '</p>').split('<p>')
    for item in list_of_paragraphs:
        if item:
            template.append(Paragraph(item, styles["Justify"]))
            template.append(Spacer(1, 0.2*inch))

    doc.build(template)

In [15]:
# Render the single record with ID '1003'; do_thing() catches nothing, so any error surfaces here.
do_thing('1003')

In [16]:
# This one logs which ones fail, then continues to the next one.

def do_thing_log_fails(uri):
    """Run do_thing(uri); on failure, record `uri` in the module-level ``failed`` list.

    ``failed`` must already exist when this is called.  Only ``Exception``
    subclasses are treated as failures, so KeyboardInterrupt and SystemExit
    still stop a long batch run instead of being logged as a bad record.
    """
    try:
        do_thing(uri)
    except Exception:
        failed.append(uri)

In [17]:
# Runs every record loaded from the csv files through do_thing_log_fails();
# the IDs that raise are collected in the list called "failed".

failed = []

for record_id in cwbr_dict:
    do_thing_log_fails(record_id)

In [18]:
# Prints the failed IDs, then how many failed, how many records are loaded,
# and what percent failed.

# Printed on its own line: nesting print(failed) inside the summary print
# would put its return value (None) into the summary.
print(failed)

total = len(cwbr_dict)
# Guard against an empty cwbr_dict so the summary never divides by zero.
percent_failed = 100 * len(failed) / total if total else 0.0
print(len(failed),
      total,
      '{:.1f}%'.format(percent_failed))


[]
0 1806 None 0.0%


In [19]:
import sys

# Re-run every failed record and keep the exception details.
# Each parse_error entry is (uri, sys.exc_info()), i.e. (uri, (type, value, traceback)).
extra_closed_tag = []
parse_error = []

for uri in failed:
    try:
        do_thing(uri)
    except Exception:
        failure = (uri, sys.exc_info())
        parse_error.append(failure)

print(len(failed))
# print(failed)
if parse_error:
    print(parse_error[0])

# with open('exception.txt', 'w') as f:
#     output_text = ''
#     for item in parse_error:
#         output_text += '{}\n'.format(item)
#     f.write(output_text)



0


In [20]:
# This identifies 3 causes of errors in parsing the data -- and a Leftovers group
# for items with an unknown error.  Each parse_error item is (uri, exc_info), so
# item[1][1] is the exception instance; its first argument is searched for the
# markup that the failing paragraph starts with.

Review_starts_with_url = []
Review_starts_with_image = []
Review_starts_with_header_url = []
Leftovers = []


for item in parse_error:
    # if item[0] in Other_breaking_types:
    #     continue
    exc_args = item[1][1].args
    # Exceptions with no arguments, or whose first argument is not text
    # (e.g. an OSError errno), cannot match any pattern and go to Leftovers
    # instead of crashing this loop.
    message = exc_args[0] if exc_args and isinstance(exc_args[0], str) else ''
    if "<para><a href=" in message:
        Review_starts_with_url.append(item)
    elif "<para><img " in message:
        Review_starts_with_image.append(item)
    elif "<para><h3><a href=" in message:
        Review_starts_with_header_url.append(item)
    else:
        Leftovers.append(item)

In [21]:
# One "ID: Record_type," entry per loaded record, joined into a single string.
items = ''.join(
    '{}: {},'.format(k, v.Record_type) for k, v in cwbr_dict.items()
)
# print(items)

In [22]:
# Prints the number of items in each category.  You want the numbers to end up "0" eventually.
# NOTE(review): "Remainder" subtracts only the url and header_url groups, not the
# image group, so it can differ from "Leftovers" -- confirm that is intended.

tallies = [
    ("failed:", len(failed)),
    ("parse_error", len(parse_error)),
    ("Review_starts_with_url", len(Review_starts_with_url)),
    ("Review_starts_with_header_url", len(Review_starts_with_header_url)),
    ("Review_starts_with_image:", len(Review_starts_with_image)),
    ("Remainder", len(parse_error)-len(Review_starts_with_url)-len(Review_starts_with_header_url)),
    ("Leftovers", len(Leftovers)),
]

# Each label/value pair is followed by a '\n' argument, all handed to one print call.
report = []
for label, value in tallies:
    report.extend((label, value, '\n'))
print(*report)

print(sorted(failed))

failed: 0 
 parse_error 0 
 Review_starts_with_url 0 
 Review_starts_with_header_url 0 
 Review_starts_with_image: 0 
 Remainder 0 
 Leftovers 0 

[]


In [23]:
# this command is useful now for finding the error in each 'nnnn' listed above.  The error is in the last printed line.

# do_thing_print_fails('4244')

In [24]:
# To see if all of an error type occur in one Record_type.
# Replace Review_starts_with_image with whatever error list you wish to examine.

url_starts = {"Annotation": 0, "Classics": 0, "Review": 0, "": 0, "Interview": 0}
for i in Review_starts_with_image:
    record_type = cwbr_dict[i[0]].Record_type
    # .get() lets a Record_type that is not pre-seeded above (e.g. 'Editorial')
    # be counted instead of raising KeyError.
    url_starts[record_type] = url_starts.get(record_type, 0) + 1

if Review_starts_with_image:
    print(Review_starts_with_image[0])

print(url_starts)
    

{'': 0, 'Review': 0, 'Annotation': 0, 'Interview': 0, 'Classics': 0}


In [25]:
# dir(cwbr_dict['4053'])


In [26]:
# Script for scraping images listed in csvs

# URLs that the image-scraping loop prints and skips instead of requesting
# (presumably ones that failed when fetched -- confirm before pruning the list).
broken_links = ['http://www.cwbr.com/civilwarbookreview/images/jackets/TheLeadershipOfUlyssesSGrant.jpg',
                'http://www.cwbr.com/civilwarbookreview/images/jackets/BerryAllthatMakesaMan.jpg',
                'http://www.cwbr.com/civilwarbookreview/images/jackets/Canaan.jpg',
                'http://www.cwbr.com/civilwarbookreview/images/jackets/tintern.jpg',
                'http://www.cwbr.com/civilwarbookreview/images/jackets/HLHunley.jpg',
                'http://www.cwbr.com/civilwarbookreview/images/jackets/McPhersonDaysofDestiny.jpg',
                'http://www.cwbr.com/civilwarbookreview/images/jackets/EdwardsShelbysExpedition.jpg',
                'http://www.cwbr.com/civilwarbookreview/images/jackets/OutOfTheHouseOfBondage.jpg',
                'http://www.cwbr.com/civilwarbookreview/images/jackets/RobertLee.jpg',
                'http://www.cwbr.com/civilwarbookreview/images/jackets/WelfareAndCharity.jpg',
                'http://www.cwbr.com/civilwarbookreview/images/jackets/ColonelsInBlue.jpg',
                'http://www.cwbr.com/civilwarbookreview/images/jackets/TheOzarks.jpg',
                'http://www.cwbr.com/civilwarbookreview/images/jackets/UpcountrySouthCarolinaGoesToWar.jpg',
                'http://www.cwbr.com/civilwarbookreview/images/jackets/TheBattleOfPickettsMill.jpg',
                'http://www.cwbr.com/civilwarbookreview/images/jackets/BurningRailsAsWePleased.jpg',
                'http://www.cwbr.com/civilwarbookreview/images/jackets/SexSicknessandSlavery.jpg',
                'http://www.cwbr.com/civilwarbookreview/images/jackets/QuantrillInTexas.jpg',
                'http://www.cwbr.com/civilwarbookreview/images/jackets/OutOfTheMouthOfHell.jpg',
                'http://www.cwbr.com/civilwarbookreview/images/jackets/FlagsCivilwarMissouri.jpg',
                'http://www.cwbr.com/civilwarbookreview/images/jackets/HorsesandMules.jpg',
                'http://www.cwbr.com/civilwarbookreview/images/jackets/LastToLeaveTheField.jpg',
                'http://www.cwbr.com/civilwarbookreview/images/jackets/QueenOfTheConfederacy.jpg',
                "http://www.cwbr.com/civilwarbookreview/images/jackets/PrinceGeorge'sCountry.jpg",
                'http://www.cwbr.com/civilwarbookreview/images/jackets/CivilWarRevolvers.jpg',
                'http://www.cwbr.com/civilwarbookreview/images/jackets/TheWilmington&WeldonRailroadInTheCivilWar.jpg',
                'http://www.cwbr.com/civilwarbookreview/images/jackets/CivilWarSoldiersOfGreaterCleveland.jpg',
                'http://www.cwbr.com/civilwarbookreview/images/jackets/LeesCavalrymen.jpg',
                'http://www.cwbr.com/civilwarbookreview/images/jackets/GettingUsedToBeingShotAt.jpg',
                'http://www.cwbr.com/civilwarbookreview/images/jackets/LifeandLabor.jpg',
                'http://www.cwbr.com/civilwarbookreview/images/jackets/TheSeventhStarOfTheConfederacy.jpg',
                'http://www.cwbr.com/civilwarbookreview/images/jackets/ARegimentOfSlaves.jpg',
                'http://www.cwbr.com/civilwarbookreview/images/jackets/AcountryRiven.jpg',
                'http://www.cwbr.com/civilwarbookreview/images/jackets/RebelsandRunaways.jpg',
                'http://www.cwbr.com/civilwarbookreview/images/jackets/FloridasMonuments.jpg',
                'http://www.cwbr.com/civilwarbookreview/images/jackets/KallenUndergroundRailroad.jpg',
                'http://www.cwbr.com/civilwarbookreview/images/jackets/OpticTheBlueandTheGray.jpg',
                'http://www.cwbr.com/civilwarbookreview/images/jackets/BrianThomsenAlternateGettysburg.jpg',
                'http://www.cwbr.com/civilwarbookreview/images/jackets/VoicesFromCompanyD.jpg',
                'http://www.cwbr.com/civilwarbookreview/images/jackets/GarlandTrailMakers.jpg',
                'http://www.cwbr.com/civilwarbookreview/images/jackets/DurkinTheLastMan.jpg',
                'http://www.cwbr.com/civilwarbookreview/images/jackets/NeitherLadynorSlave.jpg',
               ]




import urllib.request
import os

# Captures the value of the first src="..." attribute in a snippet of markup.
# Raw string so the regex escape is not read as a (invalid) string escape;
# [^"\s]+ stops at the closing quote even when no whitespace follows it.
imagepath_regex = re.compile(r'src="(?P<path>[^"\s]+)"')


def find_img_url(image_text):
    """Return the absolute cwbr.com URL for the image referenced in `image_text`.

    image_text: markup containing a ``src="..."`` attribute; the captured
        path is appended directly to ``http://www.cwbr.com``.

    Raises ValueError when `image_text` has no usable ``src`` attribute.
    """
    match = imagepath_regex.search(image_text)
    if match is None:
        raise ValueError('no src="..." attribute found in {!r}'.format(image_text))
    return 'http://www.cwbr.com{}'.format(match.group('path'))


# Download every image referenced by a record into images/<record id>/.
# Files already on disk are left alone; URLs listed in broken_links are
# printed and skipped.
for urn, pack in cwbr_dict.items():
    image_fields = (pack.Image, pack.Image2, pack.Image3, pack.Image4, pack.Image5)
    image_urls = [find_img_url(markup) for markup in image_fields if markup]
    target_dir = 'images/{}'.format(urn)
    for image_url in image_urls:
        os.makedirs(target_dir, exist_ok=True)
        filename = image_url.split('/')[-1]
        filepath = '{}/{}'.format(target_dir, filename)
        if os.path.isfile(filepath):
            continue
        if image_url in broken_links:
            print(image_url)
            continue
        # Read the whole response before opening the target file, so a failed
        # request does not leave an empty file behind.
        with urllib.request.urlopen(image_url) as response:
            binary = response.read()
        with open(filepath, 'bw') as f:
            f.write(binary)

In [27]:
# scraping audio

import urllib.request
import os

# Captures the href value of the first <a href="..."> in a snippet of markup.
# Raw string so the regex escape is not read as a (invalid) string escape;
# [^"\s]+ stops at the closing quote even when no whitespace follows it.
audiopath_regex = re.compile(r'<a href="(?P<path>[^"\s]+)"')


def find_audio_url(audio1_text):
    """Return the absolute cwbr.com URL for the audio file linked in `audio1_text`.

    audio1_text: markup containing an ``<a href="...">`` link; the captured
        path is appended to ``http://www.cwbr.com/``.

    Raises ValueError when `audio1_text` has no usable ``<a href>`` link.
    """
    match = audiopath_regex.search(audio1_text)
    if match is None:
        raise ValueError('no <a href="..."> link found in {!r}'.format(audio1_text))
    return 'http://www.cwbr.com/{}'.format(match.group('path'))


# Download the Audio1 file of every record that has one into audio/<record id>/.
# Files already on disk are left alone; URLs listed in broken_links are
# printed and skipped.
for urn, pack in cwbr_dict.items():
    if not pack.Audio1:
        continue
    os.makedirs('audio/{}'.format(urn), exist_ok=True)
    audio_url = find_audio_url(pack.Audio1)
    filepath = 'audio/{}/{}'.format(urn, os.path.split(audio_url)[-1])
    if os.path.isfile(filepath):
        continue
    # broken_links holds URLs, so compare the resolved URL; the raw Audio1
    # anchor markup could never match an entry.
    if audio_url in broken_links:
        print(audio_url)
        continue

    # Read the whole response before opening the target file, so a failed
    # request does not leave an empty file behind.
    with urllib.request.urlopen(audio_url) as response:
        binary = response.read()
    with open(filepath, 'bw') as f:
        f.write(binary)

In [28]:
# Show every field, in any record, whose text contains 'audio/'.
for urn, pack in cwbr_dict.items():
    audio_fields = [field for field in pack if 'audio/' in field]
    for field in audio_fields:
        print(urn, field)

5437 <a href="audio/LongInterview.mp3" target="_blank"></a>
4395 <a href="audio/SlotkinInterview.mp3" target="_blank"></a>
4900 <a href="audio/GallagherInterview.mp3" target="_blank"></a>
5284 <a href="audio/CooperInterviewSecession.mp3" target="_blank"></a>
5122 <a href="audio/EtchesonInterview.mp3" target="_blank"></a>
6306 <a href="audio/2016finalcwbrstiles.mp3" target="_blank"></a>
6173 <a href="audio/BerlinInterview.mp3" target="_blank"></a>
4827 <a href="audio/MarshallInterview.mp3" target="_blank"></a>
4566 <a href="audio/SamitoInterview" target="_blank"></a>
4481 <a href="audio/FordInterview.mp3" target="_blank"></a>
5512 <a href="audio/KantrowitzInterview.mp3" target="_blank"></a>
5668 <a href="audio/VaronInterview.mp3" target="_blank">Click here for the audio version of the interview</a>
4053 <a href="audio/CooperInterview.mp3" target="_blank"></a>
5362 <a href="audio/OakesInterview.mp3" target="_blank"></a>
5986 <a href="audio/WarrenInterview.mp3" target="_blank"></a>
5864 <