# Civil War Book Review scratchpad

Reference: [ReportLab User Guide (PDF)](https://www.reportlab.com/docs/reportlab-userguide.pdf)



In [1]:
from collections import namedtuple
import csv
import os

## Read the csvs

In [2]:
# Every loaded record, keyed by the value of its ID column.
cwbr_dict = dict()

def csv_to_dictionary(filename):
    """Load one tab-delimited export into the module-level ``cwbr_dict``.

    The first row supplies the column names, which become the fields of a
    ``CWBR`` namedtuple.  Every later row is stored under its ``ID`` value, so
    a repeated ID overwrites the record loaded earlier.  An empty file and
    blank lines are skipped instead of raising.

    Args:
        filename: path of a UTF-8, tab-delimited file whose header row
            contains an ``ID`` column.

    Raises:
        ValueError: from ``namedtuple`` if a header is not a valid field name.
        TypeError: if a non-blank row does not have one cell per header.
    """
    with open(filename, newline='', encoding='utf8') as csvfile:
        csvreader = csv.reader(csvfile, delimiter="\t", quotechar='"')
        # A completely empty file has no header row; there is nothing to load.
        headers = next(csvreader, None)
        if headers is None:
            return
        CWBR = namedtuple('CWBR', headers)
        for row in csvreader:
            # csv.reader yields [] for a blank line (e.g. a trailing newline);
            # CWBR(*[]) would raise TypeError, so skip those rows.
            if not row:
                continue
            item = CWBR(*row)
            cwbr_dict[item.ID] = item


# Comment out any file you do NOT want to load (commented-out entries are skipped)

for filename in [
    # 'Split_Into_Types/Annotations_and_cousins.csv',
    # 'Split_Into_Types/Review_Just_URL.csv',
    'Split_Into_Types/audio_supplemental.csv',
    'Split_Into_Types/Needs_Unpacking.csv',
    'Split_Into_Types/Main_set.csv',
    'Split_Into_Types/Reference_and_audio.csv',
    'Split_Into_Types/Review_no_text.csv',
    'Split_Into_Types/Simple_text.csv',
]:
    # Each call merges that file's records into cwbr_dict.
    csv_to_dictionary(filename)

## Making the pdfs

In [3]:
from reportlab.lib.enums import TA_JUSTIFY, TA_CENTER
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.units import inch
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
from reportlab.lib.units import inch

In [4]:
# styling the text

# Start from reportlab's sample stylesheet and register the custom paragraph
# styles that make_pdf() looks up by name.
styles = getSampleStyleSheet()

for custom_style in (
    # Body text of the review: justified, first line indented.
    ParagraphStyle(
        name='Justify',
        fontName='Times-Roman',
        alignment=TA_JUSTIFY,
        firstLineIndent=25,
        fontSize=14,
        leading=17,
    ),
    # Headline at the top of the page.
    ParagraphStyle(
        name='TitleBig',
        fontName='Times-Bold',
        alignment=TA_CENTER,
        fontSize=14,
        leading=22,
    ),
    # Sub-headline directly under the headline.
    ParagraphStyle(
        name='SubTitleBig',
        fontName='Times-Bold',
        alignment=TA_CENTER,
        fontSize=14,
        leading=18,
    ),
    # Byline (authors or reviewer).
    ParagraphStyle(
        name='AuthorBig',
        fontName='Times-Bold',
        alignment=TA_CENTER,
        fontSize=12,
        leading=14,
    ),
    # Bibliographic citation: same as the body text but without the indent.
    ParagraphStyle(
        name='Citation',
        fontName='Times-Roman',
        alignment=TA_JUSTIFY,
        firstLineIndent=0,
        fontSize=14,
        leading=17,
    ),
):
    styles.add(custom_style)


In [5]:
# recipe for turning the data into pdf.


def do_thing(uri):
    """Build the PDF for the record stored in ``cwbr_dict`` under key ``uri``.

    Raises ``KeyError`` if no loaded record has that key.
    """
    make_pdf(cwbr_dict[uri])

def strip_breaks(text):
    """Return ``text`` with the literal break/paragraph tags removed.

    Only these exact spellings are removed: ``<br>``, ``</br>``, ``<p>``,
    ``</p>``, ``<BR>`` and ``</BR>``.  They are removed one after another in
    that order; every other tag or spelling is left untouched.
    """
    return (
        text.replace("<br>", "")
            .replace("</br>", "")
            .replace("<p>", "")
            .replace("</p>", "")
            .replace("<BR>", "")
            .replace("</BR>", "")
    )
   
def _citation_markup(issue):
    """Return the reportlab markup for the reviewed book's citation line.

    Layout: bold author list, italic title, publisher, publication date,
    price and, when present, the ISBN.
    """
    authors_list = [i for i in (issue.Auth_1, issue.Auth_2, issue.Auth_3) if i]
    if len(authors_list) == 1:
        joined = authors_list[0]
    elif len(authors_list) == 2:
        joined = '{} and {}'.format(authors_list[0], authors_list[1])
    elif len(authors_list) == 3:
        joined = '{}, {}, and {}'.format(authors_list[0], authors_list[1], authors_list[2])
    else:
        joined = ''

    if joined:
        # Every author list ends with exactly one period: the single-author
        # form used to get none, and a name already ending in "." (e.g. "Jr.")
        # must not get a second one.
        if not joined.rstrip().endswith('.'):
            joined += '.'
        authors = '<b>{}</b>'.format(joined)
    else:
        authors = ''

    book_title = "<i>{}.</i>".format(strip_breaks(issue.Title))
    publisher = '{}, '.format(issue.Publisher) if issue.Publisher else ''
    pub_date = '{}. '.format(issue.Pub_date) if issue.Pub_date else ''

    citation = '{} {} {}{}{}'.format(authors,
                                     book_title,
                                     publisher,
                                     pub_date,
                                     strip_breaks(issue.Price),
                                    )
    if issue.ISBN:
        citation += ' ISBN {}'.format(strip_breaks(issue.ISBN))
    return citation


def make_pdf(issue):
    """Render one record to ``output/<ID>.pdf``.

    The page is assembled top to bottom:

    1. headline (upper-cased) and sub-headline, one paragraph per
       ``<p>``-separated chunk;
    2. for every record type except ``'Editorial'``: a byline followed by the
       book citation.  ``'Review'`` and ``'Classics'`` records use
       ``Reviewer`` as the byline, all other types list ``Auth_1``..``Auth_3``;
    3. the ``Review`` text, one paragraph per ``<p>``/``<br>``-separated chunk.

    Args:
        issue: a record exposing the attributes ID, Headline, Sub_headline,
            Record_type, Auth_1, Auth_2, Auth_3, Reviewer, Title, Publisher,
            Pub_date, Price, ISBN and Review, all as strings.
    """
    os.makedirs('output', exist_ok=True)
    doc = SimpleDocTemplate('output/{}.pdf'.format(issue.ID),
                            pagesize=letter,
                            rightMargin=72,
                            leftMargin=72,
                            topMargin=72,
                            bottomMargin=50)

    template = []

    # NOTE(review): upper() also upper-cases any inline tags or entities in
    # the headline -- confirm the data never contains markup there.
    for item in issue.Headline.split('<p>'):
        if item:
            template.append(Paragraph(item.upper(), styles["TitleBig"]))

    for item in issue.Sub_headline.split('<p>'):
        if item:
            template.append(Paragraph(item, styles["SubTitleBig"]))
    template.append(Spacer(1, 0.3*inch))

    if issue.Record_type != 'Editorial':
        if issue.Record_type not in ('Review', 'Classics'):
            for author in (issue.Auth_1, issue.Auth_2, issue.Auth_3):
                author = author.replace('<br>', '<p>').replace('</br>', '</p>')
                if author:
                    template.append(Paragraph(author, styles["AuthorBig"]))
        elif issue.Reviewer:
            template.append(Paragraph(issue.Reviewer, styles["AuthorBig"]))
        # The gap under the byline is added even when the byline is empty.
        template.append(Spacer(1, 0.5*inch))

        template.append(Paragraph(_citation_markup(issue), styles["Citation"]))
        template.append(Spacer(1, 0.2*inch))

    # Treat <br> like <p> so both act as paragraph separators.
    review_markup = issue.Review.replace('<br>', '<p>').replace('</br>', '</p>')
    for item in review_markup.split('<p>'):
        if item:
            template.append(Paragraph(item, styles["Justify"]))
            template.append(Spacer(1, 0.2*inch))

    doc.build(template)

In [6]:
# Render the PDF for one record, looked up by its ID key in cwbr_dict;
# the file is written to output/1003.pdf.

do_thing('1003')

In [None]:
# Render a PDF for every record loaded into cwbr_dict
# (one file per record, written under output/).

for uri in cwbr_dict:
    do_thing(uri)

## Understanding the source data

In [None]:
# how many items are in cwbr_dict, not counting 'interviews'?

# count = 0
# for k, v in cwbr_dict.items():
#     if v.Record_type != "Interview":
# #         print(v.ID, v.Record_type)
#         count += 1
# print(count)

In [None]:
# which item has the longest Review

# longest_item = ('', '')
# for k, v in cwbr_dict.items():
#     if len(v.Review) > len(longest_item[1]):
#         longest_item = (k, v.Review)
# print(longest_item[0])

In [None]:
# Is 'Teaser' a shortened 'Title'?

# def strip_and_replace(text):
#     return text.strip().replace(":", "").replace(".", "").replace(",", "").replace('"', '').replace("<p>", "").lower().replace(' ', '')


# inside, not_inside = 0, 0
# for uid, issue in cwbr_dict.items():
#     if issue.Teaser in issue.Title:
#         inside += 1
#     elif strip_and_replace(issue.Teaser[:-5]) in strip_and_replace(issue.Title):
#         inside += 1
#     else:
#         not_inside += 1
#         print("Teaser: {}\nTitle: {}\n\n".format(issue.Teaser, issue.Title))
# print(inside, not_inside)

In [None]:
# How many have Sub_headlines?  How do the Sub_headlines compare to Titles and Headlines?

# count = 0
# for uri, issue in cwbr_dict.items():
#     if len(issue.Sub_headline):
#         count += 1
#         print("{}\nS: {}\nH: {}\nT: {}\n\n".format(uri, issue.Sub_headline, issue.Headline, issue.Title))
# print(count)

In [None]:
# How many have a Headline?  Are Headlines a shortened form of the Title?

# count = 0
# similar = 0
# for uri, issue in cwbr_dict.items():
#     if len(issue.Headline):
#         count += 1
#         if strip_and_replace(issue.Headline) in strip_and_replace(issue.Title):
#             similar += 1
#             print("{}\nS: {}\nH: {}\nT: {}\n\n".format(uri, issue.Sub_headline, issue.Headline, issue.Title))
# print(count, similar)

In [None]:
# items_with_metadata, items_without_metadata = 0, 0

# for uri, item in cwbr_dict.items():
#     if item.SubCategories or item.Metadata:
#         items_with_metadata += 1
#     else:
#         items_without_metadata += 1
# print(items_with_metadata, items_without_metadata)

In [None]:
# what is our datastructure?  give an example

# item = cwbr_dict['6115']
# print(dir(item))
# for field in item._fields:
#     print("{}\t\t{}".format(field, getattr(item, field)[:100]))

In [None]:
# how many have a Teaser?  print all the Teasers to screen.

# count = 0
# for k, v in cwbr_dict.items():
#     if len(v.Teaser):
#         count += 1
#         print(v.Teaser)
# print(count)