# Data Engineering

In [1]:
# set up
import pandas as pd
import requests
import json
import os

## PEN America's Index of School Book Bans Data

In [2]:
# read in the csv file of banned books from 7/1/2021-6/30/2022
index1 = pd.read_csv('pen_index_book_bans_01JUL2021-30JUN2022.csv', skiprows=2)
index1.head(10)

Unnamed: 0,Author,Title,Type of Ban,Secondary Author(s),Illustrator(s),Translator(s),State,District,Date of Challenge/Removal,Origin of Challenge
0,"Àbíké-Íyímídé, Faridah",Ace of Spades,Banned in Libraries and Classrooms,,,,Florida,Indian River County School District,November 2021,Administrator
1,"Acevedo, Elizabeth",Clap When You Land,Banned in Classrooms,,,,Pennsylvania,Central York School District,August 2021,Administrator
2,"Acevedo, Elizabeth",The Poet X,Banned in Libraries,,,,Florida,Indian River County School District,November 2021,Administrator
3,"Acevedo, Elizabeth",The Poet X,Banned in Libraries and Classrooms,,,,New York,Marlboro Central School District,February 2022,Administrator
4,"Acevedo, Elizabeth",The Poet X,Banned Pending Investigation,,,,Texas,Fredericksburg Independent School District,March 2022,Administrator
5,"Acevedo, Elizabeth",The Poet X,Banned in Libraries,,,,Virginia,New Kent County Public Schools,October 2021,Administrator
6,"Aciman, André",Call Me By Your Name (Call Me By Your Name Ser...,Banned Pending Investigation,,,,Virginia,Spotsylvania County Public Schools,November 2021,Administrator
7,"Acito, Marc","How I Paid for College: A Novel of Sex, Theft,...",Banned Pending Investigation,,,,Florida,Indian River County School District,November 2021,Administrator
8,"Ada, Alma Flor",My Name Is María Isabel,Banned Pending Investigation,,"Thompson, Kathryn Dyble","Cerro, Ana M.",Florida,Duval County Public Schools,January 2022,Administrator
9,"Addasi, Maha",Time to Pray,Banned Pending Investigation,,"Gannon, Ned","Albitar, Nuha",Florida,Duval County Public Schools,January 2022,Administrator


In [3]:
print("The first index dataframe has {} rows and {} columns".format(index1.shape[0], index1.shape[1]))

The first index dataframe has 2532 rows and 10 columns


In [4]:
# number of unique titles in the first index
index1['Title'].nunique()

1648

In [5]:
# explore the types of book bans in the first index
unique_types_1 = set(index1['Type of Ban'])
print(unique_types_1)

{'Banned in Libraries', 'Banned in Libraries and Classrooms', 'Banned in Classrooms', 'Banned Pending Investigation'}


In [6]:
# read in the csv file of banned books from 7/1/2022-12/31/2022
index2 = pd.read_csv('pen_index_book_bans_01JUL2022-31DEC2022.csv', skiprows=2)
index2.head(10)

Unnamed: 0,Title,Author,Secondary Author(s),Illustrator(s),Translator(s),Series Name,State,District,Date of Challenge/Removal,Ban Status,Origin of Challenge
0,Beyond Magenta: Transgender Teens Speak Out,"Kuklin, Susan",,,,,Arkansas,Conway Public Schools,October 2022,Banned in Libraries and Classrooms,Unclear
1,Felix Ever After,"Callender, Kacen",,,,,Arkansas,Conway Public Schools,October 2022,Banned in Libraries and Classrooms,Unclear
2,This Book Is Gay,"Dawson, Juno",,,,,California,William S. Hart Union High School District,September 2022,Banned in Libraries and Classrooms,Unclear
3,Flamer,"Curato, Mike",,,,,Colorado,Cherry Creek School District,September 2022,Banned Pending Investigation,Unclear
4,Gender Queer: A Memoir,"Kobabe, Maia",,,,,Colorado,Cherry Creek School District,September 2022,Banned Pending Investigation,Unclear
5,"Let's Talk About It: The Teen's Guide to Sex, ...","Moen, Erika","Nolan, Matthew",,,,Colorado,Cherry Creek School District,September 2022,Banned in Libraries and Classrooms,Unclear
6,A Court of Mist and Fury,"Maas, Sarah J.",,,,A Court of Thorns and Roses,Florida,Brevard Schools,November 2022,Banned Pending Investigation,Formal Challenge
7,Crank,"Hopkins, Ellen",,,,Crank,Florida,Brevard Schools,November 2022,Banned Pending Investigation,Formal Challenge
8,Empire of Storms,"Maas, Sarah J.",,,,Throne of Glass,Florida,Brevard Schools,November 2022,Banned Pending Investigation,Formal Challenge
9,House of Earth and Blood,"Maas, Sarah J.",,,,Crescent City,Florida,Brevard Schools,November 2022,Banned Pending Investigation,Formal Challenge


In [7]:
print("The second index dataframe has {} rows and {} columns".format(index2.shape[0], index2.shape[1]))

The second index dataframe has 1477 rows and 11 columns


In [8]:
# number of unique titles in the second index
index2['Title'].nunique()

873

In [9]:
# explore the types of book bans in the second index
unique_types_2 = set(index2['Ban Status'])
print(unique_types_2)

{'Banned Pending Investigation', 'Banned in Libraries and Classrooms', 'Banned in Classrooms', 'Banned in Libraries'}


In [10]:
# rename the ban status column as type of ban
index2.rename(columns={'Ban Status': 'Type of Ban'}, inplace = True)
index2.head()

Unnamed: 0,Title,Author,Secondary Author(s),Illustrator(s),Translator(s),Series Name,State,District,Date of Challenge/Removal,Type of Ban,Origin of Challenge
0,Beyond Magenta: Transgender Teens Speak Out,"Kuklin, Susan",,,,,Arkansas,Conway Public Schools,October 2022,Banned in Libraries and Classrooms,Unclear
1,Felix Ever After,"Callender, Kacen",,,,,Arkansas,Conway Public Schools,October 2022,Banned in Libraries and Classrooms,Unclear
2,This Book Is Gay,"Dawson, Juno",,,,,California,William S. Hart Union High School District,September 2022,Banned in Libraries and Classrooms,Unclear
3,Flamer,"Curato, Mike",,,,,Colorado,Cherry Creek School District,September 2022,Banned Pending Investigation,Unclear
4,Gender Queer: A Memoir,"Kobabe, Maia",,,,,Colorado,Cherry Creek School District,September 2022,Banned Pending Investigation,Unclear


In [11]:
# drop the series name column in the second index dataset
index2.drop(['Series Name'], axis=1, inplace=True)
index2.head()

Unnamed: 0,Title,Author,Secondary Author(s),Illustrator(s),Translator(s),State,District,Date of Challenge/Removal,Type of Ban,Origin of Challenge
0,Beyond Magenta: Transgender Teens Speak Out,"Kuklin, Susan",,,,Arkansas,Conway Public Schools,October 2022,Banned in Libraries and Classrooms,Unclear
1,Felix Ever After,"Callender, Kacen",,,,Arkansas,Conway Public Schools,October 2022,Banned in Libraries and Classrooms,Unclear
2,This Book Is Gay,"Dawson, Juno",,,,California,William S. Hart Union High School District,September 2022,Banned in Libraries and Classrooms,Unclear
3,Flamer,"Curato, Mike",,,,Colorado,Cherry Creek School District,September 2022,Banned Pending Investigation,Unclear
4,Gender Queer: A Memoir,"Kobabe, Maia",,,,Colorado,Cherry Creek School District,September 2022,Banned Pending Investigation,Unclear


In [12]:
# consolidate the two dataframes into one
combined = pd.concat([index1, index2], ignore_index=True)
combined.tail()

Unnamed: 0,Author,Title,Type of Ban,Secondary Author(s),Illustrator(s),Translator(s),State,District,Date of Challenge/Removal,Origin of Challenge
4004,"Lyga, Barry",Boy Toy,Banned in Libraries and Classrooms,,,,Virginia,Spotsylvania County Public Schools,October 2022,Formal Challenge
4005,"Arnold, Elana K.",Red Hood,Banned in Libraries and Classrooms,,,,Virginia,Spotsylvania County Public Schools,October 2022,Formal Challenge
4006,"McQuiston, Casey","Red, White, and Royal Blue",Banned in Libraries and Classrooms,,,,Virginia,Spotsylvania County Public Schools,October 2022,Formal Challenge
4007,"Kobabe, Maia",Gender Queer: A Memoir,Banned Pending Investigation,,,,Wyoming,Natrona County Schools,October 2022,Formal Challenge
4008,"Erickson-Schroth, Laura","Trans Bodies, Trans Selves: A Resource By and ...",Banned Pending Investigation,,,,Wyoming,Natrona County Schools,October 2022,Formal Challenge


In [13]:
print("The combined dataframe has {} rows and {} columns".format(combined.shape[0], combined.shape[1]))

The combined dataframe has 4009 rows and 10 columns


In [14]:
# drop columns that we are not interested in for this project
combined = combined[['Title', 'Author', 'Type of Ban', 'State', 'District', 'Date of Challenge/Removal']]
combined

Unnamed: 0,Title,Author,Type of Ban,State,District,Date of Challenge/Removal
0,Ace of Spades,"Àbíké-Íyímídé, Faridah",Banned in Libraries and Classrooms,Florida,Indian River County School District,November 2021
1,Clap When You Land,"Acevedo, Elizabeth",Banned in Classrooms,Pennsylvania,Central York School District,August 2021
2,The Poet X,"Acevedo, Elizabeth",Banned in Libraries,Florida,Indian River County School District,November 2021
3,The Poet X,"Acevedo, Elizabeth",Banned in Libraries and Classrooms,New York,Marlboro Central School District,February 2022
4,The Poet X,"Acevedo, Elizabeth",Banned Pending Investigation,Texas,Fredericksburg Independent School District,March 2022
...,...,...,...,...,...,...
4004,Boy Toy,"Lyga, Barry",Banned in Libraries and Classrooms,Virginia,Spotsylvania County Public Schools,October 2022
4005,Red Hood,"Arnold, Elana K.",Banned in Libraries and Classrooms,Virginia,Spotsylvania County Public Schools,October 2022
4006,"Red, White, and Royal Blue","McQuiston, Casey",Banned in Libraries and Classrooms,Virginia,Spotsylvania County Public Schools,October 2022
4007,Gender Queer: A Memoir,"Kobabe, Maia",Banned Pending Investigation,Wyoming,Natrona County Schools,October 2022


In [15]:
# group the dataframe by type of ban
combined_grouped = combined.groupby(['Title', 'State', 'Type of Ban']).size().reset_index(name='Count')
combined_grouped

Unnamed: 0,Title,State,Type of Ban,Count
0,"""Multiplication Is for White People"": Raising ...",Pennsylvania,Banned in Classrooms,1
1,#BlackLivesMatter: Protesting Racism,Florida,Banned Pending Investigation,1
2,#BlackLivesMatter: Protesting Racism,Texas,Banned Pending Investigation,1
3,#Hockey,Texas,Banned in Libraries and Classrooms,1
4,"#Hockey (Check, Please! Series)",Georgia,Banned in Libraries and Classrooms,1
...,...,...,...,...
3479,yolo,South Carolina,Banned Pending Investigation,1
3480,yolo (Internet Girls Series),Wisconsin,Banned in Libraries,1
3481,"¡Solo pregunta!: Sé Diferente, Sé Valiente, Sé Tú",Pennsylvania,Banned in Classrooms,1
3482,¡Vámonos! Let's Go!,Pennsylvania,Banned in Classrooms,1


In [16]:
# introduce a list of 13 most banned books from the american library association (ALA)
most_banned_13 = ["Gender Queer: A Memoir", "All Boys Aren't Blue", "The Bluest Eye", "Flamer", 
                  "Looking For Alaska", "The Perks of Being a Wallflower", "Lawn Boy", 
                  "The Absolutely True Diary of a Part-Time Indian", "Out of Darkness",
                 "A Court of Mist and Fury", "Crank", "Me and Earl and the Dying Girl", 
                  "This Book is Gay"]

In [17]:
# subset the dataframe to 13 most banned
combined_most_banned = combined_grouped[combined_grouped["Title"].isin(most_banned_13)].sort_values(by='State')
combined_most_banned

Unnamed: 0,Title,State,Type of Ban,Count
1005,Gender Queer: A Memoir,Alaska,Banned in Libraries and Classrooms,1
900,Flamer,Colorado,Banned Pending Investigation,1
1006,Gender Queer: A Memoir,Colorado,Banned Pending Investigation,1
58,A Court of Mist and Fury,Florida,Banned Pending Investigation,3
623,Crank,Florida,Banned in Libraries,1
...,...,...,...,...
198,All Boys Aren't Blue,Washington,Banned Pending Investigation,1
1036,Gender Queer: A Memoir,Washington,Banned Pending Investigation,2
1961,Out of Darkness,Wisconsin,Banned in Libraries,1
914,Flamer,Wisconsin,Banned in Libraries and Classrooms,1


In [18]:
# convert and save the dataframe as json
combined_most_banned.to_json('pen_13_most_banned.json', orient = 'records')

## Open Library Data

In [19]:
# get request to open library api to see how the data is formatted
url = "http://openlibrary.org/search.json?q=Flamer&limit=1"
response = requests.get(url)
# print(response.json())

{'numFound': 16, 'start': 0, 'numFoundExact': True, 'docs': [{'key': '/works/OL20813470W', 'type': 'work', 'seed': ['/books/OL28348427M', '/books/OL28177022M', '/works/OL20813470W', '/subjects/lgbtq_young_adult', '/subjects/lambda_literary_awards', '/subjects/lambda_literary_award_winner', '/subjects/collectionid:texchallenge2021', '/subjects/collectionid:kellerchallenge', '/subjects/collectionid:alpinechallenge', '/subjects/comic_books_strips', '/subjects/juvenile_fiction', '/subjects/fiction', '/subjects/coming_out_(sexual_orientation)', '/subjects/camps', '/subjects/gay_teenagers', '/subjects/closeted_gays', '/subjects/infatuation', '/subjects/bullying', '/subjects/teenagers', '/subjects/suicidal_behavior', '/subjects/identity_(psychology)', '/subjects/self', '/subjects/catholic_church', '/subjects/sortir_du_placard_(homosexualité)', '/subjects/bandes_dessinées', '/subjects/romans_nouvelles_etc._pour_la_jeunesse', '/subjects/colonies_de_vacances', '/subjects/adolescents_homosexuels'

In [20]:
# api call to open library to get book jacket images
def get_jacket_art(title):
    url = f"http://openlibrary.org/search.json?q={title}&limit=1"
    response=requests.get(url)
    if response.status_code == 200:
        data = response.json()
        if data['docs']:
            cover_id = data['docs'][0]['cover_i']
            if cover_id:
                cover_query = f"http://covers.openlibrary.org/b/id/{cover_id}-L.jpg"
                cover_image = requests.get(cover_query).content
                with open(os.path.join("cover_images", f"{title}_cover.jpg"), 'wb') as c:
                    c.write(cover_image)
                print(f"Cover art for {title} saved as {title}_cover.jpg")
            else:
                print(f"No cover art found for {title}")
        else:
            print(f"{title} not found in Open Library")
    else:
        print('API call failed, try again')

for title in most_banned_13:
    get_jacket_art(title)

Cover art for Gender Queer: A Memoir saved as Gender Queer: A Memoir_cover.jpg
Cover art for All Boys Aren't Blue saved as All Boys Aren't Blue_cover.jpg
Cover art for The Bluest Eye saved as The Bluest Eye_cover.jpg
Cover art for Flamer saved as Flamer_cover.jpg
Cover art for Looking For Alaska saved as Looking For Alaska_cover.jpg
Cover art for The Perks of Being a Wallflower saved as The Perks of Being a Wallflower_cover.jpg
Cover art for Lawn Boy saved as Lawn Boy_cover.jpg
Cover art for The Absolutely True Diary of a Part-Time Indian saved as The Absolutely True Diary of a Part-Time Indian_cover.jpg
Cover art for Out of Darkness saved as Out of Darkness_cover.jpg
Cover art for A Court of Mist and Fury saved as A Court of Mist and Fury_cover.jpg
Cover art for Crank saved as Crank_cover.jpg
Cover art for Me and Earl and the Dying Girl saved as Me and Earl and the Dying Girl_cover.jpg
Cover art for This Book is Gay saved as This Book is Gay_cover.jpg


In [21]:
# use open library to get data on books
book_info_dict = {}

def get_book_info(title):
    url = f"http://openlibrary.org/search.json?q={title}&limit=1"
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()
        if data['docs']:
            book_info_dict[title] = {
                'title': data['docs'][0]['title'],
                'author': data['docs'][0]['author_name'][0] if 'author_name' in data['docs'][0] else None,
                'publisher': data['docs'][0]['publisher'][0] if 'publisher' in data['docs'][0] else None,
                'publish_date': data['docs'][0]['first_publish_year'] if 'first_publish_year' in data['docs'][0] else None
            }
        else:
            print(f"{title} not found in Open Library")
    else:
        print("API call failed, try again")
        

for title in most_banned_13:
    get_book_info(title)

# Save all book information as a single JSON file
with open("book_info.json", 'w') as f:
    json.dump(book_info_dict, f, indent=4)

print("All book information saved as 'all_books_info.json'")

All book information saved as 'all_books_info.json'
