# Data Engineering

In [1]:
# set up
import pandas as pd
import requests
import json
import os

## PEN America's Index of School Book Bans Data

In [2]:
# read in the csv file of banned books from 7/1/2021-6/30/2022
index1 = pd.read_csv('pen_index_book_bans_01JUL2021-30JUN2022.csv', skiprows=2)
index1.head()

Unnamed: 0,Author,Title,Type of Ban,Secondary Author(s),Illustrator(s),Translator(s),State,District,Date of Challenge/Removal,Origin of Challenge
0,"Àbíké-Íyímídé, Faridah",Ace of Spades,Banned in Libraries and Classrooms,,,,Florida,Indian River County School District,November 2021,Administrator
1,"Acevedo, Elizabeth",Clap When You Land,Banned in Classrooms,,,,Pennsylvania,Central York School District,August 2021,Administrator
2,"Acevedo, Elizabeth",The Poet X,Banned in Libraries,,,,Florida,Indian River County School District,November 2021,Administrator
3,"Acevedo, Elizabeth",The Poet X,Banned in Libraries and Classrooms,,,,New York,Marlboro Central School District,February 2022,Administrator
4,"Acevedo, Elizabeth",The Poet X,Banned Pending Investigation,,,,Texas,Fredericksburg Independent School District,March 2022,Administrator


In [3]:
print("The first index dataframe has {} rows and {} columns".format(index1.shape[0], index1.shape[1]))

The first index dataframe has 2532 rows and 10 columns


In [4]:
# number of unique titles in the first index
index1['Title'].nunique()

1648

In [5]:
# explore the types of book bans in the first index
unique_types_1 = set(index1['Type of Ban'])
print(unique_types_1)

{'Banned in Classrooms', 'Banned in Libraries and Classrooms', 'Banned Pending Investigation', 'Banned in Libraries'}


In [6]:
# read in the csv file of banned books from 7/1/2022-12/31/2022
index2 = pd.read_csv('pen_index_book_bans_01JUL2022-31DEC2022.csv', skiprows=2)
index2.head(5)

Unnamed: 0,Title,Author,Secondary Author(s),Illustrator(s),Translator(s),Series Name,State,District,Date of Challenge/Removal,Ban Status,Origin of Challenge
0,Beyond Magenta: Transgender Teens Speak Out,"Kuklin, Susan",,,,,Arkansas,Conway Public Schools,October 2022,Banned in Libraries and Classrooms,Unclear
1,Felix Ever After,"Callender, Kacen",,,,,Arkansas,Conway Public Schools,October 2022,Banned in Libraries and Classrooms,Unclear
2,This Book Is Gay,"Dawson, Juno",,,,,California,William S. Hart Union High School District,September 2022,Banned in Libraries and Classrooms,Unclear
3,Flamer,"Curato, Mike",,,,,Colorado,Cherry Creek School District,September 2022,Banned Pending Investigation,Unclear
4,Gender Queer: A Memoir,"Kobabe, Maia",,,,,Colorado,Cherry Creek School District,September 2022,Banned Pending Investigation,Unclear


In [7]:
print("The second index dataframe has {} rows and {} columns".format(index2.shape[0], index2.shape[1]))

The second index dataframe has 1477 rows and 11 columns


In [8]:
# number of unique titles in the second index
index2['Title'].nunique()

873

In [9]:
# explore the types of book bans in the second index
unique_types_2 = set(index2['Ban Status'])
print(unique_types_2)

{'Banned in Classrooms', 'Banned in Libraries and Classrooms', 'Banned Pending Investigation', 'Banned in Libraries'}


In [10]:
# rename the ban status column as type of ban
index2.rename(columns={'Ban Status': 'Type of Ban'}, inplace = True)
index2.head()

Unnamed: 0,Title,Author,Secondary Author(s),Illustrator(s),Translator(s),Series Name,State,District,Date of Challenge/Removal,Type of Ban,Origin of Challenge
0,Beyond Magenta: Transgender Teens Speak Out,"Kuklin, Susan",,,,,Arkansas,Conway Public Schools,October 2022,Banned in Libraries and Classrooms,Unclear
1,Felix Ever After,"Callender, Kacen",,,,,Arkansas,Conway Public Schools,October 2022,Banned in Libraries and Classrooms,Unclear
2,This Book Is Gay,"Dawson, Juno",,,,,California,William S. Hart Union High School District,September 2022,Banned in Libraries and Classrooms,Unclear
3,Flamer,"Curato, Mike",,,,,Colorado,Cherry Creek School District,September 2022,Banned Pending Investigation,Unclear
4,Gender Queer: A Memoir,"Kobabe, Maia",,,,,Colorado,Cherry Creek School District,September 2022,Banned Pending Investigation,Unclear


In [11]:
# drop the series name column in the second index dataset
index2.drop(['Series Name'], axis=1, inplace=True)
index2.head()

Unnamed: 0,Title,Author,Secondary Author(s),Illustrator(s),Translator(s),State,District,Date of Challenge/Removal,Type of Ban,Origin of Challenge
0,Beyond Magenta: Transgender Teens Speak Out,"Kuklin, Susan",,,,Arkansas,Conway Public Schools,October 2022,Banned in Libraries and Classrooms,Unclear
1,Felix Ever After,"Callender, Kacen",,,,Arkansas,Conway Public Schools,October 2022,Banned in Libraries and Classrooms,Unclear
2,This Book Is Gay,"Dawson, Juno",,,,California,William S. Hart Union High School District,September 2022,Banned in Libraries and Classrooms,Unclear
3,Flamer,"Curato, Mike",,,,Colorado,Cherry Creek School District,September 2022,Banned Pending Investigation,Unclear
4,Gender Queer: A Memoir,"Kobabe, Maia",,,,Colorado,Cherry Creek School District,September 2022,Banned Pending Investigation,Unclear


In [12]:
# consolidate the two dataframes into one
combined = pd.concat([index1, index2], ignore_index=True)
combined.tail()

Unnamed: 0,Author,Title,Type of Ban,Secondary Author(s),Illustrator(s),Translator(s),State,District,Date of Challenge/Removal,Origin of Challenge
4004,"Lyga, Barry",Boy Toy,Banned in Libraries and Classrooms,,,,Virginia,Spotsylvania County Public Schools,October 2022,Formal Challenge
4005,"Arnold, Elana K.",Red Hood,Banned in Libraries and Classrooms,,,,Virginia,Spotsylvania County Public Schools,October 2022,Formal Challenge
4006,"McQuiston, Casey","Red, White, and Royal Blue",Banned in Libraries and Classrooms,,,,Virginia,Spotsylvania County Public Schools,October 2022,Formal Challenge
4007,"Kobabe, Maia",Gender Queer: A Memoir,Banned Pending Investigation,,,,Wyoming,Natrona County Schools,October 2022,Formal Challenge
4008,"Erickson-Schroth, Laura","Trans Bodies, Trans Selves: A Resource By and ...",Banned Pending Investigation,,,,Wyoming,Natrona County Schools,October 2022,Formal Challenge


In [13]:
#create variable to show the number of unique titles in the database and add print a line of stats
title_tot = combined['Title'].nunique()
print("The combined dataframe has {} rows and {} columns representing {} individual titles.".format(combined.shape[0], combined.shape[1], title_tot))

The combined dataframe has 4009 rows and 10 columns representing 2313 individual titles.


In [14]:
# drop columns that we are not interested in for this project
combined = combined[['Title', 'Author', 'Type of Ban', 'State', 'District', 'Date of Challenge/Removal']]
combined

Unnamed: 0,Title,Author,Type of Ban,State,District,Date of Challenge/Removal
0,Ace of Spades,"Àbíké-Íyímídé, Faridah",Banned in Libraries and Classrooms,Florida,Indian River County School District,November 2021
1,Clap When You Land,"Acevedo, Elizabeth",Banned in Classrooms,Pennsylvania,Central York School District,August 2021
2,The Poet X,"Acevedo, Elizabeth",Banned in Libraries,Florida,Indian River County School District,November 2021
3,The Poet X,"Acevedo, Elizabeth",Banned in Libraries and Classrooms,New York,Marlboro Central School District,February 2022
4,The Poet X,"Acevedo, Elizabeth",Banned Pending Investigation,Texas,Fredericksburg Independent School District,March 2022
...,...,...,...,...,...,...
4004,Boy Toy,"Lyga, Barry",Banned in Libraries and Classrooms,Virginia,Spotsylvania County Public Schools,October 2022
4005,Red Hood,"Arnold, Elana K.",Banned in Libraries and Classrooms,Virginia,Spotsylvania County Public Schools,October 2022
4006,"Red, White, and Royal Blue","McQuiston, Casey",Banned in Libraries and Classrooms,Virginia,Spotsylvania County Public Schools,October 2022
4007,Gender Queer: A Memoir,"Kobabe, Maia",Banned Pending Investigation,Wyoming,Natrona County Schools,October 2022


In [15]:
ace_of_spades = combined[combined["Title"] =="Ace of Spades"]
ace_of_spades.head()

Unnamed: 0,Title,Author,Type of Ban,State,District,Date of Challenge/Removal
0,Ace of Spades,"Àbíké-Íyímídé, Faridah",Banned in Libraries and Classrooms,Florida,Indian River County School District,November 2021
2626,Ace of Spades,"Àbíké-Íyímídé, Faridah",Banned Pending Investigation,Florida,Escambia County Public Schools,September 2022


In [16]:
both_bans = combined[combined["Type of Ban"]== "Banned in Libraries and Classrooms"].copy()
new_classroom_rows = both_bans.copy()
new_classroom_rows['Type of Ban']= "Banned in Classrooms"

new_library_rows = both_bans.copy()
new_library_rows['Type of Ban']= "Banned in Libraries"

combined_filtered = combined[combined['Type of Ban'] != 'Banned in Libraries and Classrooms']
combined_filtered = pd.concat([combined, new_classroom_rows, new_library_rows], ignore_index = True)

combined_filtered.head()

Unnamed: 0,Title,Author,Type of Ban,State,District,Date of Challenge/Removal
0,Ace of Spades,"Àbíké-Íyímídé, Faridah",Banned in Libraries and Classrooms,Florida,Indian River County School District,November 2021
1,Clap When You Land,"Acevedo, Elizabeth",Banned in Classrooms,Pennsylvania,Central York School District,August 2021
2,The Poet X,"Acevedo, Elizabeth",Banned in Libraries,Florida,Indian River County School District,November 2021
3,The Poet X,"Acevedo, Elizabeth",Banned in Libraries and Classrooms,New York,Marlboro Central School District,February 2022
4,The Poet X,"Acevedo, Elizabeth",Banned Pending Investigation,Texas,Fredericksburg Independent School District,March 2022


In [17]:
# create new columns for count for each type of ban
combined_filtered["Library Ban"] = (combined_filtered["Type of Ban"] == "Banned in Libraries").astype(int)
combined_filtered["School Ban"] = (combined_filtered["Type of Ban"] == "Banned in Classrooms").astype(int)
combined_filtered["Pending Investigation"] = (combined_filtered["Type of Ban"] == "Banned Pending Investigation").astype(int)
combined_filtered['Total'] = combined_filtered["Library Ban"] + combined_filtered["School Ban"] + combined_filtered["Pending Investigation"]
combined_filtered.head()


Unnamed: 0,Title,Author,Type of Ban,State,District,Date of Challenge/Removal,Library Ban,School Ban,Pending Investigation,Total
0,Ace of Spades,"Àbíké-Íyímídé, Faridah",Banned in Libraries and Classrooms,Florida,Indian River County School District,November 2021,0,0,0,0
1,Clap When You Land,"Acevedo, Elizabeth",Banned in Classrooms,Pennsylvania,Central York School District,August 2021,0,1,0,1
2,The Poet X,"Acevedo, Elizabeth",Banned in Libraries,Florida,Indian River County School District,November 2021,1,0,0,1
3,The Poet X,"Acevedo, Elizabeth",Banned in Libraries and Classrooms,New York,Marlboro Central School District,February 2022,0,0,0,0
4,The Poet X,"Acevedo, Elizabeth",Banned Pending Investigation,Texas,Fredericksburg Independent School District,March 2022,0,0,1,1


In [18]:
ace_of_spades = combined_filtered[combined_filtered["Title"] =="Ace of Spades"]
ace_of_spades.head()

Unnamed: 0,Title,Author,Type of Ban,State,District,Date of Challenge/Removal,Library Ban,School Ban,Pending Investigation,Total
0,Ace of Spades,"Àbíké-Íyímídé, Faridah",Banned in Libraries and Classrooms,Florida,Indian River County School District,November 2021,0,0,0,0
2626,Ace of Spades,"Àbíké-Íyímídé, Faridah",Banned Pending Investigation,Florida,Escambia County Public Schools,September 2022,0,0,1,1
4009,Ace of Spades,"Àbíké-Íyímídé, Faridah",Banned in Classrooms,Florida,Indian River County School District,November 2021,0,1,0,1
4706,Ace of Spades,"Àbíké-Íyímídé, Faridah",Banned in Libraries,Florida,Indian River County School District,November 2021,1,0,0,1


In [19]:
# group the dataframe by type of ban
combined_filtered = combined_filtered.drop(columns=["District", "Date of Challenge/Removal", "Type of Ban"]) 
combined_filtered.head()


Unnamed: 0,Title,Author,State,Library Ban,School Ban,Pending Investigation,Total
0,Ace of Spades,"Àbíké-Íyímídé, Faridah",Florida,0,0,0,0
1,Clap When You Land,"Acevedo, Elizabeth",Pennsylvania,0,1,0,1
2,The Poet X,"Acevedo, Elizabeth",Florida,1,0,0,1
3,The Poet X,"Acevedo, Elizabeth",New York,0,0,0,0
4,The Poet X,"Acevedo, Elizabeth",Texas,0,0,1,1


In [20]:
combined_filtered = combined_filtered.groupby(["Title", "State"]).sum().reset_index()

#["Library Ban", "School Ban", "Pending Investigation", "Total"].sum()
combined_filtered.head()

Unnamed: 0,Title,State,Author,Library Ban,School Ban,Pending Investigation,Total
0,"""Multiplication Is for White People"": Raising ...",Pennsylvania,"Delpit, Lisa",0,1,0,1
1,#BlackLivesMatter: Protesting Racism,Florida,"Thomas, Rachael L.",0,0,1,1
2,#BlackLivesMatter: Protesting Racism,Texas,"Thomas, Rachael L.",0,0,1,1
3,#Hockey,Texas,"Ukazu, NgoziUkazu, NgoziUkazu, Ngozi",1,1,0,2
4,"#Hockey (Check, Please! Series)",Georgia,"Ukazu, NgoziUkazu, NgoziUkazu, Ngozi",1,1,0,2


In [21]:
# introduce a list of 13 most banned books from the american library association (ALA)
most_banned_13 = ["Gender Queer: A Memoir", "All Boys Aren't Blue", "The Bluest Eye", "Flamer", 
                  "Looking for Alaska", "The Perks of Being a Wallflower", "Lawn Boy", 
                  "The Absolutely True Diary of a Part-Time Indian", "Out of Darkness",
                 "A Court of Mist and Fury", "Crank", "Me and Earl and the Dying Girl", 
                  "This Book Is Gay"]

In [22]:
# subset the dataframe to 13 most banned
combined_most_banned = combined_filtered[combined_filtered["Title"].isin(most_banned_13)].sort_values(by='State')
combined_most_banned

Unnamed: 0,Title,State,Author,Library Ban,School Ban,Pending Investigation,Total
922,Gender Queer: A Memoir,Alaska,"Kobabe, MaiaKobabe, MaiaKobabe, Maia",1,1,0,2
2806,This Book Is Gay,California,"Dawson, JunoDawson, JunoDawson, Juno",1,1,0,2
923,Gender Queer: A Memoir,Colorado,"Kobabe, Maia",0,0,1,1
833,Flamer,Colorado,"Curato, Mike",0,0,1,1
53,A Court of Mist and Fury,Florida,"Maas, Sarah J.Maas, Sarah J.Maas, Sarah J.Maas...",2,2,3,7
...,...,...,...,...,...,...,...
941,Gender Queer: A Memoir,Washington,"Kobabe, MaiaKobabe, Maia",0,0,2,2
842,Flamer,Wisconsin,"Curato, MikeCurato, MikeCurato, Mike",1,1,0,2
1789,Out of Darkness,Wisconsin,"Pérez, Ashley Hope",1,0,0,1
2817,This Book Is Gay,Wisconsin,"Dawson, JunoDawson, JunoDawson, JunoDawson, Ju...",2,2,0,4


In [25]:
# convert and save the dataframe as json
combined_most_banned.to_json('pen_13_most_banned.json', orient = 'records')

## Open Library Data

In [None]:
# get request to open library api to see how the data is formatted
url = "http://openlibrary.org/search.json?q=Flamer&limit=1"
response = requests.get(url)
# print(response.json())

In [None]:
# api call to open library to get book jacket images
# def get_jacket_art(title):
#     url = f"http://openlibrary.org/search.json?q={title}&limit=1"
#     response=requests.get(url)
#     if response.status_code == 200:
#         data = response.json()
#         if data['docs']:
#             cover_id = data['docs'][0]['cover_i']
#             if cover_id:
#                 cover_query = f"http://covers.openlibrary.org/b/id/{cover_id}-L.jpg"
#                 cover_image = requests.get(cover_query).content
#                 with open(os.path.join("cover_images", f"{title}_cover.jpg"), 'wb') as c:
#                     c.write(cover_image)
#                 print(f"Cover art for {title} saved as {title}_cover.jpg")
#             else:
#                 print(f"No cover art found for {title}")
#         else:
#             print(f"{title} not found in Open Library")
#     else:
#         print('API call failed, try again')

# for title in most_banned_13:
#     get_jacket_art(title)

In [None]:
# use open library to get data on books
book_info_dict = {}

def get_book_info(title):
    url = f"http://openlibrary.org/search.json?q={title}&limit=1"
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()
        if data['docs']:
            book_info_dict[title] = {
                'title': data['docs'][0]['title'],
                'author': data['docs'][0]['author_name'][0] if 'author_name' in data['docs'][0] else None,
                'publisher': data['docs'][0]['publisher'][0] if 'publisher' in data['docs'][0] else None,
                'publish_date': data['docs'][0]['first_publish_year'] if 'first_publish_year' in data['docs'][0] else None
            }
        else:
            print(f"{title} not found in Open Library")
    else:
        print("API call failed, try again")
        

for title in most_banned_13:
    get_book_info(title)

# Save all book information as a single JSON file
with open("book_info.json", 'w') as f:
    json.dump(book_info_dict, f, indent=4)

print("All book information saved as 'all_books_info.json'")