# Preprocessing, text cleaning, and preliminary EDA
---

## 1) Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## 2) Text Importing

In [2]:
# Raise the column display width to 255 characters so long text lines are not cut off when a frame is displayed
pd.set_option('display.max_colwidth', 255)

#### Pandas refuses to let me set \n as the delimiter for lines and treats the .txt files as CSVs, splitting rows on commas. Until I figure out a way to make the pandas import play nicely, I'm importing the old-fashioned way.

In [3]:
book = {}
for i in range(1,8): #Putting all 7 books into dataframes stored within a dictionary for intuitive key reference
    # The with-block closes the file when it exits, so no explicit close() is needed.
    # NOTE(review): no encoding is passed, so the platform default is used -
    # confirm the encoding of the corpus files and pin it here.
    with open(f'../corpora/Book{i}.txt') as book_file:
        lines = book_file.read()

    # One row per '\n'-separated line; blank lines are kept as empty strings.
    book_lines = lines.split('\n')
    book_data_frame = pd.DataFrame({'line': book_lines})
    book[i] = book_data_frame

#### What I would normally use.

In [4]:
#book1 = pd.read_table("../corpora/Book1.txt",sep='\n')
#book1.columns = ['line']

## 3) Exploring Data Structure

In [5]:
# Summary of book 1's lines: row count, number of unique lines, most frequent line and its frequency
book[1].describe()

Unnamed: 0,line
count,15261.0
unique,10077.0
top,
freq,5117.0


In [6]:
#Ten most repeated lines in book 1; the empty line is by far the most common
book[1]['line'].value_counts().iloc[:10]

                 5117
“What?”             5
it?”                4
Hermione.           4
•k k k              3
them.               3
Gryffindor.”        3
again.              3
there.”             2
Harry.              2
Name: line, dtype: int64

In [7]:
#Shape (rows, columns) of every book frame, in dictionary order
[frame.shape for frame in book.values()]

[(15261, 1),
 (16761, 1),
 (21506, 1),
 (37545, 1),
 (49466, 1),
 (32393, 1),
 (38110, 1)]

In [8]:
#Splitting one line on whitespace gives its words, so the list length is its word count
example_line = book[1]['line'].iat[7].split()
(example_line, len(example_line))

(['Mr.',
  'and',
  'Mrs.',
  'Dursley,',
  'of',
  'number',
  'four,',
  'Privet',
  'Drive,'],
 9)

## 4) Preliminary Text Cleaning

#### The text files contain lots of empty lines from paragraph spacing, so I'm going to drop every line that is empty (an empty string).

In [9]:
#Number of empty-string lines in each of the 7 books
[book[number]['line'].eq('').sum() for number in range(1, 8)]

[5117, 5509, 7114, 12293, 15583, 10122, 12133]

In [10]:
def drop_empty_lines(key, books=None):
    """
    Remove every empty line from one book data frame and renumber its rows.

    A row is dropped when any of its cells is exactly the empty string ''
    (or is already null). Whitespace-only lines such as ' ' are kept.
    The cleaned frame replaces the old one under the same key and gets a
    fresh 0..n-1 index.

    Parameters
    ----------
    key : int
        Dictionary key of the book to clean.
    books : dict, optional
        Dictionary mapping keys to book data frames. Defaults to the
        notebook-level ``book`` dictionary.

    Returns
    -------
    None
        The dictionary entry is replaced; nothing is returned.
    """
    if books is None:
        books = book
    frame = books[key]
    # mask() nulls out the cells equal to '' so dropna() can remove those rows.
    # This replaces the element-wise applymap call, which is deprecated in
    # recent pandas releases.
    books[key] = frame.mask(frame == '').dropna().reset_index(drop=True)

In [11]:
#Cleaning all 7 books; the displayed list holds the None returned by each call
list(map(drop_empty_lines, book))

[None, None, None, None, None, None, None]

In [12]:
#Shape of every book frame after the empty lines were dropped
[frame.shape for frame in book.values()]

[(10144, 1),
 (11252, 1),
 (14392, 1),
 (25252, 1),
 (33883, 1),
 (22271, 1),
 (25977, 1)]

## 5) Determining Chapter Breaks in Each Book

In [13]:
#Chapter titles carry no number but appear to be written entirely in capitals
(book[1]['line'].iat[1], book[1]['line'].iat[1].isupper())

('THE BOY WHO LIVED ', True)

#### Filtering for lines where every alphabetical character is capitalized identifies all of the chapter titles, but it also picks up isolated statements that happen to be written in all caps. A couple of chapters with longer titles take up 2 lines; these will need to be compressed into 1 line. Manual identification is needed, which is not too problematic as the number of chapters is limited.
#### Chapter names obtained from the fan driven Harry Potter wiki on [Fandom](https://harrypotter.fandom.com/wiki/List_of_chapters_in_the_Harry_Potter_books#Harry_Potter_and_the_Philosopher's_Stone_(23_June_1991–20_June_1992))

In [14]:
#First few all-caps lines of book 1: chapter title candidates mixed with shouted dialogue
book[1].loc[book[1]['line'].str.isupper()].head()

Unnamed: 0,line
1,THE BOY WHO LIVED
569,THE VANASHIG GLASS
833,“MOTORCYCLES DONT FLY!”
918,MR. DURSLEY! COME AND LOOK AT THIS SNAKE!
919,YOU WONT BELIEVE WHAT IT’S DOING!”


In [15]:
# Both dictionaries are keyed by book number.
chapters = dict()        # book number -> list of chapter title strings
chapters_index = dict()  # book number -> rows of the book frame holding the chapter titles

### Manually inputting indices to identify chapter breaks and collect titles and start line indices.

In [16]:
#Book 1: row positions of the chapter title lines, entered by hand
chapters1 = book[1].iloc[[1,569,1000,1497,1975,2859,3691,4283,4669,5331,5892,6333,7038,7462,7914,8570,9425]]
chapters[1] = chapters1['line'].tolist()  #titles in reading order
chapters_index[1] = chapters1  #title rows; their index labels mark each chapter start

In [17]:
#Book 2: row positions of the chapter title lines, entered by hand
chapters2 = book[2].iloc[[4,340,733,1319,2062,2767,3368,3966,4566,5269,5956,6738,7453,8167,8699,9297,10042,10741]]
chapters[2] = chapters2['line'].tolist()  #titles in reading order
chapters_index[2] = chapters2  #title rows; their index labels mark each chapter start

In [18]:
#Book 3: row positions of the chapter title lines, entered by hand
chapters3 = book[3].iloc[[1,472,972,1549,2222,3113,4000,4589,5280,5992,6925,7674,8321,8885,9621,10365,10957,11528,11821,12482,12743,13746]]
chapters[3] = chapters3['line'].tolist()  #titles in reading order
chapters_index[3] = chapters3  #title rows; their index labels mark each chapter start

In [19]:
#Book 4: row positions of the chapter title lines, entered by hand
chapters4 = book[4].iloc[[1,520,870,1276,1692,2184,2505,3203,3980,4941,5378,5831,6580,7135,7782,8476,9298,9855,10737,11547,
                         12454,13197,13796,14876,15721,16453,17490,18404,19406,19991,20836,21862,22125,22644,23017,23770,24585]]
chapters[4] = chapters4['line'].tolist()  #titles in reading order
chapters_index[4] = chapters4  #title rows; their index labels mark each chapter start

In [20]:
#Book 5: row positions of the chapter title lines, entered by hand
chapters5 = book[5].iloc[[3,756,1592,2282,3048,3776,4658,5264,5836,6903,7689,8498,9640,10781,11840,
                          12771,13547,14451, 15373,16253,17057,18011,19056,20019,21084,22128,23236,
                          24236,25296,26286,27355,28375,29201,29716,30395,31402,31891,32894]]
chapters[5] = chapters5['line'].tolist()  #titles in reading order
chapters_index[5] = chapters5  #title rows; their index labels mark each chapter start

In [21]:
#Book 6: row positions of the chapter title lines, entered by hand
chapters6 = book[6].iloc[[2,651,1285,1907,2710,3521,4313,5188,5744,6549,7341,8007,8729,9439,10285,11067,11881,12717,13598,
                         14422,15270,16015,16787,17519,18277,18969,19797,20391,20857,21590]]
chapters[6] = chapters6['line'].tolist()  #titles in reading order
chapters_index[6] = chapters6  #title rows; their index labels mark each chapter start

In [22]:
#Book 7: row positions of the chapter title lines, entered by hand
chapters7 = book[7].iloc[[2,420,932,1383,2085,2894,3751,4635,5456,5989,6846,7584,8401,9168,9711,10646,11310,12008,12450,
                         13303,13899,14532,15293,16356,17231,17799,18678,19006,19576,20168,20834,21891,22596,23694,24184,24830,25749]]
chapters[7] = chapters7['line'].tolist()  #titles in reading order
chapters_index[7] = chapters7  #title rows; their index labels mark each chapter start

#### I was unable to do this automatically as a couple of chapters had typos in them making it risky to compare them to a list of Harry Potter chapters sourced online. Additionally, based on my method of referencing all lines where every letter is capitalized, a few chapters with vague names like "Gilderoy Lockhart" had duplicates in non-chapter contexts, making it harder to accurately identify all chapters automatically.

In [23]:
"""
This is the code I used to reference every potential chapter name in the books, viewing 20 filtered lines at a time.
"""
# The two lines below are intentionally left commented out: they were only used
# interactively, changing `num` and the iloc window by hand.
#num = 7 #Book number
#book[num][book[num]['line'].str.isupper()].iloc[0:20]
# `pass` is the last statement so the cell displays no output.
pass

In [24]:
#Number of chapter titles collected for each book, from 1 to 7
[len(titles) for titles in chapters.values()]

[17, 18, 22, 37, 38, 30, 37]

In [25]:
#Double checking the last selected title row for book 1 (position 16 is the 17th chapter)
chapters1.iloc[16]

line    THE MAN WITH TWO FACES 
Name: 9425, dtype: object

## 6) Preparing Chapter Text Data

#### Assigning chapter text to a nested dictionary indexed by book number and chapter number

In [26]:
# Nested dictionary: chapter_text[book number][chapter number (1-based)] -> chapter text
chapter_text = {}
for n in range(1,8):
    chapter_text[n] = {}
    # Index labels of the chapter-title rows. They are used as positions in
    # iloc below, which relies on the book frames having a 0..len-1 index.
    title_rows = chapters_index[n].index.tolist()
    # Each chapter ends where the next title row begins; the last chapter runs
    # to the end of the book. len(book[n]) is the final bound so the book's
    # last line is included: the slice end is exclusive, so using the last
    # index (as the previous fallback did) silently dropped that line.
    chapter_ends = title_rows[1:] + [len(book[n])]
    for i, (title_row, end) in enumerate(zip(title_rows, chapter_ends)):
        start = title_row + 1  # skip the title line itself
        # NOTE(review): lines are concatenated with no separator, as before.
        # If the source lines do not all end in whitespace, words at line
        # breaks will be fused - confirm against the corpus.
        text = ''.join(book[n]['line'].iloc[start:end])
        chapter_text[n][i+1] = text

In [27]:
#Confirming number of chapters in each book
for key, chapter_map in chapter_text.items():
    print(f"Book {key} has {len(chapter_map)} chapters")

Book 1 has 17 chapters
Book 2 has 18 chapters
Book 3 has 22 chapters
Book 4 has 37 chapters
Book 5 has 38 chapters
Book 6 has 30 chapters
Book 7 has 37 chapters


#### Writing the chapter_text dictionary to a new Data Frame

In [28]:
# Flatten the nested {book: {chapter: text}} dictionary into one row per chapter
chapter_data = [
    [book_num, chap, text]
    for book_num, chapters_of_book in chapter_text.items()
    for chap, text in chapters_of_book.items()
]
chapter_df = pd.DataFrame(chapter_data, columns=['book_number','chapter_number','chapter_text'])

## 7) Exporting Chapter Data to JSON Files

#### Storing chapter text dictionary as a json file

In [29]:
import json
# The keys of chapter_text are integers; json.dump writes them as strings
# ("1", "2", ...), so convert them back after loading if integer keys are needed.
# The with-block closes the file when it exits, so no explicit close() follows it.
with open("../corpora/chapter_text.json", "w") as json_file:
    json.dump(chapter_text, json_file)

#### Assigning chapter titles to a nested dictionary indexed by book number and chapter number and storing it as a json file

In [30]:
# {book number: {chapter number (1-based): title}}; the integer keys become
# strings once written to JSON.
chapter_titles = {num: dict(enumerate(chapters[num], start=1)) for num in range(1,8)}
# The with-block closes the file when it exits, so no explicit close() follows it.
with open("../corpora/chapter_titles.json", "w") as json_file:
    json.dump(chapter_titles, json_file)