# Module 2: Text into Data: Importing a Text

## Efrain Olivares: dpy8wq

* DS 5001: Exploratory Text Analytics
* Raf Alvarado

# Set Up

In [1]:
epub_file = "pg161-1_ex.txt"
csv_file = 'austen-sense-and-sensibility.csv'

In [2]:
OHCO = ['chap_num', 'para_num', 'sent_num', 'token_num']

In [3]:
import pandas as pd
pd.set_option('display.max_rows', 20)
%matplotlib inline

# Import file into a dataframe

In [4]:
epub = open(epub_file, 'r', encoding='utf-8-sig').readlines()
df = pd.DataFrame(epub, columns=['line_str'])
df.index.name = 'line_num'
df.line_str = df.line_str.str.strip()

In [5]:
df.sample(10)

Unnamed: 0_level_0,line_str
line_num,Unnamed: 1_level_1
3228,please me better than the finest banditti in t...
10086,"good humour could do, to make them feel themse..."
7758,
10996,"and talents, united a disposition naturally op..."
8590,"opinion,-and to make Marianne, by a resemblanc..."
3331,
2945,
5173,
3935,the assurances of her husband and mother on th...
7254,"""I am sorry we cannot see your sister, Miss Da..."


# Extract title of work from first line

In [6]:
title = df.loc[0].line_str.replace('The Project Gutenberg EBook of ', '')
df['title'] = title

In [7]:
print(title)

﻿Sense and Sensibility, by Jane Austen


In [8]:
df.head()

Unnamed: 0_level_0,line_str,title
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1
0,﻿The Project Gutenberg EBook of Sense and Sens...,"﻿Sense and Sensibility, by Jane Austen"
1,,"﻿Sense and Sensibility, by Jane Austen"
2,This eBook is for the use of anyone anywhere a...,"﻿Sense and Sensibility, by Jane Austen"
3,almost no restrictions whatsoever. You may co...,"﻿Sense and Sensibility, by Jane Austen"
4,re-use it under the terms of the Project Guten...,"﻿Sense and Sensibility, by Jane Austen"


# Remove Gutenberg's front and back matter

In [9]:
a = df.line_str.str.match(r"\*\*\*\s*START OF (THE|THIS) PROJECT")
b = df.line_str.str.match(r"\*\*\*\s*END OF (THE|THIS) PROJECT")

In [10]:
an = df.loc[a].index[0]
bn = df.loc[b].index[0]

In [11]:
df = df.loc[an + 1 : bn - 2]

In [12]:
df

Unnamed: 0_level_0,line_str,title
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1
19,,"﻿Sense and Sensibility, by Jane Austen"
20,Special thanks are due to Sharon Partridge for...,"﻿Sense and Sensibility, by Jane Austen"
21,proofreading and correction of this etext.,"﻿Sense and Sensibility, by Jane Austen"
22,,"﻿Sense and Sensibility, by Jane Austen"
23,,"﻿Sense and Sensibility, by Jane Austen"
...,...,...
12662,,"﻿Sense and Sensibility, by Jane Austen"
12663,,"﻿Sense and Sensibility, by Jane Austen"
12664,,"﻿Sense and Sensibility, by Jane Austen"
12665,,"﻿Sense and Sensibility, by Jane Austen"


# Chunk by chapter

## Find all chapter headers

In [13]:
chap_lines = df.line_str.str.match(r"^\s*(chapter|letter)\s+(\d+)", case=False)

In [14]:
df.loc[chap_lines]

Unnamed: 0_level_0,line_str,title
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1
42,CHAPTER 1,"﻿Sense and Sensibility, by Jane Austen"
196,CHAPTER 2,"﻿Sense and Sensibility, by Jane Austen"
399,CHAPTER 3,"﻿Sense and Sensibility, by Jane Austen"
562,CHAPTER 4,"﻿Sense and Sensibility, by Jane Austen"
757,CHAPTER 5,"﻿Sense and Sensibility, by Jane Austen"
...,...,...
11279,CHAPTER 46,"﻿Sense and Sensibility, by Jane Austen"
11572,CHAPTER 47,"﻿Sense and Sensibility, by Jane Austen"
11839,CHAPTER 48,"﻿Sense and Sensibility, by Jane Austen"
11987,CHAPTER 49,"﻿Sense and Sensibility, by Jane Austen"


## Assign numbers to chapters

In [15]:
chap_nums = [i+1 for i in range(df.loc[chap_lines].shape[0])]

In [16]:
df.loc[chap_lines, 'chap_num'] = chap_nums

## Forward-fill chapter numbers to following text lines

In [17]:
df.chap_num = df.chap_num.ffill()

## Clean up

In [18]:
df = df.loc[~df.chap_num.isna()] # Remove chapter heading lines
df = df.loc[~chap_lines] # Remove everything before Chapter 1
df.chap_num = df.chap_num.astype('int') # Convert chap_num from float to int

In [19]:
df.sample(10)

Unnamed: 0_level_0,line_str,title,chap_num
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
324,"payment of such a sum, on every rent day, is b...","﻿Sense and Sensibility, by Jane Austen",2
745,"at some distance from Norland, than immediatel...","﻿Sense and Sensibility, by Jane Austen",4
11204,"friendship, is enough to prove him one of the ...","﻿Sense and Sensibility, by Jane Austen",45
8260,"their being together, when it was finished. N...","﻿Sense and Sensibility, by Jane Austen",36
1445,"about and spraining of ankles.""","﻿Sense and Sensibility, by Jane Austen",9
6693,"every page. Her mother, still confident of th...","﻿Sense and Sensibility, by Jane Austen",31
2309,"off in such a hurry seems very like it. Well,...","﻿Sense and Sensibility, by Jane Austen",14
10212,The next day produced little or no alteration ...,"﻿Sense and Sensibility, by Jane Austen",43
2277,Could Elinor have listened to her without inte...,"﻿Sense and Sensibility, by Jane Austen",13
1050,he hoped the young ladies would not find it so...,"﻿Sense and Sensibility, by Jane Austen",7


## Group lines by chapter num 

In [20]:
dfc = df.groupby(OHCO[:1]).line_str.apply(lambda x: '\n'.join(x)).to_frame() # Make big string

In [21]:
dfc.head()

Unnamed: 0_level_0,line_str
chap_num,Unnamed: 1_level_1
1,\n\nThe family of Dashwood had long been settl...
2,\n\nMrs. John Dashwood now installed herself m...
3,\n\nMrs. Dashwood remained at Norland several ...
4,"\n\n""What a pity it is, Elinor,"" said Marianne..."
5,"\n\nNo sooner was her answer dispatched, than ..."


# Split into paragraphs 

In [22]:
dfp = dfc['line_str'].str.split(r'\n\n+', expand=True).stack()\
    .to_frame().rename(columns={0:'para_str'})

In [23]:
dfp.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,para_str
chap_num,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0,
1,1,The family of Dashwood had long been settled i...
1,2,"By a former marriage, Mr. Henry Dashwood had o..."
1,3,"The old gentleman died: his will was read, and..."
1,4,"Mr. Dashwood's disappointment was, at first, s..."


In [24]:
dfp.index.names = OHCO[:2]

In [25]:
dfp.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,para_str
chap_num,para_num,Unnamed: 2_level_1
1,0,
1,1,The family of Dashwood had long been settled i...
1,2,"By a former marriage, Mr. Henry Dashwood had o..."
1,3,"The old gentleman died: his will was read, and..."
1,4,"Mr. Dashwood's disappointment was, at first, s..."


In [26]:
dfp['para_str'] = dfp['para_str'].str.replace(r'\n', ' ').str.strip()
dfp = dfp[~dfp['para_str'].str.match(r'^\s*$')] # Remove empty paragraphs

In [27]:
dfp.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,para_str
chap_num,para_num,Unnamed: 2_level_1
1,1,The family of Dashwood had long been settled i...
1,2,"By a former marriage, Mr. Henry Dashwood had o..."
1,3,"The old gentleman died: his will was read, and..."
1,4,"Mr. Dashwood's disappointment was, at first, s..."
1,5,His son was sent for as soon as his danger was...


# Split into sentences

NOTE: ADDED `"` to regex in `split()`

In [28]:
dfs = dfp['para_str'].str.split(r'[.?!;:"]+', expand=True).stack()\
    .to_frame().rename(columns={0:'sent_str'})

In [29]:
dfs.index.names = OHCO[:3]

In [30]:
dfs = dfs[~dfs['sent_str'].str.match(r'^\s*$')] # Remove empty paragraphs

In [31]:
dfs.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sent_str
chap_num,para_num,sent_num,Unnamed: 3_level_1
1,1,0,The family of Dashwood had long been settled i...
1,1,1,"Their estate\nwas large, and their residence..."
1,1,2,The late owner of this estate was a single\n...
1,1,3,"But her\ndeath, which happened ten years bef..."
1,1,4,"for to supply her loss, he invited and receiv..."


# Split into tokens

In [32]:
dft = dfs['sent_str'].str.split(r"[\s',-]+", expand=True).stack()\
    .to_frame().rename(columns={0:'token_str'})

In [33]:
dft.index.names = OHCO[:4]

In [34]:
type(dft)

pandas.core.frame.DataFrame

In [35]:
dft.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,token_str
chap_num,para_num,sent_num,token_num,Unnamed: 4_level_1
1,1,0,0,The
1,1,0,1,family
1,1,0,2,of
1,1,0,3,Dashwood
1,1,0,4,had


# Gathering by Content Object

In [36]:
sents = dft.groupby(OHCO[:3]).token_str.apply(lambda x: ' '.join(x)).to_frame().rename(columns={'token_str':'content'})
paras = dft.groupby(OHCO[:2]).token_str.apply(lambda x: ' '.join(x)).to_frame().rename(columns={'token_str':'content'})
chaps = dft.groupby(OHCO[:1]).token_str.apply(lambda x: ' '.join(x)).to_frame().rename(columns={'token_str':'content'})

In [37]:
def gather(ohco_level):
    return df.groupby(OHCO[:ohco_level]).token_str\
        .apply(lambda x: ' '.join(x))\
        .to_frame()\
        .rename(columns={'token_str':'content'})

In [38]:
sents.sample(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,content
chap_num,para_num,sent_num,Unnamed: 3_level_1
46,18,1,Elinor joyfully treasured her words as she an...
2,22,5,I would not bind myself to allow them any thi...
25,14,3,said Marianne
1,13,5,They gave themselves up wholly to their sorro...
39,23,6,If however by an unforeseen chance it should ...
28,2,2,After some time spent in saying little or doi...
13,54,2,said almost every body
50,11,19,Lucy became as necessary to Mrs
26,2,1,Jennings might be expected to be
18,18,2,said he


In [39]:
#dft.token_str.str.len().plot.hist(bins=10, title="Tokens")

In [40]:
#sents.content.str.len().plot.hist(bins=20, title='Sentences')

In [41]:
#paras.content.str.len().plot.hist(title="Paragraphs")

In [42]:
#chaps.content.str.len().plot.hist(title="Chapters")

# Save work to CSV

In [43]:
dft.to_csv(csv_file)

In [44]:
dft.head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,token_str
chap_num,para_num,sent_num,token_num,Unnamed: 4_level_1
1,1,0,0,The
1,1,0,1,family
1,1,0,2,of
1,1,0,3,Dashwood
1,1,0,4,had
1,1,0,5,long
1,1,0,6,been
1,1,0,7,settled
1,1,0,8,in
1,1,0,9,Sussex


# extend the notebook to combine both Persuasion and Sense and Sensibility into a single dataframe with an appropriately modified OHCO list.

In [45]:
# Import the OHCO file from module 1
df_persuasion = pd.read_csv('austen-persuasion.csv')
#df_persuasion.head(4)

In [46]:
# use the guttenberg ids for the books, so we can trace back
book_id_1 = '161'
book_id_2 = '105'

In [47]:
# add book_id column to module 1 OHCO
pers_cols = list(df_persuasion.columns)
df_persuasion['book_id'] = book_id_2
df_persuasion = df_persuasion[['book_id'] + pers_cols]
#df_persuasion.head(4)

In [48]:
# similarly, add book_id for the sense and sensibility file
dft.reset_index(inplace=True)
sens_cols = list(dft.columns)
dft['book_id'] = book_id_1
dft = dft[['book_id'] + sens_cols]
#dft.head()

In [49]:
# combine them into one new OHCO
df_ohco = pd.concat([dft, df_persuasion], axis=0, ignore_index=True)

In [50]:
# Display random sample of the combined OHCO
df_ohco.sample(40)

Unnamed: 0,book_id,chap_num,para_num,sent_num,token_num,token_str
77163,161,34,19,1,21,Lady
149991,105,8,22,1,14,volume
42863,161,22,31,5,19,for
98044,161,41,12,2,5,not
98512,161,41,19,7,20,anxiety
...,...,...,...,...,...,...
70898,161,32,14,2,37,of
100077,161,41,38,4,7,who
16155,161,10,13,0,21,being
154072,105,9,19,7,7,a


In [51]:
# TODO:
# Filter out NaNs and empty tokens
# strip commas, quotes and aother non characters from tokens