# Module 2: Text into Data: Importing a Text

## Efrain Olivares: dpy8wq

* DS 5001: Exploratory Text Analytics
* Raf Alvarado

# Set Up

In [1]:
epub_file = "pg161-1_ex.txt"
csv_file = 'austen-sense-and-sensibility.csv'

In [2]:
OHCO = ['chap_num', 'para_num', 'sent_num', 'token_num']

In [3]:
import pandas as pd
pd.set_option('display.max_rows', 20)
%matplotlib inline

# Import file into a dataframe

In [4]:
epub = open(epub_file, 'r', encoding='utf-8-sig').readlines()
df = pd.DataFrame(epub, columns=['line_str'])
df.index.name = 'line_num'
df.line_str = df.line_str.str.strip()

In [5]:
df.sample(10)

Unnamed: 0_level_0,line_str
line_num,Unnamed: 1_level_1
3292,"""Who! why yourselves, and the Careys, and Whit..."
505,No sooner did she perceive any symptom of love...
10166,"composure, who, though attending and nursing h..."
681,consider her partiality for Edward in so prosp...
11104,"speak, embraced Elinor again and again, turnin..."
1307,"herself to assist her, was involuntarily hurri..."
822,Her eagerness to be gone from Norland was pres...
11782,
11891,"a gentleman, it was Colonel Brandon himself. ..."
11429,"spot!--shall we ever talk on that subject, Eli..."


# Extract title of work from first line

In [6]:
title = df.loc[0].line_str.replace('The Project Gutenberg EBook of ', '')
df['title'] = title

In [7]:
print(title)

﻿Sense and Sensibility, by Jane Austen


In [8]:
df.head()

Unnamed: 0_level_0,line_str,title
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1
0,﻿The Project Gutenberg EBook of Sense and Sens...,"﻿Sense and Sensibility, by Jane Austen"
1,,"﻿Sense and Sensibility, by Jane Austen"
2,This eBook is for the use of anyone anywhere a...,"﻿Sense and Sensibility, by Jane Austen"
3,almost no restrictions whatsoever. You may co...,"﻿Sense and Sensibility, by Jane Austen"
4,re-use it under the terms of the Project Guten...,"﻿Sense and Sensibility, by Jane Austen"


# Remove Gutenberg's front and back matter

In [9]:
a = df.line_str.str.match(r"\*\*\*\s*START OF (THE|THIS) PROJECT")
b = df.line_str.str.match(r"\*\*\*\s*END OF (THE|THIS) PROJECT")

In [10]:
an = df.loc[a].index[0]
bn = df.loc[b].index[0]

In [11]:
df = df.loc[an + 1 : bn - 2]

In [12]:
df

Unnamed: 0_level_0,line_str,title
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1
19,,"﻿Sense and Sensibility, by Jane Austen"
20,Special thanks are due to Sharon Partridge for...,"﻿Sense and Sensibility, by Jane Austen"
21,proofreading and correction of this etext.,"﻿Sense and Sensibility, by Jane Austen"
22,,"﻿Sense and Sensibility, by Jane Austen"
23,,"﻿Sense and Sensibility, by Jane Austen"
...,...,...
12662,,"﻿Sense and Sensibility, by Jane Austen"
12663,,"﻿Sense and Sensibility, by Jane Austen"
12664,,"﻿Sense and Sensibility, by Jane Austen"
12665,,"﻿Sense and Sensibility, by Jane Austen"


# Chunk by chapter

## Find all chapter headers

In [13]:
chap_lines = df.line_str.str.match(r"^\s*(chapter|letter)\s+(\d+)", case=False)

In [14]:
df.loc[chap_lines]

Unnamed: 0_level_0,line_str,title
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1
42,CHAPTER 1,"﻿Sense and Sensibility, by Jane Austen"
196,CHAPTER 2,"﻿Sense and Sensibility, by Jane Austen"
399,CHAPTER 3,"﻿Sense and Sensibility, by Jane Austen"
562,CHAPTER 4,"﻿Sense and Sensibility, by Jane Austen"
757,CHAPTER 5,"﻿Sense and Sensibility, by Jane Austen"
...,...,...
11279,CHAPTER 46,"﻿Sense and Sensibility, by Jane Austen"
11572,CHAPTER 47,"﻿Sense and Sensibility, by Jane Austen"
11839,CHAPTER 48,"﻿Sense and Sensibility, by Jane Austen"
11987,CHAPTER 49,"﻿Sense and Sensibility, by Jane Austen"


## Assign numbers to chapters

In [15]:
chap_nums = [i+1 for i in range(df.loc[chap_lines].shape[0])]

In [16]:
df.loc[chap_lines, 'chap_num'] = chap_nums

## Forward-fill chapter numbers to following text lines

In [17]:
df.chap_num = df.chap_num.ffill()

## Clean up

In [18]:
df = df.loc[~df.chap_num.isna()] # Remove chapter heading lines
df = df.loc[~chap_lines] # Remove everything before Chapter 1
df.chap_num = df.chap_num.astype('int') # Convert chap_num from float to int

In [19]:
df.sample(10)

Unnamed: 0_level_0,line_str,title,chap_num
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10198,"desirous to share in all her fatigues, and oft...","﻿Sense and Sensibility, by Jane Austen",43
10267,having sat up to have her bed made; and carefu...,"﻿Sense and Sensibility, by Jane Austen",43
7515,necessity of buying a pair of ear-rings for ea...,"﻿Sense and Sensibility, by Jane Austen",33
5835,herself; and to persuade her to check her agit...,"﻿Sense and Sensibility, by Jane Austen",28
7429,"voice to an important whisper--""will be exceed...","﻿Sense and Sensibility, by Jane Austen",33
5135,"likewise, as she did not think it proper that ...","﻿Sense and Sensibility, by Jane Austen",25
3562,,"﻿Sense and Sensibility, by Jane Austen",19
1690,and to aim at the restraint of sentiments whic...,"﻿Sense and Sensibility, by Jane Austen",11
4154,"his opinion, to be intimate, and while his con...","﻿Sense and Sensibility, by Jane Austen",21
3631,,"﻿Sense and Sensibility, by Jane Austen",19


## Group lines by chapter num 

In [20]:
dfc = df.groupby(OHCO[:1]).line_str.apply(lambda x: '\n'.join(x)).to_frame() # Make big string

In [21]:
dfc.head()

Unnamed: 0_level_0,line_str
chap_num,Unnamed: 1_level_1
1,\n\nThe family of Dashwood had long been settl...
2,\n\nMrs. John Dashwood now installed herself m...
3,\n\nMrs. Dashwood remained at Norland several ...
4,"\n\n""What a pity it is, Elinor,"" said Marianne..."
5,"\n\nNo sooner was her answer dispatched, than ..."


# Split into paragraphs 

In [22]:
dfp = dfc['line_str'].str.split(r'\n\n+', expand=True).stack()\
    .to_frame().rename(columns={0:'para_str'})

In [23]:
dfp.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,para_str
chap_num,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0,
1,1,The family of Dashwood had long been settled i...
1,2,"By a former marriage, Mr. Henry Dashwood had o..."
1,3,"The old gentleman died: his will was read, and..."
1,4,"Mr. Dashwood's disappointment was, at first, s..."


In [24]:
dfp.index.names = OHCO[:2]

In [25]:
dfp.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,para_str
chap_num,para_num,Unnamed: 2_level_1
1,0,
1,1,The family of Dashwood had long been settled i...
1,2,"By a former marriage, Mr. Henry Dashwood had o..."
1,3,"The old gentleman died: his will was read, and..."
1,4,"Mr. Dashwood's disappointment was, at first, s..."


In [26]:
dfp['para_str'] = dfp['para_str'].str.replace(r'\n', ' ').str.strip()
dfp = dfp[~dfp['para_str'].str.match(r'^\s*$')] # Remove empty paragraphs

In [27]:
dfp.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,para_str
chap_num,para_num,Unnamed: 2_level_1
1,1,The family of Dashwood had long been settled i...
1,2,"By a former marriage, Mr. Henry Dashwood had o..."
1,3,"The old gentleman died: his will was read, and..."
1,4,"Mr. Dashwood's disappointment was, at first, s..."
1,5,His son was sent for as soon as his danger was...


# Split into sentences

NOTE: ADDED `"` to regex in `split()`

In [28]:
dfs = dfp['para_str'].str.split(r'[.?!;:"]+', expand=True).stack()\
    .to_frame().rename(columns={0:'sent_str'})

In [29]:
dfs.index.names = OHCO[:3]

In [30]:
dfs = dfs[~dfs['sent_str'].str.match(r'^\s*$')] # Remove empty paragraphs

In [31]:
dfs.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sent_str
chap_num,para_num,sent_num,Unnamed: 3_level_1
1,1,0,The family of Dashwood had long been settled i...
1,1,1,"Their estate\nwas large, and their residence..."
1,1,2,The late owner of this estate was a single\n...
1,1,3,"But her\ndeath, which happened ten years bef..."
1,1,4,"for to supply her loss, he invited and receiv..."


# Split into tokens

In [32]:
dft = dfs['sent_str'].str.split(r"[\s',-]+", expand=True).stack()\
    .to_frame().rename(columns={0:'token_str'})

In [33]:
dft.index.names = OHCO[:4]

In [34]:
type(dft)

pandas.core.frame.DataFrame

In [35]:
dft.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,token_str
chap_num,para_num,sent_num,token_num,Unnamed: 4_level_1
1,1,0,0,The
1,1,0,1,family
1,1,0,2,of
1,1,0,3,Dashwood
1,1,0,4,had


# Gathering by Content Object

In [36]:
sents = dft.groupby(OHCO[:3]).token_str.apply(lambda x: ' '.join(x)).to_frame().rename(columns={'token_str':'content'})
paras = dft.groupby(OHCO[:2]).token_str.apply(lambda x: ' '.join(x)).to_frame().rename(columns={'token_str':'content'})
chaps = dft.groupby(OHCO[:1]).token_str.apply(lambda x: ' '.join(x)).to_frame().rename(columns={'token_str':'content'})

In [37]:
def gather(ohco_level):
    return df.groupby(OHCO[:ohco_level]).token_str\
        .apply(lambda x: ' '.join(x))\
        .to_frame()\
        .rename(columns={'token_str':'content'})

In [38]:
sents.sample(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,content
chap_num,para_num,sent_num,Unnamed: 3_level_1
35,21,2,for his heart had not the indifference of Luc...
34,34,3,and such ill timed praise of another at Elino...
6,8,0,They were of course very anxious to see a pers...
20,60,3,
50,1,1,Ferrars just so violent and so steady as to p...
7,9,4,He paid her only the compliment of attention
16,9,5,I know Marianne s heart
4,2,1,No taste for drawing
37,45,0,John Dashwood was greatly astonished
47,15,1,Rather say your mother s imprudence my child


In [39]:
#dft.token_str.str.len().plot.hist(bins=10, title="Tokens")

In [40]:
#sents.content.str.len().plot.hist(bins=20, title='Sentences')

In [41]:
#paras.content.str.len().plot.hist(title="Paragraphs")

In [42]:
#chaps.content.str.len().plot.hist(title="Chapters")

# Save work to CSV

In [43]:
dft.to_csv(csv_file)

In [44]:
dft.head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,token_str
chap_num,para_num,sent_num,token_num,Unnamed: 4_level_1
1,1,0,0,The
1,1,0,1,family
1,1,0,2,of
1,1,0,3,Dashwood
1,1,0,4,had
1,1,0,5,long
1,1,0,6,been
1,1,0,7,settled
1,1,0,8,in
1,1,0,9,Sussex


# Part 2.  Combine Sense and Sensibility OHCO and Persuasion OHCO into one csv

In [45]:
df_persuasion = pd.read_csv('austen-persuasion.csv')
df_persuasion.head()

Unnamed: 0,chap_num,para_num,sent_num,token_num,token_str
0,1,1,0,0,Sir
1,1,1,0,1,Walter
2,1,1,0,2,Elliot
3,1,1,0,3,of
4,1,1,0,4,Kellynch


In [46]:
book_id_1 = '161'
book_id_2 = '105'

In [47]:
pers_cols = list(df_persuasion.columns)
df_persuasion['book_id'] = book_id_2
df_persuasion = df_persuasion[['book_id'] + pers_cols]
df_persuasion.head()

Unnamed: 0,book_id,chap_num,para_num,sent_num,token_num,token_str
0,105,1,1,0,0,Sir
1,105,1,1,0,1,Walter
2,105,1,1,0,2,Elliot
3,105,1,1,0,3,of
4,105,1,1,0,4,Kellynch


In [48]:
dft.reset_index(inplace=True)
sens_cols = list(dft.columns)
dft['book_id'] = book_id_1
dft = dft[['book_id'] + sens_cols]
dft.head()

Unnamed: 0,book_id,chap_num,para_num,sent_num,token_num,token_str
0,161,1,1,0,0,The
1,161,1,1,0,1,family
2,161,1,1,0,2,of
3,161,1,1,0,3,Dashwood
4,161,1,1,0,4,had


In [49]:
df_ohco = pd.concat([dft, df_persuasion], axis=0, ignore_index=True)

In [89]:
df_ohco.sample(40)

Unnamed: 0,book_id,chap_num,para_num,sent_num,token_num,token_str
210312,105,23,39,3,10,nor
62152,161,30,3,6,11,nothing
132720,105,3,4,0,7,in
150602,105,8,31,0,20,order
48067,161,24,25,4,33,friendship
...,...,...,...,...,...,...
197955,105,21,69,5,17,from
183608,105,18,9,25,8,when
77355,161,34,25,1,1,Ferrars
146465,105,7,12,2,2,have


In [None]:
# TODO:
# Filter out NaNs and empty tokens
# strip commas, quotes and aother non characters from tokens