# Module 2: Text into Data: Importing a Text

* DS 5001: Exploratory Text Analytics
* Raf Alvarado

# Set Up

In [1]:
import pandas as pd
%matplotlib inline

In [2]:
epub_file ="pg161.txt"
other_file = "pg105.txt"
csv_file = 'austen-persuasion.csv'

In [3]:
OHCO = ['title', 'chap_num', 'para_num', 'sent_num', 'token_num']

# Import file into a dataframe

In [4]:
epub = open(epub_file, 'r', encoding='utf-8-sig').readlines()
df = pd.DataFrame(epub, columns=['line_str'])
df.index.name = 'line_num'
df.line_str = df.line_str.str.strip()

# Extract title of work from first line

In [5]:
title = df.loc[0].line_str.replace('The Project Gutenberg EBook of ', '')
df['title'] = title

In [6]:
print(title)

Sense and Sensibility, by Jane Austen


In [7]:
df.head()

Unnamed: 0_level_0,line_str,title
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1
0,﻿The Project Gutenberg EBook of Sense and Sens...,"﻿Sense and Sensibility, by Jane Austen"
1,,"﻿Sense and Sensibility, by Jane Austen"
2,This eBook is for the use of anyone anywhere a...,"﻿Sense and Sensibility, by Jane Austen"
3,almost no restrictions whatsoever. You may co...,"﻿Sense and Sensibility, by Jane Austen"
4,re-use it under the terms of the Project Guten...,"﻿Sense and Sensibility, by Jane Austen"


In [8]:
epub2 = open(other_file, 'r', encoding='utf-8-sig').readlines()
df2 = pd.DataFrame(epub2, columns=['line_str'])
title2 = df2.loc[0].line_str.replace('The Project Gutenberg EBook of ', '')
df2['title'] = title2.strip()
df2.index.name = 'line_num'
df2.line_str = df2.line_str.str.strip()


In [9]:
print(title2)
df2.head()

Persuasion, by Jane Austen



Unnamed: 0_level_0,line_str,title
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"The Project Gutenberg EBook of Persuasion, by ...","Persuasion, by Jane Austen"
1,,"Persuasion, by Jane Austen"
2,This eBook is for the use of anyone anywhere a...,"Persuasion, by Jane Austen"
3,almost no restrictions whatsoever. You may co...,"Persuasion, by Jane Austen"
4,re-use it under the terms of the Project Guten...,"Persuasion, by Jane Austen"


# Remove Gutenberg's front and back matter

In [10]:
a = df.line_str.str.match(r"\*\*\*\s*START OF (THE|THIS) PROJECT")
b = df.line_str.str.match(r"\*\*\*\s*END OF (THE|THIS) PROJECT")

In [11]:
an = df.loc[a].index[0]
bn = df.loc[b].index[0]

In [12]:
df = df.loc[an + 1 : bn - 2]

In [13]:
a = df2.line_str.str.match(r"\*\*\*\s*START OF (THE|THIS) PROJECT")
b = df2.line_str.str.match(r"\*\*\*\s*END OF (THE|THIS) PROJECT")
an = df2.loc[a].index[0]
bn = df2.loc[b].index[0]
df2 = df2.loc[an + 1 : bn - 2]


In [14]:
df2

Unnamed: 0_level_0,line_str,title
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1
19,,"Persuasion, by Jane Austen"
20,,"Persuasion, by Jane Austen"
21,,"Persuasion, by Jane Austen"
22,,"Persuasion, by Jane Austen"
23,Produced by Sharon Partridge and Martin Ward. ...,"Persuasion, by Jane Austen"
...,...,...
8367,,"Persuasion, by Jane Austen"
8368,,"Persuasion, by Jane Austen"
8369,,"Persuasion, by Jane Austen"
8370,,"Persuasion, by Jane Austen"


# Chunk by chapter

## Find all chapter headers

In [16]:
chap_lines = df.line_str.str.match(r"^\s*(chapter|letter)\s+(\d+)", case=False)
chap_lines2 = df2.line_str.str.match(r"^\s*(chapter|letter)\s+(\d+)", case=False)


In [17]:
df.loc[chap_lines]

Unnamed: 0_level_0,line_str,title
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1
42,CHAPTER 1,"﻿Sense and Sensibility, by Jane Austen"
196,CHAPTER 2,"﻿Sense and Sensibility, by Jane Austen"
399,CHAPTER 3,"﻿Sense and Sensibility, by Jane Austen"
562,CHAPTER 4,"﻿Sense and Sensibility, by Jane Austen"
757,CHAPTER 5,"﻿Sense and Sensibility, by Jane Austen"
859,CHAPTER 6,"﻿Sense and Sensibility, by Jane Austen"
987,CHAPTER 7,"﻿Sense and Sensibility, by Jane Austen"
1113,CHAPTER 8,"﻿Sense and Sensibility, by Jane Austen"
1245,CHAPTER 9,"﻿Sense and Sensibility, by Jane Austen"
1449,CHAPTER 10,"﻿Sense and Sensibility, by Jane Austen"


## Assign numbers to chapters

In [18]:
chap_nums = [i+1 for i in range(df.loc[chap_lines].shape[0])]
chap_nums2 = [i+1 for i in range(df2.loc[chap_lines].shape[0])]

In [19]:
df.loc[chap_lines, 'chap_num'] = chap_nums
df2.loc[chap_lines, 'chap_num'] = chap_nums2


## Forward-fill chapter numbers to following text lines

In [20]:
df.chap_num = df.chap_num.ffill()
df2.chap_num = df2.chap_num.ffill()

In [23]:
df2

Unnamed: 0_level_0,line_str,title,chap_num
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
43,,"﻿Sense and Sensibility, by Jane Austen",1
44,,"﻿Sense and Sensibility, by Jane Austen",1
45,The family of Dashwood had long been settled i...,"﻿Sense and Sensibility, by Jane Austen",1
46,"was large, and their residence was at Norland ...","﻿Sense and Sensibility, by Jane Austen",1
47,"their property, where, for many generations, t...","﻿Sense and Sensibility, by Jane Austen",1
...,...,...,...
12662,,"﻿Sense and Sensibility, by Jane Austen",50
12663,,"﻿Sense and Sensibility, by Jane Austen",50
12664,,"﻿Sense and Sensibility, by Jane Austen",50
12665,,"﻿Sense and Sensibility, by Jane Austen",50


## Clean up

In [24]:
df = df.loc[~df.chap_num.isna()] # Remove chapter heading lines
df = df.loc[~chap_lines] # Remove everything before Chapter 1
df.chap_num = df.chap_num.astype('int') # Convert chap_num from float to int

df2 = df2.loc[~df2.chap_num.isna()] # Remove chapter heading lines
df2 = df2.loc[~chap_lines] # Remove everything before Chapter 1
df2.chap_num = df2.chap_num.astype('int') # Convert chap_num from float to int

In [26]:
df2

Unnamed: 0_level_0,line_str,title,chap_num
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
43,,"Persuasion, by Jane Austen",1
44,,"Persuasion, by Jane Austen",1
45,,"Persuasion, by Jane Austen",1
46,,"Persuasion, by Jane Austen",1
47,Chapter 1,"Persuasion, by Jane Austen",1
...,...,...,...
8367,,"Persuasion, by Jane Austen",36
8368,,"Persuasion, by Jane Austen",36
8369,,"Persuasion, by Jane Austen",36
8370,,"Persuasion, by Jane Austen",36


In [27]:
df = pd.concat([df, df2])

In [28]:
df.sample(10)

Unnamed: 0_level_0,line_str,title,chap_num
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4083,,"Persuasion, by Jane Austen",21
6062,,"Persuasion, by Jane Austen",29
5278,"""February 1st.","Persuasion, by Jane Austen",26
5271,"bell, requested the footman who answered it to...","﻿Sense and Sensibility, by Jane Austen",26
7309,"circumstances, could only be exposing them to ...","Persuasion, by Jane Austen",33
11421,"towards it, Marianne calmly said,","﻿Sense and Sensibility, by Jane Austen",46
11260,"""At Delaford, she will be within an easy dista...","﻿Sense and Sensibility, by Jane Austen",45
9346,"comfortable income. Such as it is, however, m...","﻿Sense and Sensibility, by Jane Austen",39
5140,Russell had fresh arranged all her evening eng...,"Persuasion, by Jane Austen",25
6644,name and livery included; but I will not prete...,"Persuasion, by Jane Austen",31


## Group lines by chapter num 

In [29]:
dfc = df.groupby(OHCO[:2]).line_str.apply(lambda x: '\n'.join(x)).to_frame() # Make big string

In [32]:
dfc

Unnamed: 0_level_0,Unnamed: 1_level_0,line_str
title,chap_num,Unnamed: 2_level_1
"Persuasion, by Jane Austen",1,"\n\n\n\nChapter 1\n\n\nSir Walter Elliot, of K..."
"Persuasion, by Jane Austen",2,"\nShe had had a disappointment, moreover, whic..."
"Persuasion, by Jane Austen",3,\nSir Walter would quit Kellynch Hall; and aft...
"Persuasion, by Jane Austen",4,but what restrictions I might impose on the us...
"Persuasion, by Jane Austen",5,"treaty, and authorising him to wait on Admiral..."
...,...,...
"﻿Sense and Sensibility, by Jane Austen",46,"\n\nMarianne's illness, though weakening in it..."
"﻿Sense and Sensibility, by Jane Austen",47,\n\nMrs. Dashwood did not hear unmoved the vin...
"﻿Sense and Sensibility, by Jane Austen",48,\n\nElinor now found the difference between th...
"﻿Sense and Sensibility, by Jane Austen",49,"\n\nUnaccountable, however, as the circumstanc..."


# Split into paragraphs 

In [33]:
dfp = dfc['line_str'].str.split(r'\n\n+', expand=True).stack()\
    .to_frame().rename(columns={0:'para_str'})

In [35]:
dfp


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,para_str
title,chap_num,Unnamed: 2_level_1,Unnamed: 3_level_1
"Persuasion, by Jane Austen",1,0,
"Persuasion, by Jane Austen",1,1,Chapter 1
"Persuasion, by Jane Austen",1,2,"Sir Walter Elliot, of Kellynch Hall, in Somers..."
"Persuasion, by Jane Austen",1,3,"""ELLIOT OF KELLYNCH HALL."
"Persuasion, by Jane Austen",1,4,"""Walter Elliot, born March 1, 1760, married, J..."
...,...,...,...
"﻿Sense and Sensibility, by Jane Austen",50,19,"For Marianne, however--in spite of his incivil..."
"﻿Sense and Sensibility, by Jane Austen",50,20,Mrs. Dashwood was prudent enough to remain at ...
"﻿Sense and Sensibility, by Jane Austen",50,21,"Between Barton and Delaford, there was that co..."
"﻿Sense and Sensibility, by Jane Austen",50,22,THE END


In [36]:
dfp.index.names = OHCO[:3]

In [37]:
dfp.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,para_str
title,chap_num,para_num,Unnamed: 3_level_1
"Persuasion, by Jane Austen",1,0,
"Persuasion, by Jane Austen",1,1,Chapter 1
"Persuasion, by Jane Austen",1,2,"Sir Walter Elliot, of Kellynch Hall, in Somers..."
"Persuasion, by Jane Austen",1,3,"""ELLIOT OF KELLYNCH HALL."
"Persuasion, by Jane Austen",1,4,"""Walter Elliot, born March 1, 1760, married, J..."


In [38]:
dfp['para_str'] = dfp['para_str'].str.replace(r'\n', ' ').str.strip()
dfp = dfp[~dfp['para_str'].str.match(r'^\s*$')] # Remove empty paragraphs

  dfp['para_str'] = dfp['para_str'].str.replace(r'\n', ' ').str.strip()


In [39]:
dfp

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,para_str
title,chap_num,para_num,Unnamed: 3_level_1
"Persuasion, by Jane Austen",1,1,Chapter 1
"Persuasion, by Jane Austen",1,2,"Sir Walter Elliot, of Kellynch Hall, in Somers..."
"Persuasion, by Jane Austen",1,3,"""ELLIOT OF KELLYNCH HALL."
"Persuasion, by Jane Austen",1,4,"""Walter Elliot, born March 1, 1760, married, J..."
"Persuasion, by Jane Austen",1,5,Precisely such had the paragraph originally st...
...,...,...,...
"﻿Sense and Sensibility, by Jane Austen",50,19,"For Marianne, however--in spite of his incivil..."
"﻿Sense and Sensibility, by Jane Austen",50,20,Mrs. Dashwood was prudent enough to remain at ...
"﻿Sense and Sensibility, by Jane Austen",50,21,"Between Barton and Delaford, there was that co..."
"﻿Sense and Sensibility, by Jane Austen",50,22,THE END


# Split into sentences

NOTE: ADDED `"` to regex in `split()`

In [40]:
dfs = dfp['para_str'].str.split(r'[.?!;:"]+', expand=True).stack()\
    .to_frame().rename(columns={0:'sent_str'})

In [41]:
dfs.index.names = OHCO[:4]

In [42]:
dfs = dfs[~dfs['sent_str'].str.match(r'^\s*$')] # Remove empty paragraphs

In [43]:
dfs

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,sent_str
title,chap_num,para_num,sent_num,Unnamed: 4_level_1
"Persuasion, by Jane Austen",1,1,0,Chapter 1
"Persuasion, by Jane Austen",1,2,0,"Sir Walter Elliot, of Kellynch Hall, in Somers..."
"Persuasion, by Jane Austen",1,2,1,"there he found occupation for an idle hour, a..."
"Persuasion, by Jane Austen",1,2,2,there his faculties were roused into admirati...
"Persuasion, by Jane Austen",1,2,3,"there any unwelcome sensations, arising from ..."
...,...,...,...,...
"﻿Sense and Sensibility, by Jane Austen",50,20,3,"Jennings, when Marianne was taken from them, ..."
"﻿Sense and Sensibility, by Jane Austen",50,21,0,"Between Barton and Delaford, there was that co..."
"﻿Sense and Sensibility, by Jane Austen",50,21,1,--and among the merits and the happiness of El...
"﻿Sense and Sensibility, by Jane Austen",50,22,0,THE END


# Split into tokens

In [44]:
dft = dfs['sent_str'].str.split(r"[\s',-]+", expand=True).stack()\
    .to_frame().rename(columns={0:'token_str'})

In [45]:
dft.index.names = OHCO[:5]

In [47]:
dft

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,token_str
title,chap_num,para_num,sent_num,token_num,Unnamed: 5_level_1
"Persuasion, by Jane Austen",1,1,0,0,Chapter
"Persuasion, by Jane Austen",1,1,0,1,1
"Persuasion, by Jane Austen",1,2,0,0,Sir
"Persuasion, by Jane Austen",1,2,0,1,Walter
"Persuasion, by Jane Austen",1,2,0,2,Elliot
...,...,...,...,...,...
"﻿Sense and Sensibility, by Jane Austen",50,23,0,8,and
"﻿Sense and Sensibility, by Jane Austen",50,23,0,9,Sensibility
"﻿Sense and Sensibility, by Jane Austen",50,23,0,10,by
"﻿Sense and Sensibility, by Jane Austen",50,23,0,11,Jane


# Gathering by Content Object

In [48]:
sents = dft.groupby(OHCO[:4]).token_str.apply(lambda x: ' '.join(x)).to_frame().rename(columns={'token_str':'content'})
paras = dft.groupby(OHCO[:3]).token_str.apply(lambda x: ' '.join(x)).to_frame().rename(columns={'token_str':'content'})
chaps = dft.groupby(OHCO[:2]).token_str.apply(lambda x: ' '.join(x)).to_frame().rename(columns={'token_str':'content'})

In [49]:
def gather(ohco_level):
    return df.groupby(OHCO[:ohco_level]).token_str\
        .apply(lambda x: ' '.join(x))\
        .to_frame()\
        .rename(columns={'token_str':'content'})

In [50]:
sents.sample(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,content
title,chap_num,para_num,sent_num,Unnamed: 4_level_1
"﻿Sense and Sensibility, by Jane Austen",18,17,0,Sir John never came to the Dashwoods without e...
"﻿Sense and Sensibility, by Jane Austen",11,17,9,appeared to think that he had said too much a...
"Persuasion, by Jane Austen",30,19,6,But why should you be cruel
"Persuasion, by Jane Austen",7,8,5,but she was young and certainly altogether we...
"﻿Sense and Sensibility, by Jane Austen",40,27,4,that you wished to speak with me at least I un...
"Persuasion, by Jane Austen",15,4,3,but it would be shocking to have Henrietta mar...
"﻿Sense and Sensibility, by Jane Austen",11,13,6,and a better acquaintance with the world is w...
"﻿Sense and Sensibility, by Jane Austen",23,7,0,Much as she had suffered from her first conver...
"Persuasion, by Jane Austen",28,17,3,she must confess to herself that she was not ...
"﻿Sense and Sensibility, by Jane Austen",20,27,1,Mr


KeyError: False

# Save work to CSV

In [53]:
dft.to_csv("full-output.csv")

In [52]:
dft

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,token_str
title,chap_num,para_num,sent_num,token_num,Unnamed: 5_level_1
"Persuasion, by Jane Austen",1,1,0,0,Chapter
"Persuasion, by Jane Austen",1,1,0,1,1
"Persuasion, by Jane Austen",1,2,0,0,Sir
"Persuasion, by Jane Austen",1,2,0,1,Walter
"Persuasion, by Jane Austen",1,2,0,2,Elliot
...,...,...,...,...,...
"﻿Sense and Sensibility, by Jane Austen",50,23,0,8,and
"﻿Sense and Sensibility, by Jane Austen",50,23,0,9,Sensibility
"﻿Sense and Sensibility, by Jane Austen",50,23,0,10,by
"﻿Sense and Sensibility, by Jane Austen",50,23,0,11,Jane
