# DS 5001 Week 2 Lab: Text into Data: Functions for Importing a Text

In [1]:
import pandas as pd

## Config

We put everything we know about our text and its processing requirements in a configuration dictionary.

In [2]:
# Everything the importer functions need to know about this text.
config = {
    # Short identifier for the work (spelling corrected from "auusten").
    'slug': "austen-persuasion",
    # Path of the raw text file that import_source() reads.
    'src_file': "data_in/pg105.txt",
    # Patterns for the marker lines that clip_lines() uses to trim the
    # material before and after the body text.
    'cruft': {
        'start_line_pat': r"\*\*\*\s*START OF (THE|THIS) PROJECT",
        # Alternative end marker, kept for reference:
        # 'end_line_pat': r"\*\*\*\s*END OF (THE|THIS) PROJECT"
        'end_line_pat': 'End of the Project Gutenberg EBook'
    },
    # OHCO levels from outermost to innermost. Key order matters: the
    # functions look a level up by its position in this dictionary.
    # 'type' is compared against 'milestone'; any other value (here spelled
    # 'delimitter') sends the level through split_by_delimitter().
    'ohco': {
        'chapter': {
            'pat': r"^\s*(chapter|letter)\s+(\d+)",
            'type': 'milestone'
        },
        'paragraph': {
            'pat': r"\n\n+",
            'type': 'delimitter'
        },
        'sentence': {
            'pat': r"[.?!;:]+",
            'type': 'delimitter'
        },
        'token': {
            'pat': r"[\s',-]+",
            'type': 'delimitter'
        }
    }
}

## Functions

These require that the config dictionary be completed and valid. 

**Assumptions**:

* The functions must be executed in order.
* We have only one milestone to process.
* The milestone result is required by subsequent delimiter functions.


In [3]:
def import_source(col_name='line_str', id_name='line_id', strip=True):
    """Convert a raw text file into a dataframe of lines.

    The file path is taken from the module-level ``config['src_file']``.

    Parameters
    ----------
    col_name : str
        Name of the column that holds the text of each line.
    id_name : str
        Name given to the (positional) index of the returned frame.
    strip : bool
        If True, leading and trailing whitespace (including the newline)
        is removed from every line.

    Returns
    -------
    pandas.DataFrame
        One row per line of the file, in file order.
    """
    src_file = config['src_file']
    # Read inside a context manager so the file handle is closed as soon as
    # the lines are in memory, instead of being left open.
    with open(src_file, 'r') as src:
        lines = src.readlines()
    df = pd.DataFrame({col_name: lines})
    df.index.name = id_name
    if strip:
        df[col_name] = df[col_name].str.strip()
    return df

In [4]:
def clip_lines(df, line_col='line_str'):
    """Trim the lines outside the start and end marker lines.

    The marker patterns are read from the module-level
    ``config['cruft']['start_line_pat']`` and ``['end_line_pat']``; each is
    matched against the beginning of every line, and the first matching
    line is used.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame of lines. The index labels are used arithmetically, so they
        are assumed to be consecutive integers (as produced by a fresh
        positional index).
    line_col : str
        Column holding the line text.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the rows from the line after the start
        marker through the line two before the end marker (both inclusive).

    Raises
    ------
    IndexError
        If either pattern matches no line.
    """
    lines = df[line_col]

    def _first_match(pat, label):
        # na=False: a missing value simply never matches a marker.
        hits = df.index[lines.str.match(pat, na=False).to_numpy(dtype=bool)]
        if len(hits) == 0:
            raise IndexError(
                "clip_lines: no line matches the {} pattern /{}/".format(label, pat))
        return hits[0]

    start_line_num = _first_match(config['cruft']['start_line_pat'], 'start')
    end_line_num = _first_match(config['cruft']['end_line_pat'], 'end')

    # .loc slices include both endpoints, so `end - 2` drops the end marker
    # and the single line directly before it.
    # .copy() hands back an independent frame, so later column assignments
    # on the result do not act on a slice of `df`.
    return df.loc[start_line_num + 1 : end_line_num - 2].copy()

In [5]:
def group_by_milestone(df, ohco_level, 
                       src_col='line_str', 
                       tmp_col='div_idx', 
                       id_suffix='_id', 
                       case=False):
    """Chunk a frame of lines into one string per milestone division.

    A line that matches the level's pattern (taken from the module-level
    ``config['ohco']``) opens a new division. Divisions are numbered from 1
    in order of appearance. Lines before the first heading and the heading
    lines themselves are discarded; the remaining lines of each division
    are joined with newlines and stripped.

    The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with one line of text per row.
    ohco_level : int
        Position of the level in ``config['ohco']``.
    src_col : str
        Column holding the line text.
    tmp_col : str
        Unused; kept so existing calls keep working.
    id_suffix : str
        Suffix appended to the level name to form the index name.
    case : bool
        Whether the heading pattern is matched case-sensitively.

    Returns
    -------
    pandas.DataFrame
        One column named ``<level>_str``, indexed by ``<level><id_suffix>``.
    """
    OHCO = list(config['ohco'].keys())
    div_name = OHCO[ohco_level]
    div_pat = config['ohco'][div_name]['pat']

    print("OHCO =", OHCO)    
    print("Chunking source by {} using column `{}`".format(div_name.upper(), src_col))
    print("{} pattern = /{}/, with case = {}".format(div_name.upper(), div_pat, case))

    is_heading = df[src_col].str.match(div_pat, case=case, na=False).astype(bool)
    # A running count of headings gives every line the number of the
    # division it belongs to; 0 means "before the first heading".
    div_ids = is_heading.cumsum()
    keep = (div_ids > 0) & ~is_heading

    # Attach the division number to a new frame instead of writing a column
    # into the caller's `df`.
    body = df.loc[keep].assign(**{div_name: div_ids[keep].astype('int')})

    out = body.groupby(OHCO[:ohco_level+1])[src_col]\
        .apply(lambda x: '\n'.join(x)).to_frame() # Make big string
    out[src_col] = out[src_col].str.strip()
    out = out.rename(columns={src_col: '{}_str'.format(div_name)})
    out.index.name = "{}{}".format(div_name, id_suffix)

    return out

In [6]:
def split_by_delimitter(df, ohco_level, 
                        src_col_suffix='_str', 
                        join_pat='\n', 
                        id_suffix='_num', 
                        case=False):
    """Split each string of the parent OHCO level into rows of this level.

    The level's delimiter pattern comes from the module-level
    ``config['ohco']``. The source column is the parent level's name plus
    ``src_col_suffix``. Every piece gets its position within the parent as
    a new innermost index level; pieces that are empty or whitespace-only
    are dropped afterwards, so positions may have gaps.

    Parameters
    ----------
    df : pandas.DataFrame
        Output of the previous level (milestone or delimiter function).
    ohco_level : int
        Position of the level in ``config['ohco']``.
    src_col_suffix : str
        Suffix of the parent's text column.
    join_pat : str
        Literal substring replaced by a single space in each piece.
    id_suffix : str
        Suffix appended to the level name to name the new index level.
    case : bool
        Only echoed in the log line; it is not applied to the split.

    Returns
    -------
    pandas.DataFrame
        One column named ``<level>_str``; index = parent index levels plus
        ``<level><id_suffix>``.
    """
    OHCO = list(config['ohco'].keys())
    div_name = OHCO[ohco_level]
    div_pat = config['ohco'][div_name]['pat']
    src_div_name = OHCO[ohco_level-1]
    src_col = "{}{}".format(src_div_name, src_col_suffix)

    print("OHCO =", OHCO)    
    print("Splitting source by {} using column `{}`".format(div_name.upper(), src_col))
    print("{} pattern = /{}/, with case = {}".format(div_name.upper(), div_pat, case))

    # expand=True pads shorter rows with missing values. Drop them
    # explicitly rather than relying on stack() to do it, which depends on
    # the stack implementation in use.
    pieces = df[src_col].str.split(div_pat, expand=True).stack().dropna()
    pieces.index.names = list(df.index.names) + [div_name + id_suffix]

    # regex=False pins literal replacement, so the result does not depend on
    # the pandas version's default for `regex`.
    pieces = pieces.str.replace(join_pat, ' ', regex=False)
    pieces = pieces[~pieces.str.match(r'^\s*$')]

    return pieces.to_frame('{}_str'.format(div_name))

In [29]:
def gather_tokens(df, level=0, col='token_str', glue=' ', collapse=False):
    """Join the strings in `col` back together, one string per outer group.

    Rows are grouped by the first ``level + 1`` index levels of `df`, and
    the values of `col` in each group are concatenated with `glue`.
    `collapse` is accepted but not used.

    Returns a one-column frame named ``doc_str``.
    """
    group_keys = list(df.index.names)[:level + 1]

    def _join(strings):
        return glue.join(strings)

    joined = df.groupby(group_keys)[col].apply(_join)
    return joined.to_frame('doc_str')

## Test 1

In [7]:
# Read the raw file into a frame of lines and trim everything outside the
# start/end marker lines in a single step.
source = clip_lines(import_source())

In [8]:
# Build each OHCO level from the one above it. The second argument is the
# level's position in config['ohco'] (0 = chapter ... 3 = token), so the
# calls must run in this order.
chaps = group_by_milestone(source, 0)
paras = split_by_delimitter(chaps, 1)
sents = split_by_delimitter(paras, 2)
tokens = split_by_delimitter(sents, 3)

OHCO = ['chapter', 'paragraph', 'sentence', 'token']
Chunking source by CHAPTER using column `line_str`
CHAPTER pattern = /^\s*(chapter|letter)\s+(\d+)/, with case = False
OHCO = ['chapter', 'paragraph', 'sentence', 'token']
Splitting source by PARAGRAPH using column `chapter_str`
PARAGRAPH pattern = /\n\n+/, with case = False
OHCO = ['chapter', 'paragraph', 'sentence', 'token']
Splitting source by SENTENCE using column `paragraph_str`
SENTENCE pattern = /[.?!;:]+/, with case = False
OHCO = ['chapter', 'paragraph', 'sentence', 'token']
Splitting source by TOKEN using column `sentence_str`
TOKEN pattern = /[\s',-]+/, with case = False


In [9]:
tokens

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,token_str
chapter_id,paragraph_num,sentence_num,token_num,Unnamed: 4_level_1
1,0,0,0,Sir
1,0,0,1,Walter
1,0,0,2,Elliot
1,0,0,3,of
1,0,0,4,Kellynch
...,...,...,...,...
24,11,6,34,in
24,11,6,35,its
24,11,6,36,national
24,11,6,37,importance


In [10]:
# List the chapters whose text contains the substring 'finis'.
# NOTE(review): str.contains is case-sensitive by default, so this also hits
# words such as "finished" and would not match a capitalised "Finis" --
# confirm which of the two is the intended check.
chaps.loc[chaps.chapter_str.str.contains('finis')]

Unnamed: 0_level_0,chapter_str
chapter_id,Unnamed: 1_level_1
6,"Anne had not wanted this visit to Uppercross, ..."
7,"A very few days more, and Captain Wentworth wa..."
11,The time now approached for Lady Russell's ret...
20,"Sir Walter, his two daughters, and Mrs Clay, w..."
23,One day only had passed since Anne's conversat...


In [11]:
# Show the last 500 characters of chapter 24 to inspect how the text ends.
chaps.loc[24, 'chapter_str'][-500:]

"nd Anne's was in the warmth of her heart.  Anne was tenderness\nitself, and she had the full worth of it in Captain Wentworth's\naffection.  His profession was all that could ever make her friends\nwish that tenderness less, the dread of a future war all that could dim\nher sunshine.  She gloried in being a sailor's wife, but she must pay\nthe tax of quick alarm for belonging to that profession which is, if\npossible, more distinguished in its domestic virtues than in its\nnational importance.\n\n\n\nFinis"

In [12]:
chaps

Unnamed: 0_level_0,chapter_str
chapter_id,Unnamed: 1_level_1
1,"Sir Walter Elliot, of Kellynch Hall, in Somers..."
2,"Mr Shepherd, a civil, cautious lawyer, who, wh..."
3,"""I must take leave to observe, Sir Walter,"" sa..."
4,"He was not Mr Wentworth, the former curate of ..."
5,On the morning appointed for Admiral and Mrs C...
6,"Anne had not wanted this visit to Uppercross, ..."
7,"A very few days more, and Captain Wentworth wa..."
8,From this time Captain Wentworth and Anne Elli...
9,Captain Wentworth was come to Kellynch as to a...
10,Other opportunities of making her observations...


## Test 2

In [33]:
# Walk the OHCO levels in config order: a milestone level restarts from the
# clipped source lines, while every other level splits the previous result.
r = pd.DataFrame()
for i, level in enumerate(config['ohco']):
    print(i, level)
    r = (group_by_milestone(source, i)
         if config['ohco'][level]['type'] == 'milestone'
         else split_by_delimitter(r, i))
    print()

0 chapter
OHCO = ['chapter', 'paragraph', 'sentence', 'token']
Chunking source by CHAPTER using column `line_str`
CHAPTER pattern = /^\s*(chapter|letter)\s+(\d+)/, with case = False

1 paragraph
OHCO = ['chapter', 'paragraph', 'sentence', 'token']
Splitting source by PARAGRAPH using column `chapter_str`
PARAGRAPH pattern = /\n\n+/, with case = False

2 sentence
OHCO = ['chapter', 'paragraph', 'sentence', 'token']
Splitting source by SENTENCE using column `paragraph_str`
SENTENCE pattern = /[.?!;:]+/, with case = False

3 token
OHCO = ['chapter', 'paragraph', 'sentence', 'token']
Splitting source by TOKEN using column `sentence_str`
TOKEN pattern = /[\s',-]+/, with case = False



In [40]:
r

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,token_str
chapter_id,paragraph_num,sentence_num,token_num,Unnamed: 4_level_1
1,0,0,0,Sir
1,0,0,1,Walter
1,0,0,2,Elliot
1,0,0,3,of
1,0,0,4,Kellynch
...,...,...,...,...
24,11,6,34,in
24,11,6,35,its
24,11,6,36,national
24,11,6,37,importance


In [39]:
gather_tokens(r, 2)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,doc_str
chapter_id,paragraph_num,sentence_num,Unnamed: 3_level_1
1,0,0,Sir Walter Elliot of Kellynch Hall in Somerset...
1,0,1,there he found occupation for an idle hour and...
1,0,2,there his faculties were roused into admiratio...
1,0,3,there any unwelcome sensations arising from do...
1,0,4,and there if every other leaf were powerless h...
...,...,...,...
24,11,3,Her spring of felicity was in the glow of her ...
24,11,4,Anne was tenderness itself and she had the ful...
24,11,5,His profession was all that could ever make he...
24,11,6,She gloried in being a sailor s wife but she m...
