In [1]:
import re
import string

from collections import Counter

In [2]:
def process(line:str, corpus:list):
    if len(line)==1:
        return
    line = line.replace('\n', '')
    line = line.replace('’', "'")
    corpus.append(line)

corpus = []

with open('natural_lang_data.txt', encoding='UTF-8') as f:
    for line in f:
        process(line, corpus)


## Basic splitting

In [3]:
# Remove all punctuation with replace function
non_punctiated_corpus = []

for line in corpus:
    non_punc_line = line.replace(string.punctuation, line)
    non_punctiated_corpus.append(non_punc_line)

basic_split_corpus = []

for line in non_punctiated_corpus:
    split_line = line.split(" ")
    basic_split_corpus.append(split_line)
    
basic_split_corpus
# This splits Chomsky's into Chomskys

[['The',
  'history',
  'of',
  'machine',
  'translation',
  'dates',
  'back',
  'to',
  'the',
  'seventeenth',
  'century,',
  'when',
  'philosophers',
  'such',
  'as',
  'Leibniz',
  'and',
  'Descartes',
  'put',
  'forward',
  'proposals',
  'for',
  'codes',
  'which',
  'would',
  'relate',
  'words',
  'between',
  'languages.',
  'All',
  'of',
  'these',
  'proposals',
  'remained',
  'theoretical,',
  'and',
  'none',
  'resulted',
  'in',
  'the',
  'development',
  'of',
  'an',
  'actual',
  'machine.'],
 ['The',
  'first',
  'patents',
  'for',
  '"translating',
  'machines"',
  'were',
  'applied',
  'for',
  'in',
  'the',
  'mid-1930s.',
  'One',
  'proposal,',
  'by',
  'Georges',
  'Artsrouni',
  'was',
  'simply',
  'an',
  'automatic',
  'bilingual',
  'dictionary',
  'using',
  'paper',
  'tape.',
  'The',
  'other',
  'proposal,',
  'by',
  'Peter',
  'Troyanskii,',
  'a',
  'Russian,',
  'was',
  'more',
  'detailed.',
  'It',
  'included',
  'both',
  'the

In [4]:
# Define as a function
def basic_split(line:str):
    non_punc_line = line.replace(string.punctuation, line)
    return non_punc_line.split(" ")

## Basic Regex splitting

In [5]:
word_regex = r"\W+"

split_corpus = []

for line in corpus:
    split_line = re.split(word_regex, line)
    split_corpus.append(split_line)
    
split_corpus
# This splits Chomsky's into Chomsky and s

[['The',
  'history',
  'of',
  'machine',
  'translation',
  'dates',
  'back',
  'to',
  'the',
  'seventeenth',
  'century',
  'when',
  'philosophers',
  'such',
  'as',
  'Leibniz',
  'and',
  'Descartes',
  'put',
  'forward',
  'proposals',
  'for',
  'codes',
  'which',
  'would',
  'relate',
  'words',
  'between',
  'languages',
  'All',
  'of',
  'these',
  'proposals',
  'remained',
  'theoretical',
  'and',
  'none',
  'resulted',
  'in',
  'the',
  'development',
  'of',
  'an',
  'actual',
  'machine',
  ''],
 ['The',
  'first',
  'patents',
  'for',
  'translating',
  'machines',
  'were',
  'applied',
  'for',
  'in',
  'the',
  'mid',
  '1930s',
  'One',
  'proposal',
  'by',
  'Georges',
  'Artsrouni',
  'was',
  'simply',
  'an',
  'automatic',
  'bilingual',
  'dictionary',
  'using',
  'paper',
  'tape',
  'The',
  'other',
  'proposal',
  'by',
  'Peter',
  'Troyanskii',
  'a',
  'Russian',
  'was',
  'more',
  'detailed',
  'It',
  'included',
  'both',
  'the',

In [6]:
# define as a function 

def regex_spliting(line:str):
    
    word_regex = r"\W+"
    
    return re.split(word_regex, line)

## More complex Regex splitting

In [7]:
improved_word_regex = r"(\w[\w']*\w|\w)"

improved_split_corpus = []

for line in corpus:
    word_matcher = re.compile(improved_word_regex)
    split_line = word_matcher.findall(line)
    improved_split_corpus.append(split_line)
    
improved_split_corpus

[['The',
  'history',
  'of',
  'machine',
  'translation',
  'dates',
  'back',
  'to',
  'the',
  'seventeenth',
  'century',
  'when',
  'philosophers',
  'such',
  'as',
  'Leibniz',
  'and',
  'Descartes',
  'put',
  'forward',
  'proposals',
  'for',
  'codes',
  'which',
  'would',
  'relate',
  'words',
  'between',
  'languages',
  'All',
  'of',
  'these',
  'proposals',
  'remained',
  'theoretical',
  'and',
  'none',
  'resulted',
  'in',
  'the',
  'development',
  'of',
  'an',
  'actual',
  'machine'],
 ['The',
  'first',
  'patents',
  'for',
  'translating',
  'machines',
  'were',
  'applied',
  'for',
  'in',
  'the',
  'mid',
  '1930s',
  'One',
  'proposal',
  'by',
  'Georges',
  'Artsrouni',
  'was',
  'simply',
  'an',
  'automatic',
  'bilingual',
  'dictionary',
  'using',
  'paper',
  'tape',
  'The',
  'other',
  'proposal',
  'by',
  'Peter',
  'Troyanskii',
  'a',
  'Russian',
  'was',
  'more',
  'detailed',
  'It',
  'included',
  'both',
  'the',
  'bi

In [8]:
# define improved regex splitting
def improved_regex_splitting(line:str):
    improved_word_regex = r"(\w[\w']*\w|\w)"

    word_matcher = re.compile(improved_word_regex)
    
    return word_matcher.findall(line)

In [9]:
def process_and_split(line:str, split_fn=None) -> list:
    if len(line)==1:
        return
    line = line.replace('\n', '').replace('’', "'")
    if split_fn:
        line = split_fn(line)
    return line

In [12]:
corpus=[]
with open('natural_lang_data.txt', encoding='UTF-8') as f:
    for line in f:
        line = process_and_split(line, improved_regex_splitting)
        if line:
            corpus.append(line)

corpus

[['The',
  'history',
  'of',
  'machine',
  'translation',
  'dates',
  'back',
  'to',
  'the',
  'seventeenth',
  'century',
  'when',
  'philosophers',
  'such',
  'as',
  'Leibniz',
  'and',
  'Descartes',
  'put',
  'forward',
  'proposals',
  'for',
  'codes',
  'which',
  'would',
  'relate',
  'words',
  'between',
  'languages',
  'All',
  'of',
  'these',
  'proposals',
  'remained',
  'theoretical',
  'and',
  'none',
  'resulted',
  'in',
  'the',
  'development',
  'of',
  'an',
  'actual',
  'machine'],
 ['The',
  'first',
  'patents',
  'for',
  'translating',
  'machines',
  'were',
  'applied',
  'for',
  'in',
  'the',
  'mid',
  '1930s',
  'One',
  'proposal',
  'by',
  'Georges',
  'Artsrouni',
  'was',
  'simply',
  'an',
  'automatic',
  'bilingual',
  'dictionary',
  'using',
  'paper',
  'tape',
  'The',
  'other',
  'proposal',
  'by',
  'Peter',
  'Troyanskii',
  'a',
  'Russian',
  'was',
  'more',
  'detailed',
  'It',
  'included',
  'both',
  'the',
  'bi