In [1]:
import re
import string

from collections import Counter

In [2]:
def process(line:str, corpus:list):
    if len(line)==1:
        return
    line = line.replace('\n', '')
    line = line.replace('’', "'")
    corpus.append(line)

corpus = []

with open('natural_lang_data.txt', encoding='UTF-8') as f:
    for line in f:
        process(line, corpus)


## Basic splitting

In [3]:
# Remove all punctuation with replace function
non_punctiated_corpus = []

for line in corpus:
    non_punc_line = line.replace(string.punctuation, line)
    non_punctiated_corpus.append(non_punc_line)

basic_split_corpus = []

for line in non_punctiated_corpus:
    split_line = line.split(" ")
    basic_split_corpus.append(split_line)
    
basic_split_corpus
# This splits Chomsky's into Chomskys

[['The',
  'history',
  'of',
  'machine',
  'translation',
  'dates',
  'back',
  'to',
  'the',
  'seventeenth',
  'century,',
  'when',
  'philosophers',
  'such',
  'as',
  'Leibniz',
  'and',
  'Descartes',
  'put',
  'forward',
  'proposals',
  'for',
  'codes',
  'which',
  'would',
  'relate',
  'words',
  'between',
  'languages.',
  'All',
  'of',
  'these',
  'proposals',
  'remained',
  'theoretical,',
  'and',
  'none',
  'resulted',
  'in',
  'the',
  'development',
  'of',
  'an',
  'actual',
  'machine.'],
 ['The',
  'first',
  'patents',
  'for',
  '"translating',
  'machines"',
  'were',
  'applied',
  'for',
  'in',
  'the',
  'mid-1930s.',
  'One',
  'proposal,',
  'by',
  'Georges',
  'Artsrouni',
  'was',
  'simply',
  'an',
  'automatic',
  'bilingual',
  'dictionary',
  'using',
  'paper',
  'tape.',
  'The',
  'other',
  'proposal,',
  'by',
  'Peter',
  'Troyanskii,',
  'a',
  'Russian,',
  'was',
  'more',
  'detailed.',
  'It',
  'included',
  'both',
  'the

In [4]:
# Define as a function
def basic_split(line:str):
    non_punc_line = line.replace(string.punctuation, line)
    return non_punc_line.split(" ")

## Basic Regex splitting

In [5]:
word_regex = r"\W+"

split_corpus = []

for line in corpus:
    split_line = re.split(word_regex, line)
    split_corpus.append(split_line)
    
split_corpus
# This splits Chomsky's into Chomsky and s

[['The',
  'history',
  'of',
  'machine',
  'translation',
  'dates',
  'back',
  'to',
  'the',
  'seventeenth',
  'century',
  'when',
  'philosophers',
  'such',
  'as',
  'Leibniz',
  'and',
  'Descartes',
  'put',
  'forward',
  'proposals',
  'for',
  'codes',
  'which',
  'would',
  'relate',
  'words',
  'between',
  'languages',
  'All',
  'of',
  'these',
  'proposals',
  'remained',
  'theoretical',
  'and',
  'none',
  'resulted',
  'in',
  'the',
  'development',
  'of',
  'an',
  'actual',
  'machine',
  ''],
 ['The',
  'first',
  'patents',
  'for',
  'translating',
  'machines',
  'were',
  'applied',
  'for',
  'in',
  'the',
  'mid',
  '1930s',
  'One',
  'proposal',
  'by',
  'Georges',
  'Artsrouni',
  'was',
  'simply',
  'an',
  'automatic',
  'bilingual',
  'dictionary',
  'using',
  'paper',
  'tape',
  'The',
  'other',
  'proposal',
  'by',
  'Peter',
  'Troyanskii',
  'a',
  'Russian',
  'was',
  'more',
  'detailed',
  'It',
  'included',
  'both',
  'the',

In [6]:
# define as a function 

def regex_spliting(line:str):
    
    word_regex = r"\W+"
    
    return re.split(word_regex, line)

## More complex Regex splitting

In [7]:
improved_word_regex = r"(\w[\w']*\w|\w)"

improved_split_corpus = []

for line in corpus:
    word_matcher = re.compile(improved_word_regex)
    split_line = word_matcher.findall(line)
    improved_split_corpus.append(split_line)
    
improved_split_corpus

[['The',
  'history',
  'of',
  'machine',
  'translation',
  'dates',
  'back',
  'to',
  'the',
  'seventeenth',
  'century',
  'when',
  'philosophers',
  'such',
  'as',
  'Leibniz',
  'and',
  'Descartes',
  'put',
  'forward',
  'proposals',
  'for',
  'codes',
  'which',
  'would',
  'relate',
  'words',
  'between',
  'languages',
  'All',
  'of',
  'these',
  'proposals',
  'remained',
  'theoretical',
  'and',
  'none',
  'resulted',
  'in',
  'the',
  'development',
  'of',
  'an',
  'actual',
  'machine'],
 ['The',
  'first',
  'patents',
  'for',
  'translating',
  'machines',
  'were',
  'applied',
  'for',
  'in',
  'the',
  'mid',
  '1930s',
  'One',
  'proposal',
  'by',
  'Georges',
  'Artsrouni',
  'was',
  'simply',
  'an',
  'automatic',
  'bilingual',
  'dictionary',
  'using',
  'paper',
  'tape',
  'The',
  'other',
  'proposal',
  'by',
  'Peter',
  'Troyanskii',
  'a',
  'Russian',
  'was',
  'more',
  'detailed',
  'It',
  'included',
  'both',
  'the',
  'bi

In [8]:
# define improved regex splitting
def improved_regex_splitting(line:str):
    # Start with a letter, then a possible combination of letters 
    # and apostrophes but that finish with a letter, or single letters
    improved_word_regex = r"(\w[\w']*\w|\w)"

    word_matcher = re.compile(improved_word_regex)
    
    return word_matcher.findall(line)

In [9]:
def process_and_split(line:str, split_fn=None) -> list:
    if len(line)==1:
        return
    # New lines used and single speach marks instead of apostrophes
    # Lower case also needed
    line = line.replace('\n', '').replace('’', "'").lower()
    if split_fn:
        line = split_fn(line)
    return line

In [10]:
processed_corpus=[]
with open('natural_lang_data.txt', encoding='UTF-8') as f:
    for line in f:
        line = process_and_split(line, improved_regex_splitting)
        if line:
            processed_corpus.extend(line)

processed_corpus

['the',
 'history',
 'of',
 'machine',
 'translation',
 'dates',
 'back',
 'to',
 'the',
 'seventeenth',
 'century',
 'when',
 'philosophers',
 'such',
 'as',
 'leibniz',
 'and',
 'descartes',
 'put',
 'forward',
 'proposals',
 'for',
 'codes',
 'which',
 'would',
 'relate',
 'words',
 'between',
 'languages',
 'all',
 'of',
 'these',
 'proposals',
 'remained',
 'theoretical',
 'and',
 'none',
 'resulted',
 'in',
 'the',
 'development',
 'of',
 'an',
 'actual',
 'machine',
 'the',
 'first',
 'patents',
 'for',
 'translating',
 'machines',
 'were',
 'applied',
 'for',
 'in',
 'the',
 'mid',
 '1930s',
 'one',
 'proposal',
 'by',
 'georges',
 'artsrouni',
 'was',
 'simply',
 'an',
 'automatic',
 'bilingual',
 'dictionary',
 'using',
 'paper',
 'tape',
 'the',
 'other',
 'proposal',
 'by',
 'peter',
 'troyanskii',
 'a',
 'russian',
 'was',
 'more',
 'detailed',
 'it',
 'included',
 'both',
 'the',
 'bilingual',
 'dictionary',
 'and',
 'a',
 'method',
 'for',
 'dealing',
 'with',
 'grammati

In [11]:
word_counts = Counter(processed_corpus)
word_counts

Counter({'the': 48,
         'of': 37,
         'a': 21,
         'and': 17,
         'in': 17,
         'to': 16,
         'for': 13,
         'machine': 10,
         'systems': 10,
         'was': 9,
         'translation': 8,
         'which': 8,
         'were': 8,
         'more': 8,
         'on': 8,
         'that': 8,
         'data': 8,
         'as': 7,
         'such': 6,
         'by': 6,
         'with': 6,
         'real': 6,
         'research': 6,
         'language': 6,
         'learning': 6,
         'is': 5,
         'this': 5,
         'into': 5,
         'however': 5,
         'input': 5,
         'models': 5,
         'when': 4,
         'these': 4,
         'an': 4,
         'based': 4,
         'written': 4,
         'statistical': 4,
         'developed': 4,
         'used': 4,
         'rules': 4,
         'many': 4,
         'are': 4,
         'algorithms': 4,
         'has': 4,
         'annotated': 4,
         'between': 3,
         'languages': 3,
       

In [20]:
# Define some stop words
stop_words = {
    'ourselves', 'hers', 'between', 'yourself', 'but', 'again',
    'there', 'about', 'once', 'during', 'out', 'very', 'having', 'with',
    'they', 'own', 'an', 'be', 'some', 'for', 'do', 'its', 'yours', 'such',
    'into', 'of', 'most', 'itself', 'other', 'off', 'is', 's', 'am', 'or',
    'who', 'as', 'from', 'him', 'each', 'the', 'themselves', 'until', 'below',
    'are', 'we','these', 'your', 'his', 'through', 'don', 'nor', 'me', 'were',
    'her', 'more', 'himself', 'this', 'down', 'should', 'our', 'their',
    'while', 'above', 'both', 'up', 'to', 'ours', 'had', 'she', 'all', 'no',
    'when', 'at', 'any','before', 'them', 'same', 'and', 'been', 'have', 'in',
    'will', 'on', 'does', 'yourselves', 'then', 'that', 'because', 'what',
    'over', 'why', 'so', 'can', 'did', 'not', 'now', 'under', 'he', 'you',
    'herself', 'has', 'just', 'where', 'too', 'only', 'myself', 'which',
    'those', 'i', 'after', 'few', 'whom', 't','being', 'if', 'theirs', 'my',
    'against', 'a', 'by', 'doing', 'it', 'how','further', 'was', 'here', 'than'
    }

# find least common elements
uncommon_words = word_counts.most_common()[:-10:-1]
# uncommon_word_counts = list(
#     {key:count for key, count in word_counts.items() if count<2})

processed_corpus = [w for w in processed_corpus if w not in stop_words]
processed_corpus = [w for w in processed_corpus if w not in uncommon_words]
processed_corpus

['history',
 'machine',
 'translation',
 'dates',
 'back',
 'seventeenth',
 'century',
 'philosophers',
 'leibniz',
 'descartes',
 'put',
 'forward',
 'proposals',
 'codes',
 'would',
 'relate',
 'words',
 'languages',
 'proposals',
 'remained',
 'theoretical',
 'none',
 'resulted',
 'development',
 'actual',
 'machine',
 'first',
 'patents',
 'translating',
 'machines',
 'applied',
 'mid',
 '1930s',
 'one',
 'proposal',
 'georges',
 'artsrouni',
 'simply',
 'automatic',
 'bilingual',
 'dictionary',
 'using',
 'paper',
 'tape',
 'proposal',
 'peter',
 'troyanskii',
 'russian',
 'detailed',
 'included',
 'bilingual',
 'dictionary',
 'method',
 'dealing',
 'grammatical',
 'roles',
 'languages',
 'based',
 'esperanto',
 '1950',
 'alan',
 'turing',
 'published',
 'famous',
 'article',
 'computing',
 'machinery',
 'intelligence',
 'proposed',
 'called',
 'turing',
 'test',
 'criterion',
 'intelligence',
 'criterion',
 'depends',
 'ability',
 'computer',
 'program',
 'impersonate',
 'human',

In [None]:
def remove_uncommon_and_stop_words(
    corpus: list[str],
    uncommon_amount: int|float,
    stop_words: list[str]
    ) -> list[str]:
    '''
    Removes uncommon words and stop words from a corpus
    
    Params:
    corpus: A list of strings to be processed
    uncommon_amount: If 1 or above will be a number of words to
        be removed. If below 1, then will be a proportion to be
        removed.
    stop_words: A list of common words without meaning to remove
    
    Returns a list of words from the corpus
    '''
    # Find least common elements
    if uncommon_amount > 1:
        uncommon_words = word_counts.most_common()[:-(uncommon_amount+1):-1]
    else:
        removal_amount = int(uncommon_amount*len(corpus))
        uncommon_words = word_counts.most_common()[:-(removal_amount+1):-1]

    reduced_corpus = [w for w in corpus if w not in stop_words]
    return [w for w in reduced_corpus if w not in uncommon_words]