-
Notifications
You must be signed in to change notification settings - Fork 1
/
tokenizer.py
43 lines (41 loc) · 1.76 KB
/
tokenizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
# File description:
#
# 0. Get long string from importer
# 1. Split long string into multiple strings based on
# the newline character
# 2. Tokenize the words from the array of strings
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
def tokenize(imported_string):
    """Split a multi-line string into per-line lists of word tokens.

    The input is normalized (typographic apostrophe -> ASCII apostrophe,
    lowercased), split on newlines, and each line is tokenized with a
    regex that matches, most-specific first: URLs, email addresses,
    words (optionally with an internal apostrophe), phone numbers, and
    numbers of at least two digits. English stopwords are removed.

    Parameters:
        imported_string: the raw text to tokenize (newline-separated lines).

    Returns:
        A list with one entry per non-empty line; each entry is the list
        of that line's tokens after stopword removal. Lines whose tokens
        are all stopwords (or that produced no tokens) are omitted.
    """
    # Normalize the typographic apostrophe so contractions like "don’t"
    # match the word-with-apostrophe pattern below.
    imported_string = imported_string.translate(str.maketrans("’", "'"))
    imported_string = imported_string.lower()
    string_array = imported_string.split("\n")
    # Raw strings are required: "\s", "\d", "\D", "\." are invalid escape
    # sequences in plain literals (SyntaxWarning since Python 3.12, a
    # future error). The raw form passes identical characters to re.
    # Alternatives are ordered most-specific first so e.g. an email is
    # not split apart by the general word pattern.
    regexp = (
        r"(?:https?|ftp)://[^\s/$.?#].[^\s]*|"          # URL
        r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9]+|"  # email
        r"[a-z]+(?:'[a-z]+)?|"                           # word (+apostrophe)
        r"\d{3}\D*\d{3}\D*\d{4}(?:\D*\d+)?|"             # phone number
        r"[0-9]{2,}"                                     # number, 2+ digits
    )
    tokenizer = RegexpTokenizer(regexp)
    stop_words = set(stopwords.words('english'))
    lines = []
    for line in string_array:
        # Keep only non-stopword tokens; drop lines that end up empty.
        real_line = [word for word in tokenizer.tokenize(line)
                     if word not in stop_words]
        if real_line:
            lines.append(real_line)
    return lines