# 1. Handling text in python

In [2]:
text1 = "Ethics are built right into the ideals and objectives of the United Nations "
len(text1)  

76

In [3]:
text2 = text1.split(' ')
text2

['Ethics',
 'are',
 'built',
 'right',
 'into',
 'the',
 'ideals',
 'and',
 'objectives',
 'of',
 'the',
 'United',
 'Nations',
 '']

In [4]:
len(text2)

14

## Find specific words

In [5]:
# long words: Words that are more than 3 letters long
[w for w in text2 if len(w) > 3]

['Ethics',
 'built',
 'right',
 'into',
 'ideals',
 'objectives',
 'United',
 'Nations']

In [6]:
# Find capitalized words
[w for w in text2 if w.istitle()]

['Ethics', 'United', 'Nations']

In [7]:
# words that end with s
[w for w in text2 if w.endswith('s')]

['Ethics', 'ideals', 'objectives', 'Nations']

## Find unique words: using set()

In [8]:
text3 = 'To be or not to be'
text4 = text3.split()
len(text4)

6

In [9]:
len(set(text4))

5

In [10]:
set(text4)

{'To', 'be', 'not', 'or', 'to'}

In [11]:
len(set([w.lower() for w in text4]))

4

In [12]:
set([w.lower() for w in text4])

{'be', 'not', 'or', 'to'}

## some word comparison functions

* s.startwith(t)
* s.endswith(t)
* t in s
* s.isupper(); s.lower(); s.istitle()
* s.isalpha(); s.isdigit(); s.isalnum()

## String Operations

* s.lower(); s.supper(); s.titlecase()
* s.split()
* s.splitlines()
* s.join(t)
* s.strip(); s.rstrip()
* s.find(t); s.rfind(t)
* s.replace(u,v)

## From words to characters

In [13]:
text5 = "ouagadougou"
text6 = text5.split('ou')
text6

['', 'agad', 'g', '']

In [14]:
'ou'.join(text6)

'ouagadougou'

In [15]:
text5.split()

['ouagadougou']

In [17]:
text5.split('') # we cannot split based on empty separator

ValueError: empty separator

In [18]:
list(text5) # this is the right way to split words into characters

['o', 'u', 'a', 'g', 'a', 'd', 'o', 'u', 'g', 'o', 'u']

## Cleaning text

In [19]:
text8 = '  	 A quick brown fox jumped over the lazy dog. '
text8.split(' ')

['',
 '',
 '\t',
 'A',
 'quick',
 'brown',
 'fox',
 'jumped',
 'over',
 'the',
 'lazy',
 'dog.',
 '']

In [20]:
text9 = text8.strip() # strip all non-needed white spaces
text9.split(' ') 

['A', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy', 'dog.']

## Chaning text

In [21]:
# Find and replace
text9

'A quick brown fox jumped over the lazy dog.'

In [22]:
text9.find('o')

10

In [23]:
text9.rfind('o')

40

In [24]:
text9.replace('o', 'O')

'A quick brOwn fOx jumped Over the lazy dOg.'

## Reading files line by line

In [25]:
f = open('text1.txt', 'r')
f.readline()

'We first compare our approach against recent methods\n'

## Read the full file

In [26]:
f.seek(0)
text12 = f.read()
len(text12)

575

In [27]:
text13 = text12.splitlines()
text13

['We first compare our approach against recent methods',
 'for unpaired image-to-image translation on paired datasets',
 'where ground truth input-output pairs are available for evaluation.',
 'We then study the importance of both the adversarial',
 'loss and the cycle consistency loss and compare our full',
 'method against several variants. Finally, we demonstrate',
 'the generality of our algorithm on a wide range of applications',
 'where paired data does not exist. For brevity, we refer',
 'to our method as CycleGAN. The PyTorch and Torch code,',
 'models, and full results can be found at our website.']

In [28]:
len(text13)

10

In [29]:
text13[0]

'We first compare our approach against recent methods'

## File operations

* f = open(filename, mode)
* f.readline(); f.read(); f.read(n)
* for line in f: doSomething(line)
* f.seek(n)
* f.write(message)
* f.close()

## Issues with reading text files 

In [30]:
# remove the last newline character
f = open("text1.txt", 'r')
text14 = f.readline()
text14

'We first compare our approach against recent methods\n'

## how do you remove the last newline character

In [31]:
text14.rstrip() # we strip white spaces from the right side

'We first compare our approach against recent methods'

In [32]:
text14.strip() # we strip the white spaces starting from the left side

'We first compare our approach against recent methods'

# 2. Regular Expressions

## Processing free-text

In [33]:
# Find words with hashtags
tweet = "@nltk Text analysis is awesome! #regex #pandas #python"
print([word for word in tweet.split() if word.startswith('#')])

['#regex', '#pandas', '#python']


In [35]:
# find callouts
[w for w in tweet if w.startswith('@')]

['@']

## Callouts are more than just tokens beginning with '@'
@utoronto.ca @gmail.com 

Match something after '@'
- Alphabets
- Numbers
- Specoal symbols like '_'

@[A-Za-z0-9]+ 

In [40]:
text10 = '"Ethics are built right into the ideals and objectives of the united \
Nations" #UNSG @ NY Society for Ethical Culture bit.ly/2guVelr @UN @UN_Women'
text11 = text10.split()
text11

['"Ethics',
 'are',
 'built',
 'right',
 'into',
 'the',
 'ideals',
 'and',
 'objectives',
 'of',
 'the',
 'united',
 'Nations"',
 '#UNSG',
 '@',
 'NY',
 'Society',
 'for',
 'Ethical',
 'Culture',
 'bit.ly/2guVelr',
 '@UN',
 '@UN_Women']

In [42]:
[w for w in text11 if w.startswith('@')]

['@', '@UN', '@UN_Women']

## Import regular expression first

@[A-Za-z0-9]+ 

@: starts with @ \
[A-Za-z0-9]: followed by any alphabet(upper or lower case), digit, or undersocre \
+: repeats at least onece, but any number of times

In [44]:
import re
[w for w in text11 if re.search('@[A-Za-z0-9_]+', w)]

['@UN', '@UN_Women']

## Meta-characters: Character matches

. : wildcard, matches a single character \
^ : start of a string [^abc] match none of abc\
$ : end of a string \
[]: matches one of the set of characters within [] \
[a-z] : matches one of the range of character a,b,...,z \ 
[^abc] : matches a character that is not a,b,or,c \
a|b : matches either a or b, where a and b are strings \
() : Scoping for operators \
\ : Escape character for sepcial characters(\t, \n, \b)

## Meta-characters: Character symbols
\b : Matches word bounday \
\d : Any digit, equivalent to [0-9] \
\D : Any non-digit, equivalent to [^0-9] \
\s : Any whitespace, equivalent to [ \t\n\r\f\v] \
\S : Any non-whitespace, equivalent to [^ \t\n\r\f\v] \
\w : Alphabumeric character, equivalent to [a-zA-Z0-9_] \
\W : Non-alphanumeric, equivalent to [^a-zA-Z0-9_]


## Meta-characters: Repetitions

there is not quote, but in juputer notebook, have to add quote\
'*' : matches zero or more occurences \ 
'+' : matches one or more occurences \
? : matches zero or one occurences \
{n} : exactly n repetitions, n >= 0 \
{n,} : at least n repetions \
{,n} : at most n repetions \
{m,n} : at least m and at most n repetions\



## Find specific characters

In [53]:
text12 = "ouagadougou"
re.findall(r'[aeiou]', text12) # find all characters that match at least one of the hcaracters in the brackets
# find all: find and list all of them

['o', 'u', 'a', 'a', 'o', 'u', 'o', 'u']

In [55]:
re.findall(r'[^aeiou]', text12) # find all characters that match none of aeiou

['g', 'd', 'g']

## Case study: Regular expression for Dates

## Date variations for 23rd December 2021
23-12-2021 \
23/12/2021 \
23/12/21\
12/23/2021 \ 
23 Dec 2021 \
23 December 2021 \ 
Dec 23, 2021 

In [176]:
dateStr = '23-12-2021\n23/12/2021\n23/12/21\n12/23/2021\n23 Dec 2021\n23 December 2021\n Dec 23, 2021\n Mar-20-2009\n Mar 20,2009\n06 May 1972\nJan 24 1986\n'
dateStr

'23-12-2021\n23/12/2021\n23/12/21\n12/23/2021\n23 Dec 2021\n23 December 2021\n Dec 23, 2021\n Mar-20-2009\n Mar 20,2009\n06 May 1972\nJan 24 1986\n'

In [160]:
re.findall(r'(?:Jan|Feb|Mar|Apr|Jun|Jul|Aug|Sep|Oct|Nov|Dec)-\d{2}-\d{4}',dateStr)

['Mar-20-2009']

In [181]:
re.findall(r'(?:\d{2} )?(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* (?:\d{2},? )?\d{4}', dateStr)

['23 Dec 2021',
 '23 December 2021',
 'Dec 23, 2021',
 '06 May 1972',
 'Jan 24 1986']

In [59]:
re.findall(r'\d{2}[/-]\d{2}[/-]\d{4}', dateStr)

['23-12-2021', '23/12/2021', '12/23/2021']

In [60]:
re.findall(r'\d{2}[/-]\d{2}[/-]\d{2,4}', dateStr)

['23-12-2021', '23/12/2021', '23/12/21', '12/23/2021']

In [61]:
re.findall(r'\d{1,2}[/-]\d{1,2}[/-]\d{2,4}', dateStr)

['23-12-2021', '23/12/2021', '23/12/21', '12/23/2021']

In [63]:
re.findall(r'\d{2} (Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) \d{4}', dateStr)

['Dec']

In [64]:
re.findall(r'\d{2} (?:Jan|Feb|Mar|Apr|Jun|Jul|Aug|Sep|Oct|Nov|Dec) \d{4}', dateStr)

['23 Dec 2021']

In [170]:
re.findall(r'(?:\d{2} )?(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* (?:\d{2}, )?\d{4}', dateStr)

['23 Dec 2021', '23 December 2021', 'Dec 23, 2021', '06 May 1972']

In [166]:
t = "new line s The patient is a 44 year old married Caucasian woman, unemployed Decorator, living with husband and caring for two young children, who is referred by Capitol Hill Hospital PCP, Dr. Heather Zubia, for urgent evaluation/treatment till first visit with Dr. Toney Winkler IN EIGHT WEEKS on 24 Jan 2001."
re.findall(r'(?:\d{2} )?(?:Jan|Feb|Mar|Apr|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* (?:\d{2}, )?\d{4}', t)

['24 Jan 2001']

In [171]:
t = "new line 06 May 1972 SOS-10 Total Score:"
re.findall(r'(?:\d{2} )?(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* (?:\d{2}, )?\d{4}', t)

['06 May 1972']

In [172]:
t = "new line none; but currently has appt with new HJH PCP Rachel Salas, MD on October. 11, 2013 Other Agency Involvement: No"
re.findall(r'(?:\d{2} )?(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*[.] (?:\d{2}, )?\d{4}', t)

['October. 11, 2013']

In [175]:
t = "new line .Came back to US on Jan 24 1986, saw Dr. Quackenbush at Beaufort Memorial Hospital.  Checked VPA level and found it to be therapeutic and confirmed BPAD dx.  Also, has a general physician exam and found to be in good general health, except for being slightly overwt."
re.findall(r'(?:\d{2} )?(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* (?:\d{2}, )?\d{4}', t)

[]

In [186]:
t = "new line 6/1998 Primary Care Doctor:"
re.findall(r'\d{1,2}[/-]\d{4}', t)

['6/1998']

# 3. Working with Text Data in pandas

In [81]:
import pandas as pd

time_sentences = ["Monday: The doctor's appointment is at 2:45pm.",
                 "Tuesday: The dentist's appointment is at 11:30 am.",
                 "Wedneday: At 7:00pm, there is a basktball game!", 
                 "Thursday: Be back home by 11:15 pm at the latest.", 
                 "Friday: Take the train at 08:10 am, arrive at 9:00 am."]
df = pd.DataFrame(time_sentences, columns=['text'])
df

Unnamed: 0,text
0,Monday: The doctor's appointment is at 2:45pm.
1,Tuesday: The dentist's appointment is at 11:30...
2,"Wedneday: At 7:00pm, there is a basktball game!"
3,Thursday: Be back home by 11:15 pm at the latest.
4,"Friday: Take the train at 08:10 am, arrive at ..."


In [71]:
df['text'].str.len()

0    46
1    50
2    47
3    49
4    54
Name: text, dtype: int64

In [74]:
df['text'].str.split().str.len()

0     7
1     8
2     8
3    10
4    11
Name: text, dtype: int64

In [75]:
df['text'].str.contains('appointment')

0     True
1     True
2    False
3    False
4    False
Name: text, dtype: bool

In [76]:
df['text'].str.count(r'\d') #count the number of digits in the dataframe

0    3
1    4
2    3
3    4
4    7
Name: text, dtype: int64

In [77]:
df['text'].str.findall(r'(\d?\d):(\d\d)') 

0              [(2, 45)]
1             [(11, 30)]
2              [(7, 00)]
3             [(11, 15)]
4    [(08, 10), (9, 00)]
Name: text, dtype: object

In [82]:
df['text'].str.replace(r'\w+day\b', '???')

  df['text'].str.replace(r'\w+day\b', '???')


0          ???: The doctor's appointment is at 2:45pm.
1       ???: The dentist's appointment is at 11:30 am.
2           ???: At 7:00pm, there is a basktball game!
3         ???: Be back home by 11:15 pm at the latest.
4    ???: Take the train at 08:10 am, arrive at 9:0...
Name: text, dtype: object

In [88]:
df['text'].str.replace(r'(\w+day\b)', lambda x: x.groups()[0][:3])
# This method returns a tuple containing all the subgroups of the match
# groups[0] is the first subgroup of the regular expression match

  df['text'].str.replace(r'(\w+day\b)', lambda x: x.groups()[0][:3])


0          Mon: The doctor's appointment is at 2:45pm.
1       Tue: The dentist's appointment is at 11:30 am.
2           Wed: At 7:00pm, there is a basktball game!
3         Thu: Be back home by 11:15 pm at the latest.
4    Fri: Take the train at 08:10 am, arrive at 9:0...
Name: text, dtype: object

In [89]:
df['text'].str.extract(r'(\d?\d):(\d\d)')

Unnamed: 0,0,1
0,2,45
1,11,30
2,7,0
3,11,15
4,8,10


In [105]:
df['text'].str.extractall(r'((\d?\d):(\d\d) ?([ap]m))')

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3
Unnamed: 0_level_1,match,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,2:45pm,2,45,pm
1,0,11:30 am,11,30,am
2,0,7:00pm,7,0,pm
3,0,11:15 pm,11,15,pm
4,0,08:10 am,8,10,am
4,1,9:00 am,9,0,am


In [107]:
df['text'].str.extractall(r'(?P<time>(?P<hour>\d?\d):(?P<minute>\d\d) ?(?P<period>[ap]m))')

Unnamed: 0_level_0,Unnamed: 1_level_0,time,hour,minute,period
Unnamed: 0_level_1,match,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,2:45pm,2,45,pm
1,0,11:30 am,11,30,am
2,0,7:00pm,7,0,pm
3,0,11:15 pm,11,15,pm
4,0,08:10 am,8,10,am
4,1,9:00 am,9,0,am


## 4. Internationalization and issues with Non-ASCII Characters

there are words that have similar prunication: 
resume (e is french) \

* international language: chinese vs India \
written scripts: different languages \

Other character encoding \
* IBM EBCDIC \
* Latin-I \
* JIS: Japanese Industrial Standards
* CCCII: Chinese Character Code for Inforamtion Interchange
* EUC: Extendede Unix Code
* Numerous other national standards
* Unicode and UTF-8

Diversity in Text

# 5. Steps for data cleaning

1. Escaping HTML characters \
2. Decoding data \
3. Apostrophe lookup \ 
4. Removal of stop_words \
5. removal of punctuations \
6. removal of expressions \
7. split attached words \
8. Slangs loopup \
9. Standardizing words \
10. Removal of URLs  \


* 1. Grammer checking 2. Spelling correction

## Escaping HTML characters: 
Data obtained from web usually contains a lot of htmel entities like &lt; &gt; &amp. It is necessary to get rid of these entities. \
One approach is to directly remove them by using regular expression. \
Another approach is to use approptiate packages or modules

In [135]:
original_tweet = "“I luv my &lt;3 iphone &amp; you’re awsm apple. DisplayIsAwesome, sooo happppppy 🙂 http://www.apple.com"
original_tweet

'“I luv my &lt;3 iphone &amp; you’re awsm apple. DisplayIsAwesome, sooo happppppy 🙂 http://www.apple.com'

## Decoding data

UTF-8 encoding is widely accepted and is recoomended to use.

In [131]:
original_tweet.encode()

b'\xe2\x80\x9cI luv my &lt;3 iphone &amp; you\xe2\x80\x99re awsm apple. DisplayIsAwesome, sooo happppppy \xf0\x9f\x99\x82 http://www.apple.com'

In [132]:
tweet = original_tweet.encode().decode("utf8").encode('ascii', 'ignore')
tweet

b'I luv my &lt;3 iphone &amp; youre awsm apple. DisplayIsAwesome, sooo happppppy  http://www.apple.com'

## Apostrophe Lookup

In [136]:
APPOSTOPHES = {"'s" : " is", "'re" : " are", ...} ## Need a huge dictionary
#here is just an example of making up the dictionary : cover all the possible cases
words = tweet.split()
reformed = [APPOSTOPHES[word] if word in APPOSTOPHES else word for word in words]
reformed = " ".join(reformed)

SyntaxError: invalid syntax (1107857201.py, line 1)

In [138]:
text = ['This is dirty TEXT: A phone number +001234561234, moNey 3.333, some date like 09.08.2016 and weird Čárákterš.']
text

['This is dirty TEXT: A phone number +001234561234, moNey 3.333, some date like 09.08.2016 and weird Čárákterš.']

In [141]:
import re

def is_digit(word):
    try:
        int(word)
        return True
    except ValueError:
        return False

cedilla2latin = [[u'Á', u'A'], [u'á', u'a'], [u'Č', u'C'], [u'č', u'c'], [u'Š', u'S'], [u'š', u's']]
tr = dict([(a[0], a[1]) for (a) in cedilla2latin])

def transliterate(line):
    new_line = ""
    for letter in line:
        if letter in tr:
            new_line += tr[letter]
        else:
            new_line += letter
    return new_line

text = ['This is dirty TEXT: A phone number +001234561234, moNey 3.333, some date like 09.08.2016 and weird Čárákterš.']

for line in text:
    # decode line to worrk with utf8 symbols
    line = line.encode().decode('utf8')
    line = line.replace('+', ' ').replace('.', ' ').replace(',', ' ').replace(':', ' ')
    # remove digits with regex
    line = re.sub("(^|\W)\d+($|\W)", " ", line)
    # OR remove digits with casting to int
    new_line = []
    for word in line.split():
        if not is_digit(word):
            new_line.append(word)
    line = " ".join(new_line)
    # transliterate to Latin characters
    line = transliterate(line)
    line = line.lower()
    print (line)

this is dirty text a phone number money some date like and weird carakters


In [144]:
cedilla2latin = [[u'Á', u'A'], [u'á', u'a'], [u'Č', u'C'], [u'č', u'c'], [u'Š', u'S'], [u'š', u's']]
tr = dict([(a[0], a[1]) for (a) in cedilla2latin])
tr

{'Á': 'A', 'á': 'a', 'Č': 'C', 'č': 'c', 'Š': 'S', 'š': 's'}