# Handling text in python

In [1]:
# Sample sentence used by the first examples
text = (
    "Ethics are built right into the ideals "
    "and objectives of the United Nations."
)
# Tokenize on single spaces; punctuation stays attached to its word
words = text.split(sep=" ")

In [2]:
# Display the list of tokens obtained by splitting the sentence on spaces
words

['Ethics',
 'are',
 'built',
 'right',
 'into',
 'the',
 'ideals',
 'and',
 'objectives',
 'of',
 'the',
 'United',
 'Nations.']

In [3]:
# Keep only the tokens that are longer than 3 characters
list(filter(lambda token: len(token) > 3, words))

['Ethics',
 'built',
 'right',
 'into',
 'ideals',
 'objectives',
 'United',
 'Nations.']

In [4]:
# Tokens written in title case (first letter upper case, the rest lower case)
list(filter(str.istitle, words))

['Ethics', 'United', 'Nations.']

In [5]:
# Tokens whose last character is "s".
# 'Nations.' is not selected: its trailing period is part of the token.
[token for token in words if token[-1:] == "s"]

['Ethics', 'ideals', 'objectives']

In [6]:
# Second sample: a sentence in which the same word appears with different casing
text2 = "To be or not to be"
# Tokenize on single spaces
words2 = text2.split(sep=" ")

In [7]:
# First line: total number of tokens; second line: number of distinct tokens.
# Note that both lines are printed under the same "text length" label.
print(
    f"text length: {len(words2)}",
    f"text length: {len(set(words2))}",
    sep="\n",
)

text length: 6
text length: 5


In [9]:
# Distinct tokens; the comparison is case-sensitive, so 'To' and 'to' both remain
{*words2}

{'To', 'be', 'not', 'or', 'to'}

In [10]:
# "To" is capitalized and "to" is not, so the set does not identify them
# as the same word.
# Solution: lower-case every token before building the set.
{token.lower() for token in words2}

{'be', 'not', 'or', 'to'}

## For comparisons
- s.startswith(t)
- s.endswith(t)
- t in s
- s.isupper(), s.islower(), s.istitle()
- s.isalpha(), s.isdigit(), s.isalnum()

## String operations
- s.lower(), s.upper(), s.title()
- s.split(t)
- s.splitlines()    -> it will split the string at line boundaries
- s.join(t) -> it will join the strings in t, using s as the separator
- s.strip() -> it will remove the whitespace characters from both ends of the string
- s.rstrip() -> it will remove all spaces, whitespace characters, and tabs from the end of the string
- s.find(t)
- s.rfind(t) -> it will find a token/character from the end of the string
- s.replace(u, v)

In [11]:
# Splitting on a separator and joining with the same separator round-trips
text3 = "ouagadougou"
# A separator at the very start or end yields an empty string at that position
words3 = text3.split(sep="ou")
str.join("ou", words3)

'ouagadougou'

In [13]:
# Sample sentence for the find / rfind examples
text4 = (
    "A quick brown fox "
    "jumper over the lazy dog"
)
# Index of the first "o", searching from the left
text4.find("o", 0)

10

In [14]:
# Index of the last "o": rfind searches from the end of the string
text4.rfind("o")

40

In [15]:
# Both methods return the index where the match starts, or -1 if it is not found

## File operations
- f = open(filename, mode)
- f.readline(), f.read(), f.read(n)
- for line in f: doSomething(line)
- f.seek(n)
- f.write(message)
- f.close()
- f.closed

## Reading files line by line
- f = open("UNDHR.txt", "r")

## Reading the full file
- f.seek(0)
- text = f.read()

# Regular expressions

In [16]:
# Tweet-like sample containing a quote, a hashtag, an "@" and a short URL
text5 = (
    '"Ethics are built right into the ideals and objectives of the United Nations" '
    '#UNSG @ NY Society for Ethical Culture bit.ly/2guVelr'
)
# Tokenize on single spaces
text6 = text5.split(sep=' ')

text6

['"Ethics',
 'are',
 'built',
 'right',
 'into',
 'the',
 'ideals',
 'and',
 'objectives',
 'of',
 'the',
 'United',
 'Nations"',
 '#UNSG',
 '@',
 'NY',
 'Society',
 'for',
 'Ethical',
 'Culture',
 'bit.ly/2guVelr']

In [19]:
# Hashtags: tokens whose first character is "#"
[token for token in text6 if token[:1] == "#"]

['#UNSG']

In [20]:
# Callouts: tokens that begin with "@" (a bare "@" token qualifies as well)
list(filter(lambda token: token.startswith("@"), text6))

['@']

In [21]:
# Same tweet, now preceded by two real callouts
text7 = (
    '@UN @UN_Women "Ethics are built right into the ideals and objectives of the United Nations" '
    '#UNSG @ NY Society for Ethical Culture bit.ly/2guVelr'
)
# Tokenize on single spaces
text8 = text7.split(sep=' ')

In [22]:
import re

# Keep the tokens containing "@" followed by at least one letter, digit or
# underscore; a bare "@" therefore no longer matches
list(filter(lambda token: re.search("@[A-Za-z0-9_]+", token), text8))

['@UN', '@UN_Women']

## Meta-characters: Character matches
- . : wildcard, matches a single character
- ^ : start of a string
- $ : end of a string
- [] : matches one of the set of characters within []
- [a-z] : matches one of the range of characters a, b, ..., z
- [^abc] : matches a character that is not a, b or c
- a|b : matches either a or b, where a and b are strings
- () : scoping for operators
- \ : Escape character for special characters (\t, \n, \b)
- \b : matches a word boundary
- \d : any digit, equivalent to [0-9]
- \D : any non-digit, equivalent to [^0-9]
- \s : any whitespace, equivalent to [ \t\n\r\f\v]
- \S : any non-whitespace, equivalent to [^ \t\n\r\f\v]
- \w : alphanumeric character, equivalent to [a-zA-Z0-9_]
- \W : non-alphanumeric character, equivalent to [^a-zA-Z0-9_]

## Meta-characters: Repetitions
- \* : matches zero or more occurrences
- \+ : matches one or more occurrences
- ? : matches zero or one occurrence
- {n} : exactly n repetitions, n >= 0
- {n,} : at least n repetitions
- {,n} : at most n repetitions
- {m,n} : at least m and at most n repetitions

In [23]:
# Same callout search using the \w shorthand for [a-zA-Z0-9_].
# The pattern is a raw string so that the backslash reaches the regex engine
# instead of being parsed as an (invalid) string escape sequence.
[w for w in text8 if re.search(r"@\w+", w)]

['@UN', '@UN_Women']

In [25]:
# NOTE: this rebinds text5, which held the tweet text in an earlier cell
text5 = "ouagadougou"
# Every vowel, in order of appearance
re.compile("[aeiou]").findall(text5)

['o', 'u', 'a', 'a', 'o', 'u', 'o', 'u']

In [26]:
# Every character that is not a vowel; for this all-letter word, the consonants
re.compile("[^aeiou]").findall(text5)

['g', 'd', 'g']

# Working with Text Data in pandas

In [28]:
import pandas as pd

# One free-text reminder per weekday, each mentioning at least one clock time
time_sentences = [
    "Monday: The doctor's appointment is at 2:45pm.",
    "Tuesday: The dentist's appointment is at 11:30 am.",
    "Wednesday: At 7:00pm, there is a basketball game!",
    "Thursday: Be back home by 11:15 pm at the latest.",
    "Friday: Take the train at 08:10 am, arrive at 09:00am.",
]

# Single-column frame; the dict key becomes the column name
df = pd.DataFrame({"text": time_sentences})
df

Unnamed: 0,text
0,Monday: The doctor's appointment is at 2:45pm.
1,Tuesday: The dentist's appointment is at 11:30...
2,"Wednesday: At 7:00pm, there is a basketball game!"
3,Thursday: Be back home by 11:15 pm at the latest.
4,"Friday: Take the train at 08:10 am, arrive at ..."


In [29]:
# The .str accessor exposes vectorized string methods on a pandas Series.
# Here: number of characters in every row.
df["text"].str.len()

0    46
1    50
2    49
3    49
4    54
Name: text, dtype: int64

In [30]:
# Split every row on whitespace, then count the resulting tokens
df["text"].str.split().str.len()

0     7
1     8
2     8
3    10
4    10
Name: text, dtype: int64

In [31]:
# Boolean mask: True for the rows whose text contains the pattern
df["text"].str.contains("appointment")

0     True
1     True
2    False
3    False
4    False
Name: text, dtype: bool

In [33]:
# Count how many digits appear in every row.
# Raw string: "\d" must reach the regex engine as-is; in a normal string
# literal it is an invalid escape sequence and triggers a warning.
df.text.str.count(r"\d")

0    3
1    4
2    3
3    4
4    8
Name: text, dtype: int64

In [34]:
# List every digit found in each row (raw string keeps "\d" intact for the
# regex engine instead of relying on an invalid string escape)
df.text.str.findall(r"\d")

0                   [2, 4, 5]
1                [1, 1, 3, 0]
2                   [7, 0, 0]
3                [1, 1, 1, 5]
4    [0, 8, 1, 0, 0, 9, 0, 0]
Name: text, dtype: object

In [35]:
# Pull out hour and minutes from each string.
# Two capture groups -> every match is returned as an (hour, minute) tuple.
# Raw string so the "\d" escapes are passed unchanged to the regex engine.
df.text.str.findall(r"(\d?\d):(\d\d)")

0               [(2, 45)]
1              [(11, 30)]
2               [(7, 00)]
3              [(11, 15)]
4    [(08, 10), (09, 00)]
Name: text, dtype: object

In [38]:
# Let's replace every weekday with a placeholder.
# regex=True is required: since pandas 2.0, str.replace treats the pattern as
# a literal string by default, which would leave every row unchanged.
# Raw string so that "\w" is not parsed as an (invalid) string escape.
df.text.str.replace(r"\w+day", "???", regex=True)

  


0          ???: The doctor's appointment is at 2:45pm.
1       ???: The dentist's appointment is at 11:30 am.
2          ???: At 7:00pm, there is a basketball game!
3         ???: Be back home by 11:15 pm at the latest.
4    ???: Take the train at 08:10 am, arrive at 09:...
Name: text, dtype: object

In [46]:
# Replace every weekday by its three-letter abbreviation.
# The callable receives the regex match object and returns the replacement.
# regex=True is required: pandas raises ValueError for a callable replacement
# when regex=False, which is the default since pandas 2.0.
df.text.str.replace(r"(\w+day)", lambda match: match.group(1)[:3], regex=True)

  


0          Mon: The doctor's appointment is at 2:45pm.
1       Tue: The dentist's appointment is at 11:30 am.
2          Wed: At 7:00pm, there is a basketball game!
3         Thu: Be back home by 11:15 pm at the latest.
4    Fri: Take the train at 08:10 am, arrive at 09:...
Name: text, dtype: object

In [47]:
# Create new columns by extracting groups: one column per capture group.
# The extract method only returns the first occurrence in each row.
# Raw string so the "\d" escapes are passed unchanged to the regex engine.
df.text.str.extract(r"(\d?\d):(\d\d)")

Unnamed: 0,0,1
0,2,45
1,11,30
2,7,0
3,11,15
4,8,10


In [50]:
# extractall returns every occurrence, one result row per match.
# Groups: 0 = whole time, 1 = hour, 2 = minute, 3 = am/pm.
# Raw string so the "\d" escapes are passed unchanged to the regex engine.
df.text.str.extractall(r"((\d?\d):(\d\d) ?([ap]m))")

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3
Unnamed: 0_level_1,match,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,2:45pm,2,45,pm
1,0,11:30 am,11,30,am
2,0,7:00pm,7,0,pm
3,0,11:15 pm,11,15,pm
4,0,08:10 am,8,10,am
4,1,09:00am,9,0,am


In [52]:
# Named groups: (?P<name>...) makes the group names the output column names.
# Raw string so the "\d" escapes are passed unchanged to the regex engine.
df.text.str.extractall(
    r"(?P<time>(?P<hour>\d?\d):(?P<minute>\d\d) ?(?P<period>[ap]m))"
)

Unnamed: 0_level_0,Unnamed: 1_level_0,time,hour,minute,period
Unnamed: 0_level_1,match,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,2:45pm,2,45,pm
1,0,11:30 am,11,30,am
2,0,7:00pm,7,0,pm
3,0,11:15 pm,11,15,pm
4,0,08:10 am,8,10,am
4,1,09:00am,9,0,am


# ASCII

- American Standard Code for Information Interchange
- 7 bit character encoding standard: 128 valid codes
- Includes alphabets (upper and lower cases), digits, punctuations, common symbols, control characters


## Unicode
- Industry standard for encoding and representing text
- Over 128,000 characters from 130+ scripts and symbol sets
- Can be implemented by different character encodings
    - UTF-8: the most common one

## UTF-8
- Unicode Transformation Format - 8 bits
- Variable-length encoding: one to four bytes
- Backward compatible with ASCII
- Dominant character encoding for the Web
- Default in Python 3 (not the case for Python 2)