# Tokenisation

#### Different examples of tokenisation in Python.

#### First, we need to import the nltk libraries and regular expression features

In [41]:
import nltk
import re
# Download the NLTK data packages this notebook depends on.  Packages that are
# already present are simply reported as up-to-date.
nltk.download('punkt')  # presumably the tokeniser models behind nltk.word_tokenize
nltk.download('averaged_perceptron_tagger')  # presumably the model behind nltk.pos_tag
nltk.download('tagsets')  # presumably the tag descriptions shown by nltk.help.upenn_tagset
nltk.download('wordnet')  # presumably the lexical database used by WordNetLemmatizer
nltk.download('omw-1.4')  # presumably companion WordNet data - TODO confirm it is still required
nltk.download('crubadan')  # presumably the language profiles used by TextCat

[nltk_data] Downloading package punkt to
[nltk_data]     /home/ivanyanakiev1/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/ivanyanakiev1/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package tagsets to
[nltk_data]     /home/ivanyanakiev1/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/ivanyanakiev1/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/ivanyanakiev1/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package crubadan to
[nltk_data]     /home/ivanyanakiev1/nltk_data...
[nltk_data]   Unzipping corpora/crubadan.zip.


True

#### Next, let's create some sample data to play with...

#### I used a couple of lines of text from a BBC sports page as an example.

In [8]:
# Sample passage (two sentences) reused by the tokenisation cells below.
# Adjacent string literals inside the parentheses are joined into one string.
text = (
    'The European Super League (ESL) is on "standby" despite nine of the 12 founding '
    'teams withdrawing, says Real Madrid president Florentino Perez.  After a furious '
    'backlash against the proposed tournament that was announced on Sunday, all six '
    'Premier League clubs involved withdrew on Tuesday.'
)

#### Now, we can tokenise by word.

In [9]:
# Word-level tokenisation of the sample passage using NLTK's tokeniser.
word_tokens = nltk.word_tokenize(text)
print(word_tokens)

['The', 'European', 'Super', 'League', '(', 'ESL', ')', 'is', 'on', '``', 'standby', "''", 'despite', 'nine', 'of', 'the', '12', 'founding', 'teams', 'withdrawing', ',', 'says', 'Real', 'Madrid', 'president', 'Florentino', 'Perez', '.', 'After', 'a', 'furious', 'backlash', 'against', 'the', 'proposed', 'tournament', 'that', 'was', 'announced', 'on', 'Sunday', ',', 'all', 'six', 'Premier', 'League', 'clubs', 'involved', 'withdrew', 'on', 'Tuesday', '.']


#### You will notice in the list above it treats punctuation as individual tokens.  It's easy enough to strip this out...

In [10]:
# Keep only the tokens made up entirely of alphabetic characters.
test = list(filter(str.isalpha, word_tokens))
print(test)

['The', 'European', 'Super', 'League', 'ESL', 'is', 'on', 'standby', 'despite', 'nine', 'of', 'the', 'founding', 'teams', 'withdrawing', 'says', 'Real', 'Madrid', 'president', 'Florentino', 'Perez', 'After', 'a', 'furious', 'backlash', 'against', 'the', 'proposed', 'tournament', 'that', 'was', 'announced', 'on', 'Sunday', 'all', 'six', 'Premier', 'League', 'clubs', 'involved', 'withdrew', 'on', 'Tuesday']


#### Well, the punctuation is gone but notice the 12 disappeared?  That's because it's not an alpha character.  Let's try again.


In [11]:
# Keep a token when it is entirely alphabetic or entirely numeric.
word_tokens = [
    token
    for token in word_tokens
    if token.isalpha() or token.isnumeric()
]
print(word_tokens)

['The', 'European', 'Super', 'League', 'ESL', 'is', 'on', 'standby', 'despite', 'nine', 'of', 'the', '12', 'founding', 'teams', 'withdrawing', 'says', 'Real', 'Madrid', 'president', 'Florentino', 'Perez', 'After', 'a', 'furious', 'backlash', 'against', 'the', 'proposed', 'tournament', 'that', 'was', 'announced', 'on', 'Sunday', 'all', 'six', 'Premier', 'League', 'clubs', 'involved', 'withdrew', 'on', 'Tuesday']


#### Another quick and dirty way to tokenise is just to split on whitespace...

In [12]:
# Calling split() with no argument breaks the text on runs of whitespace.
word_tokens2 = text.split()
print(word_tokens2)

['The', 'European', 'Super', 'League', '(ESL)', 'is', 'on', '"standby"', 'despite', 'nine', 'of', 'the', '12', 'founding', 'teams', 'withdrawing,', 'says', 'Real', 'Madrid', 'president', 'Florentino', 'Perez.', 'After', 'a', 'furious', 'backlash', 'against', 'the', 'proposed', 'tournament', 'that', 'was', 'announced', 'on', 'Sunday,', 'all', 'six', 'Premier', 'League', 'clubs', 'involved', 'withdrew', 'on', 'Tuesday.']


#### You'll notice that the punctuation is now attached to the individual terms.  We can still try to strip it out...

In [13]:
# Keep only the whitespace-split tokens that are entirely alphabetic.
test = list(filter(str.isalpha, word_tokens2))
print(test)

['The', 'European', 'Super', 'League', 'is', 'on', 'despite', 'nine', 'of', 'the', 'founding', 'teams', 'says', 'Real', 'Madrid', 'president', 'Florentino', 'After', 'a', 'furious', 'backlash', 'against', 'the', 'proposed', 'tournament', 'that', 'was', 'announced', 'on', 'all', 'six', 'Premier', 'League', 'clubs', 'involved', 'withdrew', 'on']


#### Oops, that seems to have gotten rid of any text containing punctuation, let's try again.

#### We will use a regular expression so that we end up just accepting words from the list.

#### Note, the 2nd parameter says that "" will be substituted for every character that matches the regular expression (here, any non-word character).  You'll need to use this with care in some tokenisation, as you may have punctuation on its own or other anomalies that will turn into "" entries in your list (which are easy enough to strip out at any rate).

In [14]:
# Delete every non-word character (anything other than a letter, digit or
# underscore) from each token.  The pattern is a raw string so that the regex
# escape \w reaches `re` unchanged instead of being read as an (invalid)
# Python string escape.  A token made up solely of punctuation becomes "".
word_tokens2 = [re.sub(r'[^\w]', "", word) for word in word_tokens2]
print(word_tokens2)

['The', 'European', 'Super', 'League', 'ESL', 'is', 'on', 'standby', 'despite', 'nine', 'of', 'the', '12', 'founding', 'teams', 'withdrawing', 'says', 'Real', 'Madrid', 'president', 'Florentino', 'Perez', 'After', 'a', 'furious', 'backlash', 'against', 'the', 'proposed', 'tournament', 'that', 'was', 'announced', 'on', 'Sunday', 'all', 'six', 'Premier', 'League', 'clubs', 'involved', 'withdrew', 'on', 'Tuesday']


#### That seems to have done the trick

#### You will see below that the two tokenised versions are now the same, just 2 different ways of doing it.

In [15]:
# Show both token lists, one per line, so they can be compared by eye.
print(word_tokens, word_tokens2, sep="\n")

['The', 'European', 'Super', 'League', 'ESL', 'is', 'on', 'standby', 'despite', 'nine', 'of', 'the', '12', 'founding', 'teams', 'withdrawing', 'says', 'Real', 'Madrid', 'president', 'Florentino', 'Perez', 'After', 'a', 'furious', 'backlash', 'against', 'the', 'proposed', 'tournament', 'that', 'was', 'announced', 'on', 'Sunday', 'all', 'six', 'Premier', 'League', 'clubs', 'involved', 'withdrew', 'on', 'Tuesday']
['The', 'European', 'Super', 'League', 'ESL', 'is', 'on', 'standby', 'despite', 'nine', 'of', 'the', '12', 'founding', 'teams', 'withdrawing', 'says', 'Real', 'Madrid', 'president', 'Florentino', 'Perez', 'After', 'a', 'furious', 'backlash', 'against', 'the', 'proposed', 'tournament', 'that', 'was', 'announced', 'on', 'Sunday', 'all', 'six', 'Premier', 'League', 'clubs', 'involved', 'withdrew', 'on', 'Tuesday']


#### The next step will be to lower case the terms...

In [16]:
# Normalise every token to lower case.
word_tokens = list(map(str.lower, word_tokens))
print(word_tokens)

['the', 'european', 'super', 'league', 'esl', 'is', 'on', 'standby', 'despite', 'nine', 'of', 'the', '12', 'founding', 'teams', 'withdrawing', 'says', 'real', 'madrid', 'president', 'florentino', 'perez', 'after', 'a', 'furious', 'backlash', 'against', 'the', 'proposed', 'tournament', 'that', 'was', 'announced', 'on', 'sunday', 'all', 'six', 'premier', 'league', 'clubs', 'involved', 'withdrew', 'on', 'tuesday']


#### You will also undoubtedly come across other cases, like contractions or other oddities

In [17]:
# Tokenise a sentence containing an apostrophised name and a contraction.
text = nltk.word_tokenize("O'Niell can't run.")
print(text)

["O'Niell", 'ca', "n't", 'run', '.']


#### So, O'Niell comes out fine, but the contraction has been split up?  This may or may not be useful.

#### Some NLP tools can deal with that type of input.  I generally avoid it which is one reason why splitting on space, in some cases, can make things easier

In [18]:
# Whitespace tokenisation leaves the contraction in one piece.
text = "O'Niell can't run."
text = text.split()
print(text)
# Strip every non-word character (apostrophes, the full stop) from each token.
# The pattern is a raw string so \w is passed to `re` as a regex escape rather
# than being read as an invalid Python string escape.
text = [re.sub(r'[^\w]', "", word) for word in text]
print(text)


["O'Niell", "can't", 'run.']
['ONiell', 'cant', 'run']


# Stemming and Lemmatisation

#### Let's run through some of the examples to see what happens if we stem/lemmatise them

#### First, we need to import some additional nltk libraries to work with this.

In [19]:
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

#### Let's look at the cases with punctuation we introduced above as well as some other "gotcha" cases

In [20]:
porter_stemmer = PorterStemmer()
# Stem each sample word in turn and print the result, one per line.
for sample in ("O'Niell", "can't", "cant'", "hers'", "hers", "university", "universe"):
  print(porter_stemmer.stem(sample))

o'niel
can't
cant'
hers'
her
univers
univers


#### Let's stem the longer text we tokenized earlier

In [21]:
# Print the lower-cased tokens, then the same tokens after Porter stemming.
print(word_tokens)
stemmed_text = list(map(porter_stemmer.stem, word_tokens))
print(stemmed_text)

['the', 'european', 'super', 'league', 'esl', 'is', 'on', 'standby', 'despite', 'nine', 'of', 'the', '12', 'founding', 'teams', 'withdrawing', 'says', 'real', 'madrid', 'president', 'florentino', 'perez', 'after', 'a', 'furious', 'backlash', 'against', 'the', 'proposed', 'tournament', 'that', 'was', 'announced', 'on', 'sunday', 'all', 'six', 'premier', 'league', 'clubs', 'involved', 'withdrew', 'on', 'tuesday']
['the', 'european', 'super', 'leagu', 'esl', 'is', 'on', 'standbi', 'despit', 'nine', 'of', 'the', '12', 'found', 'team', 'withdraw', 'say', 'real', 'madrid', 'presid', 'florentino', 'perez', 'after', 'a', 'furiou', 'backlash', 'against', 'the', 'propos', 'tournament', 'that', 'wa', 'announc', 'on', 'sunday', 'all', 'six', 'premier', 'leagu', 'club', 'involv', 'withdrew', 'on', 'tuesday']


#### Now let's apply lemmatisation.  

#### Remember, we need the part-of-speech (POS) tag for this to work properly, so let's get that first

In [24]:
# Tag every token with its part of speech; the result is a list of
# (token, tag) pairs.
POSTags = nltk.pos_tag(word_tokens)
print(POSTags)

[('the', 'DT'), ('european', 'JJ'), ('super', 'NN'), ('league', 'NN'), ('esl', 'NN'), ('is', 'VBZ'), ('on', 'IN'), ('standby', 'JJ'), ('despite', 'IN'), ('nine', 'CD'), ('of', 'IN'), ('the', 'DT'), ('12', 'CD'), ('founding', 'JJ'), ('teams', 'NNS'), ('withdrawing', 'VBG'), ('says', 'VBZ'), ('real', 'JJ'), ('madrid', 'JJ'), ('president', 'NN'), ('florentino', 'NN'), ('perez', 'NN'), ('after', 'IN'), ('a', 'DT'), ('furious', 'JJ'), ('backlash', 'NN'), ('against', 'IN'), ('the', 'DT'), ('proposed', 'VBN'), ('tournament', 'NN'), ('that', 'WDT'), ('was', 'VBD'), ('announced', 'VBN'), ('on', 'IN'), ('sunday', 'NN'), ('all', 'DT'), ('six', 'CD'), ('premier', 'JJR'), ('league', 'NN'), ('clubs', 'NNS'), ('involved', 'VBN'), ('withdrew', 'NN'), ('on', 'IN'), ('tuesday', 'NN')]


#### If you don't know what the POS tags mean, you can print out their descriptions

In [28]:
# Print the tagset description for the tag of every token, in sentence order
# (a tag that occurs several times is described several times).
for _, tag in POSTags:
  nltk.help.upenn_tagset(tag)


DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
JJ: adjective or numeral, ordinal
    third ill-mannered pre-war regrettable oiled calamitous first separable
    ectoplasmic battery-powered participatory fourth still-to-be-named
    multilingual multi-disciplinary ...
NN: noun, common, singular or mass
    common-carrier cabbage knuckle-duster Casino afghan shed thermostat
    investment slide humour falloff slick wind hyena override subhumanity
    machinist ...
NN: noun, common, singular or mass
    common-carrier cabbage knuckle-duster Casino afghan shed thermostat
    investment slide humour falloff slick wind hyena override subhumanity
    machinist ...
NN: noun, common, singular or mass
    common-carrier cabbage knuckle-duster Casino afghan shed thermostat
    investment slide humour falloff slick wind hyena override subhumanity
    machinist ...
VBZ: verb, present tense, 3rd person s

#### We will need to make a translation function in order to use our POS in the WordNet Lemmatiser as it uses a different set (or subset) of POS tags.

In [29]:
def get_wordnet_post(word):
  """Translate a POS-tagged token into the POS constant WordNet expects.

  Args:
    word: A (token, tag) tuple; word[0] is the token and word[1] its POS tag.

  Returns:
    wordnet.ADJ, wordnet.NOUN, wordnet.VERB or wordnet.ADV, chosen from the
    first letter of the tag (J, N, V or R).  Any other tag, including an
    empty one, falls back to wordnet.NOUN.
  """
  # Slice rather than index so an empty tag yields "" (and therefore the
  # NOUN default) instead of raising IndexError.
  tag = word[1][:1].upper()
  tag_dictionary = {
    "J": wordnet.ADJ,
    "N": wordnet.NOUN,
    "V": wordnet.VERB,
    "R": wordnet.ADV,
  }

  # Retrieve the value from the dictionary; if not found, default to NOUN.
  return tag_dictionary.get(tag, wordnet.NOUN)

#### Now we can lemmatise our text...

In [36]:
lemmatiser = WordNetLemmatizer()
print("Lemmatisation of the sentence: ")
# For every (token, tag) pair, report the token, its lemma and the WordNet
# POS that was used to look the lemma up.
for t in POSTags:
  term = t[0]
  wordnet_pos = get_wordnet_post(t)
  lemma = lemmatiser.lemmatize(term, pos=wordnet_pos)
  print(f"[{term}]:  {lemma} which is a {wordnet_pos}")

Lemmatisation of the sentence: 
[the]:  the which is a n
[european]:  european which is a a
[super]:  super which is a n
[league]:  league which is a n
[esl]:  esl which is a n
[is]:  be which is a v
[on]:  on which is a n
[standby]:  standby which is a a
[despite]:  despite which is a n
[nine]:  nine which is a n
[of]:  of which is a n
[the]:  the which is a n
[12]:  12 which is a n
[founding]:  founding which is a a
[teams]:  team which is a n
[withdrawing]:  withdraw which is a v
[says]:  say which is a v
[real]:  real which is a a
[madrid]:  madrid which is a a
[president]:  president which is a n
[florentino]:  florentino which is a n
[perez]:  perez which is a n
[after]:  after which is a n
[a]:  a which is a n
[furious]:  furious which is a a
[backlash]:  backlash which is a n
[against]:  against which is a n
[the]:  the which is a n
[proposed]:  propose which is a v
[tournament]:  tournament which is a n
[that]:  that which is a n
[was]:  be which is a v
[announced]:  announce 

#### So, using a lemmatiser is somewhat more work - in practice I find results between stemming/lemmatisation are usually pretty similar, not much to choose between them

#### Next, an example of what happens if you *don't* use a POS tag

In [37]:
# First call: lemmatise "better" while passing the adjective POS ("a").
print("better :", lemmatiser.lemmatize("better", pos="a"))
# Second call: same word with no POS argument supplied.
print("better :", lemmatiser.lemmatize("better"))

better : good
better : better


# Language Detection

#### To cap things off, we will look at a model that guesses the language of text.  

#### First, let's come up with some sample pieces of text

In [39]:
import pycountry

# Four short greetings, each written in a different language.
phrase_one = "good morning"  # English
phrase_two = "goeie more"  # Afrikaans
phrase_three = "buongiorno"  # Italian
phrase_four = "좋은 아침"  # Korean

#### Now, we will instantiate a text classification model and see what we come up with

In [42]:
tc = nltk.classify.textcat.TextCat()

# Guess the language of each phrase in order; each guess is a language code
# that is then looked up by its alpha_3 value below.
guesses = [
  tc.guess_language(phrase)
  for phrase in (phrase_one, phrase_two, phrase_three, phrase_four)
]
guess_one, guess_two, guess_three, guess_four = guesses

for guess in guesses:
  print(guess)


def _language_name(code):
  """Return the pycountry language name for an alpha_3 code.

  Falls back to the code itself when pycountry has no matching entry, so an
  unrecognised guess is still printed instead of failing on `None.name`.
  """
  language = pycountry.languages.get(alpha_3=code)
  return code if language is None else language.name


guess_names = [_language_name(guess) for guess in guesses]
guess_one_name, guess_two_name, guess_three_name, guess_four_name = guess_names

for guess_name in guess_names:
  print(guess_name)

eng
afr
ita
abk
English
Afrikaans
Italian
Abkhazian
