# Selecting filtering Keywords

Let's analyse the [Wikipedia page](https://en.wikipedia.org/wiki/Tunisian_Revolution) for the thematic we are interested in:

In [177]:
import wikipedia
from collections import Counter
from nltk.tokenize import *
from nltk.corpus import stopwords
from nltk.collocations import *
import string
import numpy as np
import operator

In [178]:
def get_KeyWords(wiki_page, num_keywords, language, stop_words_language):
    #download wikipedia page
    wikipedia.set_lang(language)
    wiki = wikipedia.page(wiki_page)

    #initialize token's counter
    tknzr = TweetTokenizer()
    wiki_tokens = tknzr.tokenize(wiki.content)
    count = Counter(wiki_tokens)  
    
    #remove stop words
    count_filtered = {k.lower():v for k,v in count.items() if k.lower() not in stopwords.words(stop_words_language) 
                      and k not in string.punctuation}
    sorted_count = sorted(count_filtered.items(), key=operator.itemgetter(1), reverse=True)
    
    return [i[0] for i in sorted_count][:num_keywords]

In [179]:
get_KeyWords('Tunisian Revolution', 10, "en", "english")

['ben',
 'tunisia',
 'tunisian',
 'ali',
 'government',
 'january',
 'said',
 'would',
 'arab',
 'february']

In [161]:
get_KeyWords('Revolução de Jasmim', 10, "pt", "portuguese")

['2011',
 'manifestações',
 'tunísia',
 'ben',
 'ali',
 'janeiro',
 'anos',
 'é',
 'governo',
 'país']

In [169]:
wiki = wikipedia.page("Tunisian Revolution")

In [196]:
bigram_measures = nltk.collocations.BigramAssocMeasures()

tokenizer = RegexpTokenizer(r'\w+')
tokens = tokenizer.tokenize(wiki.content)
tokens_filtered = [token for token in tokens if token.lower() not in set(stopwords.words("english"))]

finder = BigramCollocationFinder.from_words(tokens_filtered)
finder.apply_freq_filter(3)

scored = finder.score_ngrams(bigram_measures.raw_freq)
sorted(bigram for bigram, score in scored)  # doctest: +NORMALIZE_WHITESPACE

[('12', 'January'),
 ('14', 'January'),
 ('28', 'December'),
 ('Abidine', 'Ben'),
 ('Al', 'Jazeera'),
 ('Ali', 'fled'),
 ('Ali', 'rule'),
 ('Arab', 'world'),
 ('Ben', 'Ali'),
 ('December', '2010'),
 ('El', 'Abidine'),
 ('Ennahda', 'Movement'),
 ('February', '2011'),
 ('Foreign', 'Minister'),
 ('Fouad', 'Mebazaa'),
 ('Interior', 'Ministry'),
 ('January', '2011'),
 ('Jasmine', 'Revolution'),
 ('Mohamed', 'Bouazizi'),
 ('Mohamed', 'Ghannouchi'),
 ('President', 'Zine'),
 ('Prime', 'Minister'),
 ('RCD', 'party'),
 ('Saudi', 'Arabia'),
 ('Sidi', 'Bouzid'),
 ('Tunisian', 'National'),
 ('Tunisian', 'authorities'),
 ('United', 'States'),
 ('Zine', 'El'),
 ('across', 'Arab'),
 ('also', 'called'),
 ('also', 'said'),
 ('civil', 'war'),
 ('days', 'later'),
 ('government', 'announces'),
 ('interim', 'government'),
 ('later', 'released'),
 ('new', 'government'),
 ('prime', 'minister'),
 ('ruling', 'RCD'),
 ('security', 'forces'),
 ('security', 'services'),
 ('state', 'emergency'),
 ('tear', 'gas')]

Languages: en, fr, ar
Keywords:
- Tunisia: tunis+
- Egypt: egypt