# Selecting filtering Keywords

Let's analyse the [Wikipedia page](https://en.wikipedia.org/wiki/Tunisian_Revolution) for the thematic we are interested in:

In [50]:
import wikipedia
from collections import Counter
from nltk.tokenize import *
from nltk.corpus import stopwords
from nltk.collocations import BigramAssocMeasures
import string
import numpy as np
import operator

In [16]:
def get_KeyWords(wiki_page, num_keywords, language, stop_words_language):
    #download wikipedia page
    wikipedia.set_lang(language)
    wiki = wikipedia.page(wiki_page)

    #initialize token's counter
    tknzr = TweetTokenizer()
    wiki_tokens = tknzr.tokenize(wiki.content)
    count = Counter(wiki_tokens)  
    
    #remove stop words
    count_filtered = {k.lower():v for k,v in count.items() if k.lower() not in set(stopwords.words(stop_words_language)) 
                      and k not in string.punctuation}
    sorted_count = sorted(count_filtered.items(), key=operator.itemgetter(1), reverse=True)
    
    return [i[0] for i in sorted_count][:num_keywords]

In [17]:
get_KeyWords('Tunisian Revolution', 10, "en", "english")

['ben',
 'tunisia',
 'tunisian',
 'ali',
 'government',
 'january',
 'said',
 'would',
 'arab',
 'february']

In [18]:
get_KeyWords('Revolução de Jasmim', 10, "pt", "portuguese")

['2011',
 'manifestações',
 'tunísia',
 'ben',
 'ali',
 'janeiro',
 'anos',
 'é',
 'governo',
 'país']

In [52]:
def bigrams(wiki_page, num_keywords, language, stop_words_language, treshold):
    #download wikipedia page
    wikipedia.set_lang(language)
    wiki = wikipedia.page(wiki_page)
    
    bigram_measures = BigramAssocMeasures
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(wiki.content)
    tokens_filtered = [token for token in tokens if token.lower() not in set(stopwords.words(stop_words_language))]
    
    finder = BigramCollocationFinder.from_words(tokens_filtered)
    finder.apply_freq_filter(treshold)
    
    scored = finder.score_ngrams(bigram_measures.raw_freq)
    
    return sorted(bigram for bigram, score in scored)  # doctest: +NORMALIZE_WHITESPACE

In [53]:
bigrams("Tunisian Revolution", 30, "en", "english", treshold = 4)

[('14', 'January'),
 ('Abidine', 'Ben'),
 ('Al', 'Jazeera'),
 ('Arab', 'world'),
 ('Ben', 'Ali'),
 ('December', '2010'),
 ('El', 'Abidine'),
 ('Fouad', 'Mebazaa'),
 ('January', '2011'),
 ('Jasmine', 'Revolution'),
 ('Mohamed', 'Bouazizi'),
 ('President', 'Zine'),
 ('Prime', 'Minister'),
 ('Saudi', 'Arabia'),
 ('Sidi', 'Bouzid'),
 ('United', 'States'),
 ('Zine', 'El'),
 ('civil', 'war'),
 ('days', 'later'),
 ('interim', 'government'),
 ('new', 'government'),
 ('security', 'forces'),
 ('state', 'emergency'),
 ('tear', 'gas')]

Languages: en, fr, ar
Keywords:
- Tunisia: tunis+
- Egypt: egypt