# Imports

In [39]:
import wikipedia
from collections import Counter
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer
from nltk.tokenize import regexp_tokenize
from nltk.collocations import BigramCollocationFinder
import string
import numpy as np
import operator

### Scrap wiki page and get keywords

In [24]:
def get_KeyWords(wiki_page, num_keywords, language, s_lang):
    #download wikipedia page
    wikipedia.set_lang(language)
    wiki = wikipedia.page(wiki_page)

    #initialize token's counter
    tknzr = WordPunctTokenizer()
    wiki_tokens = tknzr.tokenize(wiki.content)
    count = Counter(wiki_tokens)  
    
    #remove stop words
    count_filtered = {k.lower():v for k,v in count.items() if k.lower() not in stopwords.words(s_lang) 
                      and k not in string.punctuation}
    sorted_count = sorted(count_filtered.items(), key=operator.itemgetter(1), reverse=True)
    
    return [i[0] for i in sorted_count][:num_keywords]

In [29]:
get_KeyWords('Tunisian Revolution', 50, "en", 'english')

['tunisia',
 'ben',
 'ali',
 'tunisian',
 'government',
 '."',
 '==',
 'january',
 'said',
 'would',
 'arab',
 'february',
 'country',
 'security',
 'tunis',
 'rcd',
 'new',
 '2011',
 'december',
 'minister',
 'ghannouchi',
 'bouazizi',
 'president',
 'world',
 'later',
 'revolution',
 'mohamed',
 'called',
 'sidi',
 'bouzid',
 '000',
 'one',
 'mebazaa',
 'interim',
 '===',
 'forces',
 '2010',
 'power',
 'national',
 'foreign',
 'members',
 'demonstrations',
 'region',
 'arrested',
 'across',
 'announced',
 'emergency',
 'saying',
 'including',
 'zine']

In [26]:
get_KeyWords('Révolution tunisienne', 50, 'fr', 'french')

['les',
 '«',
 'ben',
 '’',
 'ali',
 'janvier',
 '===',
 '»',
 'tunisie',
 '2011',
 '====',
 '—',
 'a',
 'annonce',
 'ancien',
 'manifestants',
 'régime',
 'morts',
 '».',
 'police',
 'tunis',
 'tunisienne',
 '»,',
 'mohamed',
 'pays',
 'dont',
 'blessés',
 'contre',
 'tunisien',
 'ainsi',
 'fait',
 'sidi',
 'bouzid',
 'rcd',
 'ghannouchi',
 '==',
 'ugtt',
 'jour',
 'comme',
 'décembre',
 'départ',
 '17',
 '14',
 'février',
 'place',
 'état',
 '2010',
 'personnes',
 'jours',
 'ordre']

In [14]:
get_KeyWords('الثورة التونسية', 50, 'ar', 'arabic')

['،',
 'تونس',
 'الثورة',
 'بن',
 'علي',
 'التونسي',
 'التونسية',
 'ً',
 '2011',
 'ا',
 'الرئيس',
 'الأمن',
 'يناير',
 'محمد',
 'يوم',
 'كانون',
 'الاحتجاجات',
 'العربية',
 'العاصمة',
 'الشعب',
 'ديسمبر',
 'جانفي',
 '2010',
 'قوات',
 'الأول',
 'العابدين',
 'عدد',
 'عبد',
 'وقد',
 'سيدي',
 'زين',
 'السعودية',
 'الثاني',
 'المظاهرات',
 'النظام',
 'بوزيد',
 'وذلك',
 'ضد',
 'وفي',
 '17',
 'البوعزيزي',
 'مدن',
 'خلال',
 'الحكومة',
 'عربية',
 '14',
 'أعلن',
 'الأوضاع',
 'القذافي',
 'الجيش']

In [138]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk.metrics import BigramAssocMeasures

def get_bigrams(myString):
    tokenizer = WordPunctTokenizer()
    tokens = tokenizer.tokenize(myString)
    wordnet_lemmatizer = WordNetLemmatizer()
    tokens = [wordnet_lemmatizer.lemmatize(w).lower() for w in tokens]
    tokens_tag = {k:v for (k,v) in nltk.pos_tag(tokens)}
    
    bigram_finder = BigramCollocationFinder.from_words(tokens)
    bigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 500)
    
    results = [x for x in tokens if verb_adverb(x, tokens_tag)]
    for bigram_tuple in bigrams:
        (w1, w2) = bigram_tuple
        if not verb_adverb(w1, tokens_tag) and not verb_adverb(w2, tokens_tag):
            x = "%s %s" % bigram_tuple
            results.append(x)

    #result = [x for x in tokens if x.lower() not in stopwords.words('english') and len(x) > 4]
    return results

def verb_adverb(word, tags):
    adv_verb = ['R', 'V']
    return len(word)>2 and tags[word][0] not in adv_verb and word not in string.punctuation and word not in stopwords.words('english')


In [139]:
tr = wikipedia.page('Tunisian Revolution').content
frq_unigrams = get_bigrams(tr)
Counter(frq_unigrams).most_common()

[('tunisia', 63),
 ('protest', 58),
 ('ben', 57),
 ('ali', 53),
 ('government', 40),
 ('tunisian', 39),
 ('minister', 35),
 ('january', 31),
 ('police', 29),
 ('country', 24),
 ('state', 24),
 ('protester', 23),
 ('revolution', 21),
 ('day', 21),
 ('party', 21),
 ('would', 21),
 ('president', 20),
 ('arab', 20),
 ('new', 20),
 ('february', 20),
 ('security', 18),
 ('tunis', 18),
 ('rcd', 17),
 ('people', 17),
 ('2011', 16),
 ('december', 16),
 ('national', 16),
 ('ghannouchi', 16),
 ('bouazizi', 15),
 ('medium', 15),
 ('foreign', 15),
 ('world', 14),
 ('two', 14),
 ('international', 14),
 ('political', 13),
 ('force', 13),
 ('former', 13),
 ('election', 12),
 ('power', 12),
 ('mohamed', 11),
 ('sidi', 11),
 ('bouzid', 11),
 ('regime', 11),
 ('000', 11),
 ('civil', 10),
 ('three', 10),
 ('one', 10),
 ('mebazaa', 10),
 ('member', 10),
 ('movement', 10),
 ('interim', 10),
 ('demonstration', 9),
 ('2010', 9),
 ('change', 9),
 ('event', 9),
 ('youth', 9),
 ('leader', 9),
 ('prime', 9),
 ('d

In [111]:
w = ['really', 'happy']
tmp = nltk.pos_tag(w)
print(tmp)

[('really', 'RB'), ('happy', 'JJ')]


In [48]:
Counter(frq_words).most_common()

[('government', 40),
 ('protester', 23),
 ('revolution', 21),
 ('president', 20),
 ('ghannouchi', 16),
 ('following', 14),
 ('international', 14),
 ('political', 13),
 ('demonstration', 9),
 ('democratic', 8),
 ('announced', 8),
 ('emergency', 8),
 ('including', 7),
 ('election', 7),
 ('announces', 7),
 ('minister', 7),
 ('country', 6),
 ('solidarity', 6),
 ('authority', 6),
 ('continued', 6),
 ('criticised', 6),
 ('demonstrator', 5),
 ('democracy', 5),
 ('constitutional', 5),
 ('repression', 5),
 ('constitution', 5),
 ('corruption', 4),
 ('immolation', 4),
 ('supported', 4),
 ('activist', 4),
 ('headquarters', 4),
 ('islamists', 4),
 ('suggested', 4),
 ('unemployment', 3),
 ('officially', 3),
 ('opposition', 3),
 ('marginalisation', 3),
 ('confiscated', 3),
 ('official', 3),
 ('responded', 3),
 ('operation', 3),
 ('criticism', 3),
 ('television', 3),
 ('communication', 3),
 ('governor', 3),
 ('communist', 3),
 ('stability', 3),
 ('population', 3),
 ('presidential', 3),
 ('struggling',

Languages: en, fr, ar
Keywords:
- Tunisia: tunis+
- Egypt: egypt

# Helper Functions

# Main

# Reading a Single `.protostream` File

# Selecting filtering Keywords

Let's analyse the [Wikipedia page](https://en.wikipedia.org/wiki/Tunisian_Revolution) for the thematic we are interested in: