# Imports

In [39]:
import wikipedia
from collections import Counter
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer
from nltk.tokenize import regexp_tokenize
from nltk.collocations import BigramCollocationFinder
import string
import numpy as np
import operator

### Scrap wiki page and get keywords

In [24]:
def get_KeyWords(wiki_page, num_keywords, language, s_lang):
    #download wikipedia page
    wikipedia.set_lang(language)
    wiki = wikipedia.page(wiki_page)

    #initialize token's counter
    tknzr = WordPunctTokenizer()
    wiki_tokens = tknzr.tokenize(wiki.content)
    count = Counter(wiki_tokens)  
    
    #remove stop words
    count_filtered = {k.lower():v for k,v in count.items() if k.lower() not in stopwords.words(s_lang) 
                      and k not in string.punctuation}
    sorted_count = sorted(count_filtered.items(), key=operator.itemgetter(1), reverse=True)
    
    return [i[0] for i in sorted_count][:num_keywords]

In [29]:
get_KeyWords('Tunisian Revolution', 50, "en", 'english')

['tunisia',
 'ben',
 'ali',
 'tunisian',
 'government',
 '."',
 '==',
 'january',
 'said',
 'would',
 'arab',
 'february',
 'country',
 'security',
 'tunis',
 'rcd',
 'new',
 '2011',
 'december',
 'minister',
 'ghannouchi',
 'bouazizi',
 'president',
 'world',
 'later',
 'revolution',
 'mohamed',
 'called',
 'sidi',
 'bouzid',
 '000',
 'one',
 'mebazaa',
 'interim',
 '===',
 'forces',
 '2010',
 'power',
 'national',
 'foreign',
 'members',
 'demonstrations',
 'region',
 'arrested',
 'across',
 'announced',
 'emergency',
 'saying',
 'including',
 'zine']

In [26]:
get_KeyWords('Révolution tunisienne', 50, 'fr', 'french')

['les',
 '«',
 'ben',
 '’',
 'ali',
 'janvier',
 '===',
 '»',
 'tunisie',
 '2011',
 '====',
 '—',
 'a',
 'annonce',
 'ancien',
 'manifestants',
 'régime',
 'morts',
 '».',
 'police',
 'tunis',
 'tunisienne',
 '»,',
 'mohamed',
 'pays',
 'dont',
 'blessés',
 'contre',
 'tunisien',
 'ainsi',
 'fait',
 'sidi',
 'bouzid',
 'rcd',
 'ghannouchi',
 '==',
 'ugtt',
 'jour',
 'comme',
 'décembre',
 'départ',
 '17',
 '14',
 'février',
 'place',
 'état',
 '2010',
 'personnes',
 'jours',
 'ordre']

In [14]:
get_KeyWords('الثورة التونسية', 50, 'ar', 'arabic')

['،',
 'تونس',
 'الثورة',
 'بن',
 'علي',
 'التونسي',
 'التونسية',
 'ً',
 '2011',
 'ا',
 'الرئيس',
 'الأمن',
 'يناير',
 'محمد',
 'يوم',
 'كانون',
 'الاحتجاجات',
 'العربية',
 'العاصمة',
 'الشعب',
 'ديسمبر',
 'جانفي',
 '2010',
 'قوات',
 'الأول',
 'العابدين',
 'عدد',
 'عبد',
 'وقد',
 'سيدي',
 'زين',
 'السعودية',
 'الثاني',
 'المظاهرات',
 'النظام',
 'بوزيد',
 'وذلك',
 'ضد',
 'وفي',
 '17',
 'البوعزيزي',
 'مدن',
 'خلال',
 'الحكومة',
 'عربية',
 '14',
 'أعلن',
 'الأوضاع',
 'القذافي',
 'الجيش']

In [51]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk.metrics import BigramAssocMeasures

def get_bigrams(myString):
    tokenizer = WordPunctTokenizer()
    tokens = tokenizer.tokenize(myString)
    wordnet_lemmatizer = WordNetLemmatizer()
    bigram_finder = BigramCollocationFinder.from_words(tokens)
    bigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 500)

    for bigram_tuple in bigrams:
        x = "%s %s" % bigram_tuple
        tokens.append(x)

    result = [' '.join([wordnet_lemmatizer.lemmatize(w).lower() for w in x.split()]) for x in tokens if x.lower() not in stopwords.words('english') and len(x) > 8]
    return result



In [53]:
words = ['amazing', 'interesting', 'love', 'great', 'nice', 'really']

for w in words:
    tmp = wn.synsets(w)[0].pos()
    print( w, ":", tmp)

amazing : v
interesting : v
love : n
great : n
nice : n
really : r


In [47]:
tr = wikipedia.page('Tunisian Revolution').content
frq_words = get_bigrams(tr)

In [48]:
Counter(frq_words).most_common()

[('government', 40),
 ('protester', 23),
 ('revolution', 21),
 ('president', 20),
 ('ghannouchi', 16),
 ('following', 14),
 ('international', 14),
 ('political', 13),
 ('demonstration', 9),
 ('democratic', 8),
 ('announced', 8),
 ('emergency', 8),
 ('including', 7),
 ('election', 7),
 ('announces', 7),
 ('minister', 7),
 ('country', 6),
 ('solidarity', 6),
 ('authority', 6),
 ('continued', 6),
 ('criticised', 6),
 ('demonstrator', 5),
 ('democracy', 5),
 ('constitutional', 5),
 ('repression', 5),
 ('constitution', 5),
 ('corruption', 4),
 ('immolation', 4),
 ('supported', 4),
 ('activist', 4),
 ('headquarters', 4),
 ('islamists', 4),
 ('suggested', 4),
 ('unemployment', 3),
 ('officially', 3),
 ('opposition', 3),
 ('marginalisation', 3),
 ('confiscated', 3),
 ('official', 3),
 ('responded', 3),
 ('operation', 3),
 ('criticism', 3),
 ('television', 3),
 ('communication', 3),
 ('governor', 3),
 ('communist', 3),
 ('stability', 3),
 ('population', 3),
 ('presidential', 3),
 ('struggling',

Languages: en, fr, ar
Keywords:
- Tunisia: tunis+
- Egypt: egypt

# Helper Functions

# Main

# Reading a Single `.protostream` File

# Selecting filtering Keywords

Let's analyse the [Wikipedia page](https://en.wikipedia.org/wiki/Tunisian_Revolution) for the thematic we are interested in: