In [24]:
import numpy as np
import pickle as pkl
import re

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from pyspark.mllib.clustering import KMeans
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
PUNCTUATION = set(string.punctuation)
STOPWORDS = set(stopwords.words('english'))

In [2]:
wiki_rdd = sc.textFile('s3n://wikisample10/sample2')
wiki_rdd.cache()
wiki_rdd.setName('wiki')

wiki MappedRDD[1] at textFile at NativeMethodAccessorImpl.java:-2

In [3]:
count = wiki_rdd.count()
f = open('count.txt', 'w')
f.write(count)

1524144

- **Take 10 points**

In [11]:
wiki_rdd_samples = sc.parallelize(wiki_rdd.take(10), 5)

- **Filter out articles that start with #REDIRECT**

In [12]:
wiki_rdd_samples.first()

u'#REDIRECT [[Computer accessibility]]  {{Redr|move|from CamelCase|up}}'

In [13]:
wiki_no_redirect_rdd = wiki_rdd_samples.filter(lambda line: '#REDIRECT' not in line)

In [14]:
no_redirect = wiki_no_redirect_rdd.first()
no_redirect[:100]

u'{{Redirect2|Anarchist|Anarchists|the fictional character|Anarchist (comics)|other uses|Anarchists (d'

- **Token and Stem each article**

In [15]:
def tokenize(text):
    regex = re.compile('<.+?>|[^a-zA-Z]')
    clean_txt = regex.sub(' ', text)
    tokens = clean_txt.split()
    lowercased = [t.lower() for t in tokens]

    no_punctuation = []
    for word in lowercased:
        punct_removed = ''.join([letter for letter in word if not letter in PUNCTUATION])
        no_punctuation.append(punct_removed)
    no_stopwords = [w for w in no_punctuation if not w in STOPWORDS]
    
    STEMMER = PorterStemmer()
    stemmed = [STEMMER.stem(w) for w in no_stopwords]
    return [w for w in stemmed if w]

In [16]:
token_rdd = wiki_no_redirect_rdd.map(tokenize)

In [17]:
tokens = token_rdd.first()
tokens[:10]

[u'redirect',
 u'anarchist',
 u'anarchist',
 u'fiction',
 u'charact',
 u'anarchist',
 u'comic',
 u'use',
 u'anarchist',
 u'disambigu']

- **Compute TF**

In [18]:
vocab = token_rdd.flatMap(lambda x: x).distinct().collect()

In [19]:
def get_tf(word_lst):
    count_of_each_word = Counter(word_lst)
    doc_word_count = len(word_lst) * 1.
    return np.array([count_of_each_word[v] / doc_word_count if v in count_of_each_word else 0 for v in vocab])
    
tf_rdd = token_rdd.map(get_tf)

In [23]:
tf_first = tf_rdd.first()
tf_first[:20]

[0.0002592352559948153,
 0.0005184705119896306,
 0.0003888528839922229,
 0,
 0.0005184705119896306,
 0.00019442644199611146,
 6.480881399870383e-05,
 6.480881399870383e-05,
 0.00012961762799740766,
 0,
 0.0005184705119896306,
 6.480881399870383e-05,
 6.480881399870383e-05,
 0,
 0.00012961762799740766,
 6.480881399870383e-05,
 6.480881399870383e-05,
 0.00012961762799740766,
 0.0003240440699935191,
 6.480881399870383e-05]

- **Compute IDF**

In [26]:
total_doc_count = tf_rdd.count()
times_words_in_doc = tf_rdd.map(lambda tf_lst: ((np.array(tf_lst) > 0) + 0)).sum()
idf = np.log(total_doc_count / times_words_in_doc)

- **Compute TF-IDF**

In [27]:
tfidf_rdd = tf_rdd.map(lambda tf_vec: tf_vec * idf)
tfidf_rdd.cache()
tfidf_rdd.setName('tfidf')

tfidf PythonRDD[36] at RDD at PythonRDD.scala:43

- **Run K-Means**

In [28]:
model = KMeans.train(tfidf_rdd, 10)
centriods = model.centers

In [29]:
centriods

[array([  1.79688187e-04,   0.00000000e+00,   2.69532280e-04, ...,
          4.49220467e-05,   0.00000000e+00,   0.00000000e+00]),
 array([ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.00021243])]