In [None]:
import itertools
import re
from collections import Counter, defaultdict

import gensim
import matplotlib.pyplot as plt
import nltk
import pandas as pd
import wikipedia as wiki
from gensim.corpora.dictionary import Dictionary
from gensim.models.tfidfmodel import TfidfModel
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Analyzing one Wikipedia article

In [None]:
# Pick one random article title and fetch the page. A random title can resolve
# to a disambiguation page, which makes wiki.page raise DisambiguationError;
# fall back to the first listed option, the same way the multi-article cells do.
random_title = wiki.random(1)
try:
    page = wiki.page(random_title)
except wiki.DisambiguationError as e:
    page = wiki.page(e.options[0])

In [None]:
# Title of the fetched page.
print(page.title)

In [None]:
# URL of the fetched page.
print(page.url)

In [None]:
# Images listed on the page object.
print(page.images)

In [None]:
# Links listed on the page object.
print(page.links)

In [None]:
# Summary text of the page.
print(page.summary)

In [None]:
# Full text content of the page; this is the text that gets tokenized next.
print(page.content)

In [None]:
# Split the page's full text into tokens with NLTK's word tokenizer.
tokens = word_tokenize(page.content)

In [None]:
# Lowercase every token so counts are case-insensitive.
lower_tokens = list(map(str.lower, tokens))

In [None]:
# Initial bag of words: a count of every lowercase token, nothing filtered yet.
bag = Counter()
bag.update(lower_tokens)

In [None]:
# Show the raw counts; no cleaning has been applied to the tokens yet.
print(bag)

### Clean up and preprocessing to remove non-alphabetic characters, stop words, and to lemmatize.

In [None]:
# Keep only the tokens made up entirely of alphabetic characters.
alpha_only = list(filter(str.isalpha, lower_tokens))

In [None]:
# Removing English stop words.
# english_stops stays a list because other cells reference it by that name.
english_stops = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', 'couldn', 'didn', 'doesn', 'hadn', 'hasn', 'haven', 'isn', 'ma', 'mightn', 'mustn', 'needn', 'shan', 'shouldn', 'wasn', 'weren', 'won', 'wouldn', '']
# Membership tests against a set are O(1); the list would be rescanned for every token.
_english_stop_set = set(english_stops)
no_stops = [t for t in alpha_only if t not in _english_stop_set]

In [None]:
# Reduce each remaining token to its WordNet lemma.
word_lemm = WordNetLemmatizer()
lemm = list(map(word_lemm.lemmatize, no_stops))

In [None]:
# Creating an updated and improved bag of words from the cleaned, lemmatized tokens.
newbag = Counter(lemm)
# Show the ten most frequent lemmas together with their counts.
print(newbag.most_common(10))

You can see the most common words in this particular article and glean key points to aid in topic identification.

# Analyzing multiple articles

## Gensim

In [None]:
# Request 10 random article titles; the pages themselves are fetched in the cells below.
multiple_pages = wiki.random(10)

In [None]:
# Resolve each random title to a page and record the page's title.
# A disambiguation hit is resolved by taking its first listed option.
titles = []
for random_title in multiple_pages:
    try:
        resolved_page = wiki.page(random_title)
    except wiki.DisambiguationError as ambiguous:
        resolved_page = wiki.page(ambiguous.options[0])
    titles.append(resolved_page.title)

In [None]:
# Sanity check: number of titles collected.
print(len(titles))

In [None]:
# Fetch the full text of each random article, in the same order as multiple_pages.
# A disambiguation hit is resolved by taking its first listed option.
contentofpages = []
for random_title in multiple_pages:
    try:
        resolved_page = wiki.page(random_title)
    except wiki.DisambiguationError as ambiguous:
        resolved_page = wiki.page(ambiguous.options[0])
    contentofpages.append(resolved_page.content)

In [None]:
# Sanity check: number of article texts collected.
print(len(contentofpages))

In [None]:
def clean_article(text):
    """Return the cleaned, lemmatized tokens of one article.

    Applies the same steps as the single-article walkthrough: tokenize,
    lowercase, keep alphabetic tokens, drop stop words, lemmatize.

    Kept in a function so the intermediate lists stay local. A top-level loop
    would overwrite the single-article variables (tokens, lower_tokens,
    alpha_only, no_stops, lemm) and change the result of re-running those cells.

    Parameters
    ----------
    text : str
        Raw article text.

    Returns
    -------
    list of str
        Lemmas in their original order.
    """
    lowered = (token.lower() for token in word_tokenize(text))
    return [
        word_lemm.lemmatize(token)
        for token in lowered
        if token.isalpha() and token not in english_stops
    ]


# One cleaned token list per article, in the same order as contentofpages.
clean_articles = [clean_article(article) for article in contentofpages]

In [None]:
# Build a gensim Dictionary from the cleaned articles; it is used below to
# convert articles to bag-of-words form and to look up tokens by id.
dictionary = Dictionary(clean_articles)

In [None]:
# Gensim corpus: one bag-of-words (list of (token id, count) pairs) per article.
corpus = list(map(dictionary.doc2bow, clean_articles))

### Gensim Bag-of-Words

In [None]:
# Sum each token id's count across every article in the corpus.
total_word_count = defaultdict(int)
for article_bow in corpus:
    for word_id, word_count in article_bow:
        total_word_count[word_id] += word_count

In [None]:
# (token id, total count) pairs ordered from most to least frequent.
sorted_word_count = list(total_word_count.items())
sorted_word_count.sort(key=lambda pair: pair[1], reverse=True)

In [None]:
# Print the five most frequent tokens across the whole corpus with their counts.
for word_id, word_count in itertools.islice(sorted_word_count, 5):
    print(dictionary.get(word_id), word_count)

### TF-IDF

In [None]:
# Fit a gensim TF-IDF model on the bag-of-words corpus.
tfidf = TfidfModel(corpus)

In [None]:
# Example: weight the terms of the 3rd article (index 2) and order them
# from highest to lowest TF-IDF weight.
tfidf_weights = tfidf[corpus[2]]
sorted_tfidf_weights = list(tfidf_weights)
sorted_tfidf_weights.sort(key=lambda pair: pair[1], reverse=True)

In [None]:
# Print the five highest-weighted terms of that article with their weights.
for term_id, weight in itertools.islice(sorted_tfidf_weights, 5):
    print(dictionary.get(term_id), weight)

**You can see that the bag-of-words approach looks across the entire dataset to find the most common words. The TF-IDF method, on the other hand, allows us to focus on one particular article to determine how the terms in it are represented across the rest of the dataset.**

**Depending on our interests and use cases, you can see how both methods could be valuable.**

### Clustering (k-means)

In [None]:
def tokenize_and_stem(text):
    """Tokenize ``text`` and return its cleaned, lemmatized tokens.

    Steps: NLTK word tokenization, lowercasing, keeping alphabetic tokens,
    dropping entries of ``english_stops``, then WordNet lemmatization.
    Despite the name, tokens are lemmatized, not stemmed.

    The previous version ignored ``text``: it looped over the global
    ``contentofpages`` and returned inside the first iteration, so every call
    produced the first article's tokens.

    Parameters
    ----------
    text : str
        Raw text of a single document.

    Returns
    -------
    list of str
        Lemmas in document order (empty list for empty text).
    """
    lowered = (token.lower() for token in word_tokenize(text))
    return [
        word_lemm.lemmatize(token)
        for token in lowered
        if token.isalpha() and token not in english_stops
    ]

def tokenize_only(text):
    """Tokenize ``text`` and return its cleaned tokens without lemmatizing.

    Steps: NLTK word tokenization, lowercasing, keeping alphabetic tokens,
    and dropping entries of ``english_stops``.

    The previous version ignored ``text``: it looped over the global
    ``contentofpages`` and returned inside the first iteration, so every call
    produced the first article's tokens.

    Parameters
    ----------
    text : str
        Raw text of a single document.

    Returns
    -------
    list of str
        Filtered lowercase tokens in document order (empty list for empty text).
    """
    lowered = (token.lower() for token in word_tokenize(text))
    return [
        token
        for token in lowered
        if token.isalpha() and token not in english_stops
    ]

In [None]:
# Running vocabularies across all articles: lemmatized tokens and plain tokens.
totalvocab_stemmed = []
totalvocab_tokenized = []

for article_text in contentofpages:
    # Lemmatized tokens of this article, appended to the running list.
    totalvocab_stemmed += tokenize_and_stem(article_text)
    # Filtered (non-lemmatized) tokens of this article, appended likewise.
    totalvocab_tokenized += tokenize_only(article_text)

In [None]:
# Lookup table: the index holds the lemmatized token, the 'words' column the plain token.
vocab_frame = pd.DataFrame(data={'words': totalvocab_tokenized}, index=totalvocab_stemmed)
print(f'There are {len(vocab_frame)} items in vocab_frame.')

# Alex, this seems to be my problem area. Everything seems to cluster together, forming only one cluster.

In [None]:
# Define vectorizer parameters.
# Drop the notebook's stop words so the TF-IDF features match the other
# pipelines here, which all filter them out. The list is copied so the
# vectorizer receives a plain list.
tfidf_vectorizer = TfidfVectorizer(stop_words=list(english_stops))

# Fit the vectorizer to the article texts and build the document-term matrix.
tfidf_matrix = tfidf_vectorizer.fit_transform(contentofpages)

# (number of articles, number of distinct terms)
print(tfidf_matrix.shape)

In [None]:
# Vocabulary terms in column order of tfidf_matrix.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old method on older versions.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    terms = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    terms = tfidf_vectorizer.get_feature_names()

In [None]:
# Pairwise cosine distance between articles (1 - cosine similarity of their TF-IDF rows).
# cosine_similarity is imported in the notebook's top import cell.
dist = 1 - cosine_similarity(tfidf_matrix)

In [None]:
# Cluster the articles into 5 groups on their TF-IDF vectors.
# random_state makes the labels reproducible across runs; n_init is set
# explicitly because its default differs between scikit-learn versions.
km = KMeans(n_clusters=5, n_init=10, random_state=42)
km.fit(tfidf_matrix)

# One cluster label per article, in the same order as contentofpages.
clusters = km.labels_.tolist()

In [None]:
# Gather title, text and assigned cluster for every article.
cluster_articles = dict(title=titles, text=contentofpages, cluster=clusters)

# Rows are indexed by cluster label; column order is fixed explicitly.
frame = pd.DataFrame(
    data=cluster_articles,
    index=[clusters],
    columns=['title', 'text', 'cluster'],
)

In [None]:
# Number of articles assigned to each cluster label (labels run 0 to 4 for n_clusters=5).
frame['cluster'].value_counts()