In [1]:
import wikipedia as wiki
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import gensim
from gensim.corpora.dictionary import Dictionary
from collections import Counter, defaultdict
import matplotlib.pyplot as plt
import itertools
from gensim.models.tfidfmodel import TfidfModel

# Analyzing one Wikipedia article

In [2]:
# Pick one random article title and load that exact page.
# auto_suggest=False: by default wiki.page() may replace the requested title with
# a search suggestion, which can silently load a different article than the one
# wiki.random() returned.
page = wiki.page(wiki.random(1), auto_suggest=False)

In [3]:
# Title of the randomly selected page.
print(page.title)

Willard Memorial Chapel-Welch Memorial Hall


In [4]:
# URL of the page on en.wikipedia.org.
print(page.url)

https://en.wikipedia.org/wiki/Willard_Memorial_Chapel-Welch_Memorial_Hall


In [5]:
# List of image URLs associated with the page (can be long).
print(page.images)

['https://upload.wikimedia.org/wikipedia/commons/2/21/Church.svg', 'https://upload.wikimedia.org/wikipedia/commons/1/1a/Flag_of_New_York.svg', 'https://upload.wikimedia.org/wikipedia/commons/b/ba/Map_of_USA_NY.svg', 'https://upload.wikimedia.org/wikipedia/commons/0/0c/Red_pog.svg', 'https://upload.wikimedia.org/wikipedia/commons/c/c5/US-NationalParkService-ShadedLogo.svg', 'https://upload.wikimedia.org/wikipedia/commons/8/8d/USA_New_York_location_map.svg', 'https://upload.wikimedia.org/wikipedia/commons/2/20/Usa_edcp_location_map.svg', 'https://upload.wikimedia.org/wikipedia/commons/4/41/WillardChapel.JPG', 'https://upload.wikimedia.org/wikipedia/commons/e/e8/Willard_Chapel_rear_interior.jpg', 'https://upload.wikimedia.org/wikipedia/commons/0/01/Willard_Chapel_window_detail_1.jpg', 'https://upload.wikimedia.org/wikipedia/commons/8/83/Willard_etc_025.jpg', 'https://upload.wikimedia.org/wikipedia/commons/1/1e/Willard_etc_039.jpg', 'https://upload.wikimedia.org/wikipedia/commons/4/4d/Will

In [6]:
# List of titles of the pages this article links to (can be very long).
print(page.links)

['Auburn, NY', 'Auburn, New York', 'Auburn Theological Seminary', 'Cayuga County, New York', 'Chapel', 'Contributing property', 'Dr. Sylvester Willard Mansion', 'Geographic coordinate system', 'Historic districts in the United States', 'History of the National Register of Historic Places', 'Keeper of the Register', 'List of National Historic Landmarks in New York', 'List of bridges and tunnels on the National Register of Historic Places in New York', 'Louis Comfort Tiffany', 'National Historic Landmark', 'National Park Service', 'National Register of Historic Places', 'National Register of Historic Places listings in Albany, New York', 'National Register of Historic Places listings in Albany County, New York', 'National Register of Historic Places listings in Allegany County, New York', 'National Register of Historic Places listings in Brooklyn', 'National Register of Historic Places listings in Broome County, New York', 'National Register of Historic Places listings in Buffalo, New Yo

In [7]:
# Plain-text summary of the article.
print(page.summary)

The Willard Memorial Chapel and the adjoining Welch Memorial Hall are located in Auburn, Cayuga County, New York state.


In [8]:
# Full plain-text content; section headings appear as "== Heading ==" markers,
# which is why '==' shows up as a token in the raw bag of words further down.
print(page.content)

The Willard Memorial Chapel and the adjoining Welch Memorial Hall are located in Auburn, Cayuga County, New York state.


== Architecture ==
The chapel and hall were designed by Warner & Brockett of Rochester, New York. They feature the stained-glass windows and interior decoration of Louis Comfort Tiffany. They are the last surviving complete installation by Tiffany in its original location.
The chapel was built between 1892 and 1894 in memory of Dr. Sylvester Willard and his wife, Jane Frances Case Willard. Funding was provided by their daughters, Caroline and Georgiana. The chapel was a part of the former Auburn Theological Seminary.


== Conservation ==
They were declared a National Historic Landmark in 2005. The Willard Memorial Chapel and Welch Memorial Hall are listed on the National Register of Historic Places.


== See also ==
Dr. Sylvester Willard Mansion
Auburn Theological Seminary
National Register of Historic Places listings in Cayuga County, New York


== Gallery ==

		
	

In [9]:
# Split the article text into word-level tokens. Punctuation and the "=="
# heading markers are kept as tokens here and filtered out in later steps.
tokens = word_tokenize(page.content)

In [10]:
# Normalize case so that e.g. "The" and "the" are counted as the same token.
lower_tokens = list(map(str.lower, tokens))

In [11]:
# Initial bag of words: maps each lowercased token to its number of occurrences,
# before any cleanup has been applied.
bag = Counter()
bag.update(lower_tokens)

In [12]:
# Show the raw counts; punctuation, heading markers and stop words dominate
# the top of the list at this stage.
print(bag)

Counter({'==': 12, 'the': 10, '.': 9, 'and': 7, ',': 6, 'of': 6, 'willard': 5, 'chapel': 5, 'in': 5, 'memorial': 4, 'hall': 3, 'are': 3, 'auburn': 3, 'new': 3, 'york': 3, 'by': 3, 'they': 3, 'was': 3, 'national': 3, 'historic': 3, 'welch': 2, 'cayuga': 2, 'county': 2, 'were': 2, 'tiffany': 2, 'dr.': 2, 'sylvester': 2, 'a': 2, 'theological': 2, 'seminary': 2, 'register': 2, 'places': 2, 'adjoining': 1, 'located': 1, 'state': 1, 'architecture': 1, 'designed': 1, 'warner': 1, '&': 1, 'brockett': 1, 'rochester': 1, 'feature': 1, 'stained-glass': 1, 'windows': 1, 'interior': 1, 'decoration': 1, 'louis': 1, 'comfort': 1, 'last': 1, 'surviving': 1, 'complete': 1, 'installation': 1, 'its': 1, 'original': 1, 'location': 1, 'built': 1, 'between': 1, '1892': 1, '1894': 1, 'memory': 1, 'his': 1, 'wife': 1, 'jane': 1, 'frances': 1, 'case': 1, 'funding': 1, 'provided': 1, 'their': 1, 'daughters': 1, 'caroline': 1, 'georgiana': 1, 'part': 1, 'former': 1, 'conservation': 1, 'declared': 1, 'landmark': 

### Clean up and preprocessing to remove non-alphabetic characters, stop words, and to lemmatize.

In [13]:
# Keep only purely alphabetic tokens. This drops punctuation, numbers and
# heading markers, and also tokens such as 'dr.' or 'stained-glass' that
# contain a non-letter character.
alpha_only = list(filter(str.isalpha, lower_tokens))

In [14]:
# Removing English stop words.
english_stops = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', 'couldn', 'didn', 'doesn', 'hadn', 'hasn', 'haven', 'isn', 'ma', 'mightn', 'mustn', 'needn', 'shan', 'shouldn', 'wasn', 'weren', 'won', 'wouldn', '']
# Testing membership against the list scans it for every token. A set gives
# identical results with constant-time lookups; `english_stops` itself is left
# as a list so any other cell that uses it keeps working unchanged.
english_stop_set = set(english_stops)
no_stops = [t for t in alpha_only if t not in english_stop_set]

In [15]:
# Reduce each remaining token to its WordNet lemma. No part-of-speech tag is
# passed, so the lemmatizer's default part of speech is used for every token.
word_lemm = WordNetLemmatizer()
lemm = list(map(word_lemm.lemmatize, no_stops))

In [16]:
# Recount after cleanup and show the ten most frequent lemmas.
newbag = Counter()
newbag.update(lemm)
print(newbag.most_common(10))

[('willard', 5), ('chapel', 5), ('memorial', 4), ('hall', 3), ('auburn', 3), ('new', 3), ('york', 3), ('national', 3), ('historic', 3), ('welch', 2)]


You can see the most common words in this particular article and glean key points to aid in topic identification.

# Analyzing multiple articles

## Gensim

In [35]:
# Draw 10 random article titles. Note that this is a list of title strings,
# not page objects; the pages themselves are fetched in a later cell.
multiple_pages = wiki.random(10)

In [36]:
# Show which titles were drawn for this run.
print(multiple_pages)

['Sunflower (2006 film)', 'Adduci', 'Nokia 8310', 'The Brother Moves On', 'ESPNews', 'Im Kyu-tae', 'Konrad Gehringer', 'Snail mail', 'Kampa', 'Brouard']


In [37]:
# Download the plain-text content of every selected article.
# The iteration variable is called `title` rather than `page` so that the
# WikipediaPage object bound to `page` in the single-article section is not
# overwritten with a plain string, which would break re-running those cells.
# auto_suggest=False loads exactly the given title; the default may replace it
# with a search suggestion and fetch a different article.
articles = [
    wiki.page(title, auto_suggest=False).content
    for title in multiple_pages
]

In [38]:
def preprocess(text):
    """Convert raw article text into a list of cleaned, lemmatized tokens.

    Steps, in order: tokenize, lowercase, keep purely alphabetic tokens,
    drop English stop words, lemmatize.

    Parameters
    ----------
    text : str
        Raw article text.

    Returns
    -------
    list of str
        The lemmas of the tokens that survived filtering, in document order.
    """
    # Set lookup instead of scanning the stop-word list for every token.
    stop_words = set(english_stops)
    lowered = (token.lower() for token in word_tokenize(text))
    return [
        word_lemm.lemmatize(token)
        for token in lowered
        if token.isalpha() and token not in stop_words
    ]


# Working inside a function keeps the intermediate results local, so the
# `tokens`, `lower_tokens`, `alpha_only`, `no_stops` and `lemm` variables from
# the single-article analysis are no longer overwritten by this cell.
clean_articles = [preprocess(article) for article in articles]

In [39]:
# Build a gensim Dictionary over all cleaned articles; it maps every distinct
# token to an integer id (used below by doc2bow and dictionary.get).
dictionary = Dictionary(clean_articles)

In [40]:
# Gensim corpus: one bag-of-words per article, each a list of
# (token_id, count) pairs produced by the dictionary.
corpus = list(map(dictionary.doc2bow, clean_articles))

### Gensim Bag-of-Words

In [41]:
# Sum each token's count over all articles to get corpus-wide frequencies.
total_word_count = defaultdict(int)
for doc_bow in corpus:
    for token_id, freq in doc_bow:
        total_word_count[token_id] += freq

In [42]:
# Order the (token_id, total_count) pairs from most to least frequent.
sorted_word_count = list(total_word_count.items())
sorted_word_count.sort(key=lambda pair: pair[1], reverse=True)

In [43]:
# Print the five most frequent tokens, translating ids back to words.
for token_id, total in itertools.islice(sorted_word_count, 5):
    print(dictionary.get(token_id), total)

espn 60
espnews 58
sport 34
game 33
network 33


### TF-IDF

In [44]:
# Fit a TF-IDF model on the bag-of-words corpus; indexing it with a document's
# bag-of-words (next cell) yields (token_id, weight) pairs for that document.
tfidf = TfidfModel(corpus)

In [45]:
# As an example, use the 2nd article (index 1) and rank its terms by TF-IDF
# weight, highest first.
tfidf_weights = tfidf[corpus[1]]
sorted_tfidf_weights = list(tfidf_weights)
sorted_tfidf_weights.sort(key=lambda pair: pair[1], reverse=True)

In [46]:
# The five highest-weighted terms for the chosen article.
for token_id, score in itertools.islice(sorted_tfidf_weights, 5):
    print(dictionary.get(token_id), score)

adduci 0.5987257385872076
league 0.25906606272633453
tiger 0.25659674510880326
game 0.21920974538382149
baseball 0.1793534280413085


**You can see that both the bag-of-words and the TF-IDF methods returned similar/related results: sports-related words were reflected in both result sets.**