In [22]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from time import sleep
import pandas as pd
import re
import requests

In [23]:
def getTitle(soup):
	titleJunk = re.compile(r" - Wikipedia")  # removes " - Wikipedia"

	title = soup.title.get_text()
	title = titleJunk.sub(repl="", string=title)
	return title

# Image for article
def parseImages(soup):
	parseImagesURL = re.compile(r"")
	images = soup.findAll("img")
	nImages = len(images)
	try:
		sideTable = soup.find("table", attrs = {"class": "vertical-navbox"})
		tableImage = sideTable.find("img").get("src")
	except AttributeError:
		print("No image for article:", getTitle(soup))
		tableImage = ""

	return tableImage, nImages

# Main body text
def parseText(soup):
	citations = re.compile(r"\[\d+\]")  # Removes citations

	paragraphs = soup.findAll("p")
	parsedText = ""
	for paragraph in paragraphs:
		text = str(paragraph.get_text())
		noCites = citations.sub(repl="", string=text)
		parsedText += noCites

	nWords = len(parsedText.split())
	nChar = len(parsedText)

	return parsedText, nWords, nChar


def getLinks(soup):
    validLinks = re.compile(r"(?=(^/wiki))(?!.*(:))(?!.*(disambiguation))(?!.*(Main_Page))")

    links = soup.findAll("a")
    returnLinks = set()  # set of extensions to return
    
    paragraphs = soup.findAll("p")
    for p in paragraphs:
        for a in p.findAll("a"):
            href = a.get("href") 
            if validLinks.match(href):
                returnLinks.add(href)
    """
    for link in links:
        href = str(link.get("href"))

        if validLinks.match(href):
            returnLinks.add(href)
    """
    return returnLinks


def getArticles(anExtension, level=0, extensionsSoFar=set(), parent="None"):

    # Parse the HTML, make soup object
    headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.0; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0' }
    
    
    #html = urlopen("https://en.wikipedia.org" + anExtension,).read()
    soup = BeautifulSoup(html, "lxml")
    links = getLinks(soup)

    yield {
        "title": [getTitle(soup)],
        "extension": [anExtension],
        "parent": [parent],
        "imgURL": [parseImages(soup)[0]],
        "nChar": [parseText(soup)[2]],
        "nWords": [parseText(soup)[1]],
        "nImg": [parseImages(soup)[1]],
        "nLinks": [len(links)],
        "level": [level],
        "text": [parseText(soup)[0]]
    }

    # If at level 0, 1
    if level < 1:
        for link in links - extensionsSoFar:
            extensionsSoFar.add(link)
            yield from getArticles(link, level + 1, extensionsSoFar, getTitle(soup))

In [24]:
# Starting values
start = "/wiki/Philosophy"
parent = "none"

# Accumulator
data = pd.DataFrame({
        "title": [],
        "extension": [],
        "parent": [],
        "imgURL": [],
        "nChar": [],
        "nWords": [],
        "nImg": [],
        "nLinks": [],
        "level": [],
        "text": []
        })

try:
    for elem in getArticles(start, parent="None"):
        elem = pd.DataFrame(elem)
        data = data.append(elem)
except KeyboardInterrupt:
    pass
except RemoteDisconnected:
    continue 

No image for article: Seinfeld
No image for article: Seinfeld
No image for article: Postcolonialism
No image for article: Postcolonialism
No image for article: Fertile Crescent
No image for article: Fertile Crescent
No image for article: The Matrix (franchise)
No image for article: The Matrix (franchise)
No image for article: Bantu Philosophy
No image for article: Bantu Philosophy
No image for article: Modus ponens
No image for article: Modus ponens
No image for article: Colin McGinn
No image for article: Colin McGinn
No image for article: Decision theory
No image for article: Decision theory
No image for article: Tibet
No image for article: Tibet
No image for article: Animal rights
No image for article: Animal rights
No image for article: Reason
No image for article: Reason
No image for article: Regress argument
No image for article: Regress argument
No image for article: Early Islamic philosophy
No image for article: Early Islamic philosophy
No image for article: Truth
No image for a

RemoteDisconnected: Remote end closed connection without response

In [25]:
# Instantiate the vectorizer and PCA objects
vectorizer = TfidfVectorizer(stop_words="english", lowercase=True)
pca1 = PCA(n_components=5)
pca2 = PCA(n_components=5)

# Vectorize the text
tfidf = vectorizer.fit_transform(data.text).toarray()
tfidfDF = pd.DataFrame(tfidf)

# Join the dataframes
data.reset_index(drop=True, inplace=True)
tfidfDF.reset_index(drop=True, inplace=True)
newData = pd.concat([data, tfidfDF], axis=1)

# Get columns to be used in PCA
preds = ["nChar", "nImg", "nLinks", "nWords"]
textPreds = [elem for elem in list(newData) if type(elem) is int]
preds += textPreds

prComp = pd.DataFrame(pca1.fit_transform(X = newData.ix[:, preds]))
textComp = pd.DataFrame(pca2.fit_transform(X = newData.ix[:, textPreds]))

(272, 49662)