In [1]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from time import sleep
import pandas as pd
import re
import requests
import csv

In [2]:
def getTitle(soup):
    """Return the page title with the " - Wikipedia" site suffix removed.

    Args:
        soup: Parsed page. Its ``title`` attribute is read and, when present,
            ``get_text()`` is called on it.

    Returns:
        str: The title text with every occurrence of " - Wikipedia" removed,
        or an empty string when the page has no ``<title>`` element.
    """
    titleTag = soup.title
    if titleTag is None:
        # A page without a <title> would otherwise raise AttributeError on
        # .get_text() and abort the whole crawl.
        return ""

    # The suffix is a fixed literal, so plain string replacement is enough.
    return titleTag.get_text().replace(" - Wikipedia", "")

# Image for article
def parseImages(soup):
    """Return the sidebar image URL and the number of images on the page.

    Args:
        soup: Parsed page supporting ``find_all`` and ``find``.

    Returns:
        tuple: ``(tableImage, nImages)``. ``tableImage`` is the ``src`` of the
        first ``<img>`` inside the first ``<table class="vertical-navbox">``,
        or ``""`` when there is no such table, the table holds no image, or
        the image has no ``src``. ``nImages`` counts every ``<img>`` on the
        page.
    """
    nImages = len(soup.find_all("img"))

    # Explicit None checks instead of catching AttributeError, so unrelated
    # attribute errors are not silently turned into "no image".
    sideTable = soup.find("table", attrs={"class": "vertical-navbox"})
    sideImage = sideTable.find("img") if sideTable is not None else None
    tableImage = (sideImage.get("src") or "") if sideImage is not None else ""

    return tableImage, nImages

# Main body text
def parseText(soup):
    """Extract the article's paragraph text along with word and character counts.

    The text of every ``<p>`` element is taken in document order, numeric
    citation markers such as ``[12]`` are stripped, and the pieces are
    concatenated without any separator.

    Returns:
        tuple: ``(parsedText, nWords, nChar)`` -- the cleaned text, its count
        of whitespace-separated words, and its length in characters.
    """
    citePattern = re.compile(r"\[\d+\]")  # numeric citation markers

    pieces = [
        citePattern.sub("", str(p.get_text()))
        for p in soup.findAll("p")
    ]
    body = "".join(pieces)

    return body, len(body.split()), len(body)


def getLinks(soup):
    """Collect the article links that appear inside the page's paragraphs.

    A link is kept when its ``href`` starts with ``/wiki`` and contains no
    ``:``, ``disambiguation`` or ``Main_Page``.

    Args:
        soup: Parsed page supporting ``find_all``.

    Returns:
        set: The distinct matching ``href`` strings. Fragments (``#...``) are
        kept as part of the string.
    """
    validLinks = re.compile(r"(?=(^/wiki))(?!.*(:))(?!.*(disambiguation))(?!.*(Main_Page))")

    returnLinks = set()  # set of extensions to return
    for p in soup.find_all("p"):
        for a in p.find_all("a"):
            href = a.get("href")
            # Anchors without an href give None, which re.match would reject
            # with a TypeError; skip them.
            if href and validLinks.match(href):
                returnLinks.add(href)
    return returnLinks


def getArticles(anExtension, level=0, extensionsSoFar=None, parent="None"):
    """Crawl a Wikipedia article and, recursively, the articles it links to.

    This is a generator. It waits one second, downloads
    ``https://en.wikipedia.org`` + ``anExtension``, yields one record for that
    page, and then (for pages at level 0 or 1) recurses into every paragraph
    link that has not been visited yet.

    Args:
        anExtension: Path of the article, e.g. ``"/wiki/Philosophy"``.
        level: Depth of this page in the crawl; links are followed only while
            ``level <= 1``.
        extensionsSoFar: Set of paths already scheduled or visited. It is
            updated in place. ``None`` (the default) starts a fresh set.
        parent: Title of the page that linked here.

    Yields:
        dict: Keys ``title``, ``extension``, ``parent``, ``imgURL``, ``nChar``,
        ``nWords``, ``nImg``, ``nLinks``, ``level`` and ``text``, each mapped
        to a single-element list.
    """
    # A set() default would be created once and shared by every call, so a
    # second crawl in the same kernel would skip pages seen by the first.
    if extensionsSoFar is None:
        extensionsSoFar = set()
    # Mark this page as visited so links pointing back to it are not re-crawled.
    extensionsSoFar.add(anExtension)

    # Throttle: wait one second before every request.
    sleep(1)

    # Parse the HTML, make soup object
    headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.0; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0' }
    # The timeout keeps one stalled connection from hanging the crawl forever.
    r = requests.get("https://en.wikipedia.org" + anExtension, headers=headers, timeout=30)
    soup = BeautifulSoup(r.content, "lxml")

    # Run each parser once and reuse the results.
    title = getTitle(soup)
    links = getLinks(soup)
    imgURL, nImg = parseImages(soup)
    text, nWords, nChar = parseText(soup)

    yield {
        "title": [title],
        "extension": [anExtension],
        "parent": [parent],
        "imgURL": [imgURL],
        "nChar": [nChar],
        "nWords": [nWords],
        "nImg": [nImg],
        "nLinks": [len(links)],
        "level": [level],
        "text": [text]
    }

    # If at level 0, 1
    if level <= 1:
        for link in links:
            # Check at iteration time: deeper recursive calls may already have
            # visited a link that was still unvisited when this loop started.
            if link in extensionsSoFar:
                continue
            extensionsSoFar.add(link)
            yield from getArticles(link, level + 1, extensionsSoFar, title)

In [3]:
# Starting values
start = "/wiki/Philosophy"
parent = "none"

# Single source of truth for the column order: used for the accumulator, the
# CSV header and every row, so header and rows can never disagree.
COLUMNS = [
    "title",
    "extension",
    "parent",
    "imgURL",
    "nChar",
    "nWords",
    "nImg",
    "nLinks",
    "level",
    "text",
]

# Accumulator
data = pd.DataFrame({column: [] for column in COLUMNS})

# newline='' is what the csv module requires of the file object; utf-8 is set
# explicitly because article text contains non-ASCII characters.
with open(r'hope.csv', 'a', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    # The file is opened in append mode, so only write the header when the
    # file is still empty; otherwise each re-run adds a header row mid-file.
    if f.tell() == 0:
        writer.writerow(COLUMNS)
    try:
        for elem in getArticles(start):
            writer.writerow([elem[key][0] for key in COLUMNS])
    except KeyboardInterrupt:
        # Manual stop: keep the rows written so far.
        pass
    except Exception as e:
        # Best effort: report the failure and keep the rows written so far.
        print(e)

/wiki/Philosophy
/wiki/Iran
/wiki/Falcons
/wiki/Panthera
/wiki/List_of_Iranian_newspapers
/wiki/Shikand-gumanic_Vichar
/wiki/Mesopotamia
/wiki/Basij
/wiki/World_War_II
/wiki/Regions_of_Iran
/wiki/Iranian_rock
/wiki/Virtual_private_network
/wiki/Downpour_(film)
/wiki/Iranian_Plateau
/wiki/Chogha_Mish
/wiki/Aryan
/wiki/Universal_suffrage
/wiki/Submarine
/wiki/Sassanid_art
/wiki/Franklin_D._Roosevelt
/wiki/Sasanian_Empire
/wiki/Turkey
/wiki/Badab-e_Surt
/wiki/FIVB_World_Rankings
/wiki/Ancient_Iranian_peoples
/wiki/Mandane
/wiki/Iranian_Economic_Reform_Plan#Subsidy_reform_plan
/wiki/Lor_Girl
/wiki/Viguen
/wiki/Calligraphy
/wiki/List_of_countries_by_GDP_(PPP)
/wiki/2009_Iranian_presidential_election_protests
/wiki/Architecture
/wiki/Science_in_Iran#Biotechnology
/wiki/Nabonidus#The_Persian_conquest_of_Babylonia
/wiki/Christmas
/wiki/Sumerian_language#Writing_system
/wiki/Avestan_language
/wiki/Kamanche
/wiki/Tehran_University_of_Medical_Sciences
/wiki/Bears
/wiki/Islamic_Revolutionary_Court

In [None]:
# Instantiate the vectorizer and PCA objects
vectorizer = TfidfVectorizer(stop_words="english", lowercase=True)
pca1 = PCA(n_components=5)
pca2 = PCA(n_components=5)

# The crawl writes its rows to hope.csv; the in-memory `data` accumulator is
# never filled, so load the crawled articles from the file.
data = pd.read_csv("hope.csv")

# Vectorize the text (articles with no paragraph text are read back as NaN)
tfidf = vectorizer.fit_transform(data["text"].fillna("")).toarray()
tfidfDF = pd.DataFrame(tfidf)

# Join the dataframes; both have a fresh 0..n-1 index, so rows align
newData = pd.concat([data, tfidfDF], axis=1)

# Get columns to be used in PCA
textPreds = list(tfidfDF.columns)  # integer labels of the TF-IDF columns
preds = ["nChar", "nImg", "nLinks", "nWords"] + textPreds

# `.ix` was removed from pandas; `.loc` is the label-based replacement.
# Plain arrays are passed so the mixed str/int column labels are not
# interpreted as feature names.
prComp = pd.DataFrame(pca1.fit_transform(X = newData.loc[:, preds].values))
textComp = pd.DataFrame(pca2.fit_transform(X = newData.loc[:, textPreds].values))

In [5]:
# Read hope.csv (the file the crawl cell writes) and show it as the cell output.
pd.read_csv("hope.csv")

(0, 10)